Revert "Remove gmp from AOS"

This reverts commit f37c97684f0910a3f241394549392f00145ab0f7.

We need gmp to build SymEngine, which we use for symbolic manipulation in C++.
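
For context, SymEngine's C++ expression layer sits on top of an
arbitrary-precision integer backend, which is GMP in the default
configuration.  A minimal sketch of the intended use (a hypothetical
example, not code in this tree; it assumes SymEngine's Expression
convenience API):

    #include <iostream>
    #include <symengine/expression.h>
    #include <symengine/symbol.h>

    int main() {
      // Build the symbolic expression (x + 1)^3 and expand it.  The
      // integer coefficients are arbitrary precision and, in the
      // default SymEngine build, backed by GMP.
      SymEngine::Expression x(SymEngine::symbol("x"));
      SymEngine::Expression e = SymEngine::expand(SymEngine::pow(x + 1, 3));
      std::cout << e << '\n';  // e.g. "1 + 3*x + 3*x**2 + x**3"
      return 0;
    }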

Change-Id: Ia13216d1715cf96944f7b4f422b7a799f921d4a4
Signed-off-by: Austin Schuh <austin.linux@gmail.com>
diff --git a/third_party/gmp/mpn/Makeasm.am b/third_party/gmp/mpn/Makeasm.am
new file mode 100644
index 0000000..5d7306c
--- /dev/null
+++ b/third_party/gmp/mpn/Makeasm.am
@@ -0,0 +1,118 @@
+## Automake asm file rules.
+
+# Copyright 1996, 1998-2002 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# COMPILE minus CC.
+#
+COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS)
+
+# Flags used for preprocessing (in ansi2knr rules).
+#
+PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS)
+
+
+# Recent versions of automake (1.5 and up for instance) append automake
+# generated suffixes to this $(SUFFIXES) list.  This is essential for us,
+# since .c must come after .s, .S and .asm.  If .c is before .s, for
+# instance, then in the mpn directory "make" will see add_n.c mentioned in
+# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c,
+# even if add_n.c doesn't exist but add_n.s does.  See GNU make
+# documentation "(make)Implicit Rule Search", part 5c.
+#
+# On IRIX 6 native make this doesn't work properly though.  Somehow .c
+# remains ahead of .s, perhaps because .c.s is a builtin rule.  .asm works
+# fine though, and mpn/mips3 uses this.
+#
+SUFFIXES = .s .S .asm
+
+
+# .s assembler, no preprocessing.
+#
+.s.o:
+	$(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+.s.obj:
+	$(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi`
+.s.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+
+# can be overridden during development, eg. "make RM_TMP=: mul_1.lo"
+RM_TMP = rm -f
+
+
+# .S assembler, preprocessed with cpp.
+#
+# It's necessary to run $(CPP) separately, since it seems not all compilers
+# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and
+# will silently do nothing if given a .S).
+#
+# For .lo we need a helper script, as described below for .asm.lo.
+#
+.S.o:
+	$(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.S.obj:
+	$(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.S.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+
+# .asm assembler, preprocessed with m4.
+#
+# .o and .obj are non-PIC and just need m4 followed by a compile.
+#
+# .lo is a bit tricky.  Libtool (as of version 1.5) has foo.lo as a little
+# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects,
+# respectively.  It'd be asking for lots of trouble to try to create foo.lo
+# ourselves, so instead arrange to invoke libtool like a --mode=compile, but
+# with a special m4-ccas script which first m4 preprocesses, then compiles.
+# --tag=CC is necessary since foo.asm is otherwise unknown to libtool.
+#
+# Libtool adds -DPIC when building a shared object and the .asm files look
+# for that.  But it should be noted that the other PIC flags are on occasion
+# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before
+# it accepts PIC constructs like @GOT, and gcc adds that flag only under
+# -fPIC.  (Later versions of gas are happy to accept PIC stuff any time.)
+#
+.asm.o:
+	$(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.asm.obj:
+	$(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.asm.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
diff --git a/third_party/gmp/mpn/Makefile.am b/third_party/gmp/mpn/Makefile.am
new file mode 100644
index 0000000..3ab4006
--- /dev/null
+++ b/third_party/gmp/mpn/Makefile.am
@@ -0,0 +1,59 @@
+## Process this file with automake to generate Makefile.in
+
+# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+AM_CPPFLAGS = -D__GMP_WITHIN_GMP -I$(top_srcdir) \
+  -DOPERATION_`echo $* | sed 's/_$$//'`
+
+OFILES = @mpn_objects@
+
+noinst_LTLIBRARIES = libmpn.la
+nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+
+TARG_DIST = alpha arm arm64 cray generic ia64 lisp m68k m88k \
+  minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \
+  riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64
+
+EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST)
+
+
+# These are BUILT_SOURCES at the top-level, so normally they're built before
+# recursing into this directory.
+#
+fib_table.c:
+	cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/fib_table.c
+mp_bases.c:
+	cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/mp_bases.c
+perfsqr.h:
+	cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h
+
+include Makeasm.am
diff --git a/third_party/gmp/mpn/Makefile.in b/third_party/gmp/mpn/Makefile.in
new file mode 100644
index 0000000..6593969
--- /dev/null
+++ b/third_party/gmp/mpn/Makefile.in
@@ -0,0 +1,772 @@
+# Makefile.in generated by automake 1.15 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
+
+# This Makefile.in is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
+# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+# PARTICULAR PURPOSE.
+
+@SET_MAKE@
+
+# Copyright 1996, 1998-2002, 2005, 2011, 2013 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+# Copyright 1996, 1998-2002 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+VPATH = @srcdir@
+am__is_gnu_make = { \
+  if test -z '$(MAKELEVEL)'; then \
+    false; \
+  elif test -n '$(MAKE_HOST)'; then \
+    true; \
+  elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
+    true; \
+  else \
+    false; \
+  fi; \
+}
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
+    case $$MAKEFLAGS in \
+      *\\[\ \	]*) \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
+pkgdatadir = $(datadir)/@PACKAGE@
+pkgincludedir = $(includedir)/@PACKAGE@
+pkglibdir = $(libdir)/@PACKAGE@
+pkglibexecdir = $(libexecdir)/@PACKAGE@
+am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
+install_sh_DATA = $(install_sh) -c -m 644
+install_sh_PROGRAM = $(install_sh) -c
+install_sh_SCRIPT = $(install_sh) -c
+INSTALL_HEADER = $(INSTALL_DATA)
+transform = $(program_transform_name)
+NORMAL_INSTALL = :
+PRE_INSTALL = :
+POST_INSTALL = :
+NORMAL_UNINSTALL = :
+PRE_UNINSTALL = :
+POST_UNINSTALL = :
+build_triplet = @build@
+host_triplet = @host@
+subdir = mpn
+ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
+am__aclocal_m4_deps = $(top_srcdir)/acinclude.m4 \
+	$(top_srcdir)/configure.ac
+am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
+	$(ACLOCAL_M4)
+DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
+mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/config.h
+CONFIG_CLEAN_FILES =
+CONFIG_CLEAN_VPATH_FILES =
+LTLIBRARIES = $(noinst_LTLIBRARIES)
+am__DEPENDENCIES_1 =
+nodist_libmpn_la_OBJECTS = fib_table.lo mp_bases.lo
+libmpn_la_OBJECTS = $(nodist_libmpn_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
+depcomp =
+am__depfiles_maybe =
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
+CCLD = $(CC)
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
+SOURCES = $(nodist_libmpn_la_SOURCES)
+DIST_SOURCES =
+am__can_run_installinfo = \
+  case $$AM_UPDATE_INFO_DIR in \
+    n|no|NO) false;; \
+    *) (install-info --version) >/dev/null 2>&1;; \
+  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/Makeasm.am $(srcdir)/Makefile.in README
+DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
+ABI = @ABI@
+ACLOCAL = @ACLOCAL@
+AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
+AR = @AR@
+AS = @AS@
+ASMFLAGS = @ASMFLAGS@
+AUTOCONF = @AUTOCONF@
+AUTOHEADER = @AUTOHEADER@
+AUTOMAKE = @AUTOMAKE@
+AWK = @AWK@
+CALLING_CONVENTIONS_OBJS = @CALLING_CONVENTIONS_OBJS@
+CC = @CC@
+CCAS = @CCAS@
+CC_FOR_BUILD = @CC_FOR_BUILD@
+CFLAGS = @CFLAGS@
+CPP = @CPP@
+CPPFLAGS = @CPPFLAGS@
+CPP_FOR_BUILD = @CPP_FOR_BUILD@
+CXX = @CXX@
+CXXCPP = @CXXCPP@
+CXXFLAGS = @CXXFLAGS@
+CYGPATH_W = @CYGPATH_W@
+DEFN_LONG_LONG_LIMB = @DEFN_LONG_LONG_LIMB@
+DEFS = @DEFS@
+DLLTOOL = @DLLTOOL@
+DSYMUTIL = @DSYMUTIL@
+DUMPBIN = @DUMPBIN@
+ECHO_C = @ECHO_C@
+ECHO_N = @ECHO_N@
+ECHO_T = @ECHO_T@
+EGREP = @EGREP@
+EXEEXT = @EXEEXT@
+EXEEXT_FOR_BUILD = @EXEEXT_FOR_BUILD@
+FGREP = @FGREP@
+GMP_LDFLAGS = @GMP_LDFLAGS@
+GMP_LIMB_BITS = @GMP_LIMB_BITS@
+GMP_NAIL_BITS = @GMP_NAIL_BITS@
+GREP = @GREP@
+HAVE_CLOCK_01 = @HAVE_CLOCK_01@
+HAVE_CPUTIME_01 = @HAVE_CPUTIME_01@
+HAVE_GETRUSAGE_01 = @HAVE_GETRUSAGE_01@
+HAVE_GETTIMEOFDAY_01 = @HAVE_GETTIMEOFDAY_01@
+HAVE_HOST_CPU_FAMILY_power = @HAVE_HOST_CPU_FAMILY_power@
+HAVE_HOST_CPU_FAMILY_powerpc = @HAVE_HOST_CPU_FAMILY_powerpc@
+HAVE_SIGACTION_01 = @HAVE_SIGACTION_01@
+HAVE_SIGALTSTACK_01 = @HAVE_SIGALTSTACK_01@
+HAVE_SIGSTACK_01 = @HAVE_SIGSTACK_01@
+HAVE_STACK_T_01 = @HAVE_STACK_T_01@
+HAVE_SYS_RESOURCE_H_01 = @HAVE_SYS_RESOURCE_H_01@
+INSTALL = @INSTALL@
+INSTALL_DATA = @INSTALL_DATA@
+INSTALL_PROGRAM = @INSTALL_PROGRAM@
+INSTALL_SCRIPT = @INSTALL_SCRIPT@
+INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
+LD = @LD@
+LDFLAGS = @LDFLAGS@
+LEX = @LEX@
+LEXLIB = @LEXLIB@
+LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@
+LIBCURSES = @LIBCURSES@
+LIBGMPXX_LDFLAGS = @LIBGMPXX_LDFLAGS@
+LIBGMP_DLL = @LIBGMP_DLL@
+LIBGMP_LDFLAGS = @LIBGMP_LDFLAGS@
+LIBM = @LIBM@
+LIBM_FOR_BUILD = @LIBM_FOR_BUILD@
+LIBOBJS = @LIBOBJS@
+LIBREADLINE = @LIBREADLINE@
+LIBS = @LIBS@
+LIBTOOL = @LIBTOOL@
+LIPO = @LIPO@
+LN_S = @LN_S@
+LTLIBOBJS = @LTLIBOBJS@
+LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
+M4 = @M4@
+MAINT = @MAINT@
+MAKEINFO = @MAKEINFO@
+MANIFEST_TOOL = @MANIFEST_TOOL@
+MKDIR_P = @MKDIR_P@
+NM = @NM@
+NMEDIT = @NMEDIT@
+OBJDUMP = @OBJDUMP@
+OBJEXT = @OBJEXT@
+OTOOL = @OTOOL@
+OTOOL64 = @OTOOL64@
+PACKAGE = @PACKAGE@
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
+PACKAGE_NAME = @PACKAGE_NAME@
+PACKAGE_STRING = @PACKAGE_STRING@
+PACKAGE_TARNAME = @PACKAGE_TARNAME@
+PACKAGE_URL = @PACKAGE_URL@
+PACKAGE_VERSION = @PACKAGE_VERSION@
+PATH_SEPARATOR = @PATH_SEPARATOR@
+RANLIB = @RANLIB@
+SED = @SED@
+SET_MAKE = @SET_MAKE@
+SHELL = @SHELL@
+SPEED_CYCLECOUNTER_OBJ = @SPEED_CYCLECOUNTER_OBJ@
+STRIP = @STRIP@
+TAL_OBJECT = @TAL_OBJECT@
+TUNE_LIBS = @TUNE_LIBS@
+TUNE_SQR_OBJ = @TUNE_SQR_OBJ@
+U_FOR_BUILD = @U_FOR_BUILD@
+VERSION = @VERSION@
+WITH_READLINE_01 = @WITH_READLINE_01@
+YACC = @YACC@
+YFLAGS = @YFLAGS@
+abs_builddir = @abs_builddir@
+abs_srcdir = @abs_srcdir@
+abs_top_builddir = @abs_top_builddir@
+abs_top_srcdir = @abs_top_srcdir@
+ac_ct_AR = @ac_ct_AR@
+ac_ct_CC = @ac_ct_CC@
+ac_ct_CXX = @ac_ct_CXX@
+ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
+am__leading_dot = @am__leading_dot@
+am__tar = @am__tar@
+am__untar = @am__untar@
+bindir = @bindir@
+build = @build@
+build_alias = @build_alias@
+build_cpu = @build_cpu@
+build_os = @build_os@
+build_vendor = @build_vendor@
+builddir = @builddir@
+datadir = @datadir@
+datarootdir = @datarootdir@
+docdir = @docdir@
+dvidir = @dvidir@
+exec_prefix = @exec_prefix@
+gmp_srclinks = @gmp_srclinks@
+host = @host@
+host_alias = @host_alias@
+host_cpu = @host_cpu@
+host_os = @host_os@
+host_vendor = @host_vendor@
+htmldir = @htmldir@
+includedir = @includedir@
+infodir = @infodir@
+install_sh = @install_sh@
+libdir = @libdir@
+libexecdir = @libexecdir@
+localedir = @localedir@
+localstatedir = @localstatedir@
+mandir = @mandir@
+mkdir_p = @mkdir_p@
+mpn_objects = @mpn_objects@
+mpn_objs_in_libgmp = @mpn_objs_in_libgmp@
+oldincludedir = @oldincludedir@
+pdfdir = @pdfdir@
+prefix = @prefix@
+program_transform_name = @program_transform_name@
+psdir = @psdir@
+sbindir = @sbindir@
+sharedstatedir = @sharedstatedir@
+srcdir = @srcdir@
+sysconfdir = @sysconfdir@
+target_alias = @target_alias@
+top_build_prefix = @top_build_prefix@
+top_builddir = @top_builddir@
+top_srcdir = @top_srcdir@
+AM_CPPFLAGS = -D__GMP_WITHIN_GMP -I$(top_srcdir) \
+  -DOPERATION_`echo $* | sed 's/_$$//'`
+
+OFILES = @mpn_objects@
+noinst_LTLIBRARIES = libmpn.la
+nodist_libmpn_la_SOURCES = fib_table.c mp_bases.c
+libmpn_la_LIBADD = $(OFILES)
+libmpn_la_DEPENDENCIES = $(OFILES)
+TARG_DIST = alpha arm arm64 cray generic ia64 lisp m68k m88k \
+  minithres mips32 mips64 pa32 pa64 power powerpc32 powerpc64 \
+  riscv s390_32 s390_64 sh sparc32 sparc64 thumb vax x86 x86_64
+
+EXTRA_DIST = asm-defs.m4 cpp-ccas m4-ccas $(TARG_DIST)
+
+# COMPILE minus CC.
+#
+COMPILE_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $(ASMFLAGS)
+
+
+# Flags used for preprocessing (in ansi2knr rules).
+#
+PREPROCESS_FLAGS = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS)
+
+
+# Recent versions of automake (1.5 and up for instance) append automake
+# generated suffixes to this $(SUFFIXES) list.  This is essential for us,
+# since .c must come after .s, .S and .asm.  If .c is before .s, for
+# instance, then in the mpn directory "make" will see add_n.c mentioned in
+# an explicit rule (the ansi2knr stuff) and decide it must have add_n.c,
+# even if add_n.c doesn't exist but add_n.s does.  See GNU make
+# documentation "(make)Implicit Rule Search", part 5c.
+#
+# On IRIX 6 native make this doesn't work properly though.  Somehow .c
+# remains ahead of .s, perhaps because .c.s is a builtin rule.  .asm works
+# fine though, and mpn/mips3 uses this.
+#
+SUFFIXES = .s .S .asm
+
+# can be overridden during development, eg. "make RM_TMP=: mul_1.lo"
+RM_TMP = rm -f
+all: all-am
+
+.SUFFIXES:
+.SUFFIXES: .s .S .asm .c .lo .o .obj
+$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(srcdir)/Makeasm.am $(am__configure_deps)
+	@for dep in $?; do \
+	  case '$(am__configure_deps)' in \
+	    *$$dep*) \
+	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
+	        && { if test -f $@; then exit 0; else break; fi; }; \
+	      exit 1;; \
+	  esac; \
+	done; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile'; \
+	$(am__cd) $(top_srcdir) && \
+	  $(AUTOMAKE) --gnu --ignore-deps mpn/Makefile
+Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
+	@case '$?' in \
+	  *config.status*) \
+	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
+	  *) \
+	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
+	esac;
+$(srcdir)/Makeasm.am $(am__empty):
+
+$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+
+$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
+$(am__aclocal_m4_deps):
+
+clean-noinstLTLIBRARIES:
+	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
+	@list='$(noinst_LTLIBRARIES)'; \
+	locs=`for p in $$list; do echo $$p; done | \
+	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
+	      sort -u`; \
+	test -z "$$locs" || { \
+	  echo rm -f $${locs}; \
+	  rm -f $${locs}; \
+	}
+
+libmpn.la: $(libmpn_la_OBJECTS) $(libmpn_la_DEPENDENCIES) $(EXTRA_libmpn_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(LINK)  $(libmpn_la_OBJECTS) $(libmpn_la_LIBADD) $(LIBS)
+
+mostlyclean-compile:
+	-rm -f *.$(OBJEXT)
+
+distclean-compile:
+	-rm -f *.tab.c
+
+.c.o:
+	$(AM_V_CC)$(COMPILE) -c -o $@ $<
+
+.c.obj:
+	$(AM_V_CC)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+
+.c.lo:
+	$(AM_V_CC)$(LTCOMPILE) -c -o $@ $<
+
+mostlyclean-libtool:
+	-rm -f *.lo
+
+clean-libtool:
+	-rm -rf .libs _libs
+
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	set x; \
+	here=`pwd`; \
+	$(am__define_uniq_tagged_files); \
+	shift; \
+	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+	  test -n "$$unique" || unique=$$empty_fix; \
+	  if test $$# -gt 0; then \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      "$$@" $$unique; \
+	  else \
+	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+	      $$unique; \
+	  fi; \
+	fi
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
+	test -z "$(CTAGS_ARGS)$$unique" \
+	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+	     $$unique
+
+GTAGS:
+	here=`$(am__cd) $(top_builddir) && pwd` \
+	  && $(am__cd) $(top_srcdir) \
+	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
+	case "$(srcdir)" in \
+	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+	  *) sdir=$(subdir)/$(srcdir) ;; \
+	esac; \
+	for i in $$list; do \
+	  if test -f "$$i"; then \
+	    echo "$(subdir)/$$i"; \
+	  else \
+	    echo "$$sdir/$$i"; \
+	  fi; \
+	done >> $(top_builddir)/cscope.files
+
+distclean-tags:
+	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+
+distdir: $(DISTFILES)
+	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
+	list='$(DISTFILES)'; \
+	  dist_files=`for file in $$list; do echo $$file; done | \
+	  sed -e "s|^$$srcdirstrip/||;t" \
+	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
+	case $$dist_files in \
+	  */*) $(MKDIR_P) `echo "$$dist_files" | \
+			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
+			   sort -u` ;; \
+	esac; \
+	for file in $$dist_files; do \
+	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
+	  if test -d $$d/$$file; then \
+	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
+	    if test -d "$(distdir)/$$file"; then \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
+	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
+	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
+	    fi; \
+	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
+	  else \
+	    test -f "$(distdir)/$$file" \
+	    || cp -p $$d/$$file "$(distdir)/$$file" \
+	    || exit 1; \
+	  fi; \
+	done
+check-am: all-am
+check: check-am
+all-am: Makefile $(LTLIBRARIES)
+installdirs:
+install: install-am
+install-exec: install-exec-am
+install-data: install-data-am
+uninstall: uninstall-am
+
+install-am: all-am
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+installcheck: installcheck-am
+install-strip:
+	if test -z '$(STRIP)'; then \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	      install; \
+	else \
+	  $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
+	    install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
+	    "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
+	fi
+mostlyclean-generic:
+
+clean-generic:
+
+distclean-generic:
+	-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
+	-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
+
+maintainer-clean-generic:
+	@echo "This command is intended for maintainers to use"
+	@echo "it deletes files that may require special tools to rebuild."
+clean: clean-am
+
+clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
+	mostlyclean-am
+
+distclean: distclean-am
+	-rm -f Makefile
+distclean-am: clean-am distclean-compile distclean-generic \
+	distclean-tags
+
+dvi: dvi-am
+
+dvi-am:
+
+html: html-am
+
+html-am:
+
+info: info-am
+
+info-am:
+
+install-data-am:
+
+install-dvi: install-dvi-am
+
+install-dvi-am:
+
+install-exec-am:
+
+install-html: install-html-am
+
+install-html-am:
+
+install-info: install-info-am
+
+install-info-am:
+
+install-man:
+
+install-pdf: install-pdf-am
+
+install-pdf-am:
+
+install-ps: install-ps-am
+
+install-ps-am:
+
+installcheck-am:
+
+maintainer-clean: maintainer-clean-am
+	-rm -f Makefile
+maintainer-clean-am: distclean-am maintainer-clean-generic
+
+mostlyclean: mostlyclean-am
+
+mostlyclean-am: mostlyclean-compile mostlyclean-generic \
+	mostlyclean-libtool
+
+pdf: pdf-am
+
+pdf-am:
+
+ps: ps-am
+
+ps-am:
+
+uninstall-am:
+
+.MAKE: install-am install-strip
+
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \
+	ctags-am distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am
+
+.PRECIOUS: Makefile
+
+
+# These are BUILT_SOURCES at the top-level, so normally they're built before
+# recursing into this directory.
+#
+fib_table.c:
+	cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/fib_table.c
+mp_bases.c:
+	cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/mp_bases.c
+perfsqr.h:
+	cd ..; $(MAKE) $(AM_MAKEFLAGS) mpn/perfsqr.h
+
+# .s assembler, no preprocessing.
+#
+.s.o:
+	$(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+.s.obj:
+	$(CCAS) $(COMPILE_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi`
+.s.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+# .S assembler, preprocessed with cpp.
+#
+# It's necessary to run $(CPP) separately, since it seems not all compilers
+# recognise .S files, in particular "cc" on HP-UX 10 and 11 doesn't (and
+# will silently do nothing if given a .S).
+#
+# For .lo we need a helper script, as described below for .asm.lo.
+#
+.S.o:
+	$(CPP) $(PREPROCESS_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$< | grep -v '^#' >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.S.obj:
+	$(CPP) $(PREPROCESS_FLAGS) `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` | grep -v '^#' >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.S.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/cpp-ccas --cpp="$(CPP) $(PREPROCESS_FLAGS)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+# .asm assembler, preprocessed with m4.
+#
+# .o and .obj are non-PIC and just need m4 followed by a compile.
+#
+# .lo is a bit tricky.  Libtool (as of version 1.5) has foo.lo as a little
+# text file, and .libs/foo.o and foo.o as the PIC and non-PIC objects,
+# respectively.  It'd be asking for lots of trouble to try to create foo.lo
+# ourselves, so instead arrange to invoke libtool like a --mode=compile, but
+# with a special m4-ccas script which first m4 preprocesses, then compiles.
+# --tag=CC is necessary since foo.asm is otherwise unknown to libtool.
+#
+# Libtool adds -DPIC when building a shared object and the .asm files look
+# for that.  But it should be noted that the other PIC flags are on occasion
+# important too, in particular FreeBSD 2.2.8 gas 1.92.3 requires -k before
+# it accepts PIC constructs like @GOT, and gcc adds that flag only under
+# -fPIC.  (Later versions of gas are happy to accept PIC stuff any time.)
+#
+.asm.o:
+	$(M4) -DOPERATION_$* `test -f '$<' || echo '$(srcdir)/'`$< >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.asm.obj:
+	$(M4) -DOPERATION_$* `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` >tmp-$*.s
+	$(CCAS) $(COMPILE_FLAGS) tmp-$*.s -o $@
+	$(RM_TMP) tmp-$*.s
+.asm.lo:
+	$(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/mpn/m4-ccas --m4="$(M4)" $(CCAS) $(COMPILE_FLAGS) `test -f '$<' || echo '$(srcdir)/'`$<
+
+# Tell versions [3.59,3.63) of GNU make to not export all variables.
+# Otherwise a system limit (for SysV at least) may be exceeded.
+.NOEXPORT:
diff --git a/third_party/gmp/mpn/README b/third_party/gmp/mpn/README
new file mode 100644
index 0000000..bc046be
--- /dev/null
+++ b/third_party/gmp/mpn/README
@@ -0,0 +1,44 @@
+Copyright 1996, 1999 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+This directory contains all code for the mpn layer of GMP.
+
+Most subdirectories contain machine-dependent code, written in assembly or C.
+The `generic' subdirectory contains default code, used when there is no
+machine-dependent replacement for a particular machine.
+
+There is one subdirectory for each ISA family.  Note that e.g., 32-bit SPARC
+and 64-bit SPARC are very different ISAs, and thus cannot share any code.
+
+A particular compile will only use code from one subdirectory, and the
+`generic' subdirectory.  The ISA-specific subdirectories contain hierarchies of
+directories for various architecture variants and implementations; the
+top-most level contains code that runs correctly on all variants.
diff --git a/third_party/gmp/mpn/alpha/README b/third_party/gmp/mpn/alpha/README
new file mode 100644
index 0000000..09c2f04
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/README
@@ -0,0 +1,208 @@
+Copyright 1996, 1997, 1999-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions optimized for DEC Alpha processors.
+
+ALPHA ASSEMBLY RULES AND REGULATIONS
+
+The `.prologue N' pseudo op marks the end of the instructions that need special
+handling by unwinding.  It also says whether $27 is really needed for computing
+the gp.  The `.mask M' pseudo op says which registers are saved on the stack,
+and at what offset in the frame.
+
+Cray T3 code is very very different...
+
+"$6" / "$f6" etc is the usual syntax for registers, but on Unicos instead "r6"
+/ "f6" is required.  We use the "r6" / "f6" forms, and have m4 defines expand
+them to "$6" or "$f6" where necessary.
+
+"0x" introduces a hex constant in gas and DEC as, but on Unicos "^X" is
+required.  The X() macro accommodates this difference.
+
+"cvttqc" is required by DEC as, "cvttq/c" is required by Unicos, and gas will
+accept either.  We use cvttqc and have an m4 define expand to cvttq/c where
+necessary.
+
+"not" as an alias for "ornot r31, ..." is available in gas and DEC as, but not
+the Unicos assembler.  The full "ornot" must be used.
+
+"unop" is not available in Unicos.  We make an m4 define to the usual "ldq_u
+r31,0(r30)", and in fact use that define on all systems since it comes out the
+same.
+
+"!literal!123" etc explicit relocations as per Tru64 4.0 are apparently not
+available in older alpha assemblers (including gas prior to 2.12), according to
+the GCC manual, so the assembler macro forms must be used (eg. ldgp).
+
+
+
+RELEVANT OPTIMIZATION ISSUES
+
+EV4
+
+1. This chip has very limited store bandwidth.  The on-chip L1 cache is write-
+   through, and a cache line is transferred from the store buffer to the off-
+   chip L2 in as much as 15 cycles on most systems.  This delay hurts mpn_add_n,
+   mpn_sub_n, mpn_lshift, and mpn_rshift.
+
+2. Pairing is possible between memory instructions and integer arithmetic
+   instructions.
+
+3. mulq and umulh are documented to have a latency of 23 cycles, but 2 of these
+   cycles are pipelined.  Thus, multiply instructions can be issued at a rate
+   of one each 21st cycle.
+
+EV5
+
+1. The memory bandwidth of this chip is good, both for loads and stores.  The
+   L1 cache can handle two loads or one store per cycle, but two cycles after a
+   store, no ld can issue.
+
+2. mulq has a latency of 12 cycles and an issue rate of 1 each 8th cycle.
+   umulh has a latency of 14 cycles and an issue rate of 1 each 10th cycle.
+   (Note that published documentation gets these numbers slightly wrong.)
+
+3. mpn_add_n.  With 4-fold unrolling, we need 37 instructions, whereof 12
+   are memory operations.  This will take at least
+	ceil(37/2) [dual issue] + 1 [taken branch] = 19 cycles
+   We have 12 memory cycles, plus 4 after-store conflict cycles, or 16 data
+   cache cycles, which should be completely hidden in the 19 issue cycles.
+   The computation is inherently serial, with these dependencies:
+
+	       ldq  ldq
+		 \  /\
+	  (or)   addq |
+	   |\   /   \ |
+	   | addq  cmpult
+	    \  |     |
+	     cmpult  |
+		 \  /
+		  or
+
+   I.e., 3 operations are needed between carry-in and carry-out, making 12
+   cycles the absolute minimum for the 4 limbs.  We could replace the `or' with
+   a cmoveq/cmovne, which could issue one cycle earlier than the `or', but that
+   might waste a cycle on EV4.  The total depth remains unaffected, since cmov
+   has a latency of 2 cycles.
+
+     addq
+     /   \
+   addq  cmpult
+     |      \
+   cmpult -> cmovne
+
+  Montgomery has a slightly different way of computing carry that requires one
+  less instruction, but has depth 4 (instead of the current 3).  Since the code
+  is currently instruction issue bound, Montgomery's idea should save us 1/2
+  cycle per limb, or bring us down to a total of 17 cycles or 4.25 cycles/limb.
+  Unfortunately, this method will not be good for the EV6.
+
+4. addmul_1 and friends: We previously had a scheme that split the single-
+   limb operand in 21-bit chunks and the multi-limb operand in 32-bit chunks,
+   then used FP operations for every second multiply and integer operations
+   for the others.
+
+   But it seems much better to split the single-limb operand in 16-bit chunks,
+   since we save many integer shifts and adds that way.  See powerpc64/README
+   for some more details.
+
+EV6
+
+Here we have a really parallel pipeline, capable of issuing up to 4 integer
+instructions per cycle.  In actual practice, it is never possible to sustain
+more than 3.5 integer insns/cycle due to rename register shortage.  One integer
+multiply instruction can issue each cycle.  To get optimal speed, we need to
+pretend we are vectorizing the code, i.e., minimize the depth of recurrences.
+
+There are two dependencies to watch out for.  1) Address arithmetic
+dependencies, and 2) carry propagation dependencies.
+
+We can avoid serializing due to address arithmetic by unrolling loops, so that
+addresses don't depend heavily on an index variable.  Avoiding serializing
+because of carry propagation is trickier; the ultimate performance of the code
+will be determined by the number of latency cycles it takes from accepting
+carry-in to a vector point until we can generate carry-out.
+
+Most integer instructions can execute in either the L0, U0, L1, or U1
+pipelines.  Shifts only execute in U0 and U1, and multiply only in U1.
+
+CMOV instructions split into two internal instructions, CMOV1 and CMOV2.  CMOV
+splits the mapping process (see pg 2-26 in cmpwrgd.pdf), suggesting that a CMOV
+should always be placed as the last instruction of an aligned 4-instruction
+block, or perhaps simply avoided.
+
+Perhaps the most important issue is the latency between the L0/U0 and L1/U1
+clusters; a result obtained on either cluster has an extra cycle of latency for
+consumers in the opposite cluster.  Because of the dynamic nature of the
+implementation, it is hard to predict where an instruction will execute.
+
+
+
+REFERENCES
+
+"Alpha Architecture Handbook", version 4, Compaq, October 1998, order number
+EC-QD2KC-TE.
+
+"Alpha 21164 Microprocessor Hardware Reference Manual", Compaq, December 1998,
+order number EC-QP99C-TE.
+
+"Alpha 21264/EV67 Microprocessor Hardware Reference Manual", revision 1.4,
+Compaq, September 2000, order number DS-0028B-TE.
+
+"Compiler Writer's Guide for the Alpha 21264", Compaq, June 1999, order number
+EC-RJ66A-TE.
+
+All of the above are available online from
+
+  http://ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
+  ftp://ftp.compaq.com/pub/products/alphaCPUdocs
+
+"Tru64 Unix Assembly Language Programmer's Guide", Compaq, March 1996, part
+number AA-PS31D-TE.
+
+"Digital UNIX Calling Standard for Alpha Systems", Digital Equipment Corp,
+March 1996, part number AA-PY8AC-TE.
+
+The above are available online,
+
+  http://h30097.www3.hp.com/docs/pub_page/V40F_DOCS.HTM
+
+(Dunno what h30097 means in this URL, but if it moves try searching for "tru64
+online documentation" from the main www.hp.com page.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 79
+End:
diff --git a/third_party/gmp/mpn/alpha/add_n.asm b/third_party/gmp/mpn/alpha/add_n.asm
new file mode 100644
index 0000000..bc572a5
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/add_n.asm
@@ -0,0 +1,164 @@
+dnl  Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl  store sum in a third limb vector.
+
+dnl  Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     4.75
+C EV6:     3
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r16
+dnl  s1_ptr	r17
+dnl  s2_ptr	r18
+dnl  size	r19
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+	bis	r20,r31,r25
+	br	L(com)
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	bis	r31,r31,r25		C clear cy
+L(com):	subq	r19,4,r19		C decr loop cnt
+	blt	r19,$Lend2		C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+	ldq	r0,0(r18)
+	ldq	r4,0(r17)
+	ldq	r1,8(r18)
+	ldq	r5,8(r17)
+	addq	r17,32,r17		C update s1_ptr
+	addq	r0,r4,r28		C 1st main add
+	ldq	r2,16(r18)
+	addq	r25,r28,r20		C 1st carry add
+	ldq	r3,24(r18)
+	cmpult	r28,r4,r8		C compute cy from last add
+	ldq	r6,-16(r17)
+	cmpult	r20,r28,r25		C compute cy from last add
+	ldq	r7,-8(r17)
+	bis	r8,r25,r25		C combine cy from the two adds
+	subq	r19,4,r19		C decr loop cnt
+	addq	r1,r5,r28		C 2nd main add
+	addq	r18,32,r18		C update s2_ptr
+	addq	r28,r25,r21		C 2nd carry add
+	cmpult	r28,r5,r8		C compute cy from last add
+	blt	r19,$Lend1		C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+	ALIGN(16)
+$Loop:	cmpult	r21,r28,r25		C compute cy from last add
+	ldq	r0,0(r18)
+	bis	r8,r25,r25		C combine cy from the two adds
+	ldq	r1,8(r18)
+	addq	r2,r6,r28		C 3rd main add
+	ldq	r4,0(r17)
+	addq	r28,r25,r22		C 3rd carry add
+	ldq	r5,8(r17)
+	cmpult	r28,r6,r8		C compute cy from last add
+	cmpult	r22,r28,r25		C compute cy from last add
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two adds
+	stq	r21,8(r16)
+	addq	r3,r7,r28		C 4th main add
+	addq	r28,r25,r23		C 4th carry add
+	cmpult	r28,r7,r8		C compute cy from last add
+	cmpult	r23,r28,r25		C compute cy from last add
+		addq	r17,32,r17		C update s1_ptr
+	bis	r8,r25,r25		C combine cy from the two adds
+		addq	r16,32,r16		C update res_ptr
+	addq	r0,r4,r28		C 1st main add
+	ldq	r2,16(r18)
+	addq	r25,r28,r20		C 1st carry add
+	ldq	r3,24(r18)
+	cmpult	r28,r4,r8		C compute cy from last add
+	ldq	r6,-16(r17)
+	cmpult	r20,r28,r25		C compute cy from last add
+	ldq	r7,-8(r17)
+	bis	r8,r25,r25		C combine cy from the two adds
+	subq	r19,4,r19		C decr loop cnt
+	stq	r22,-16(r16)
+	addq	r1,r5,r28		C 2nd main add
+	stq	r23,-8(r16)
+	addq	r25,r28,r21		C 2nd carry add
+		addq	r18,32,r18		C update s2_ptr
+	cmpult	r28,r5,r8		C compute cy from last add
+	bge	r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1:	cmpult	r21,r28,r25		C compute cy from last add
+	bis	r8,r25,r25		C combine cy from the two adds
+	addq	r2,r6,r28		C 3rd main add
+	addq	r28,r25,r22		C 3rd carry add
+	cmpult	r28,r6,r8		C compute cy from last add
+	cmpult	r22,r28,r25		C compute cy from last add
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two adds
+	stq	r21,8(r16)
+	addq	r3,r7,r28		C 4th main add
+	addq	r28,r25,r23		C 4th carry add
+	cmpult	r28,r7,r8		C compute cy from last add
+	cmpult	r23,r28,r25		C compute cy from last add
+	bis	r8,r25,r25		C combine cy from the two adds
+	addq	r16,32,r16		C update res_ptr
+	stq	r22,-16(r16)
+	stq	r23,-8(r16)
+$Lend2:	addq	r19,4,r19		C restore loop cnt
+	beq	r19,$Lret
+C Start software pipeline for 2nd loop
+	ldq	r0,0(r18)
+	ldq	r4,0(r17)
+	subq	r19,1,r19
+	beq	r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+	ALIGN(16)
+$Loop0:	addq	r0,r4,r28		C main add
+	ldq	r0,8(r18)
+	cmpult	r28,r4,r8		C compute cy from last add
+	ldq	r4,8(r17)
+	addq	r28,r25,r20		C carry add
+	addq	r18,8,r18
+	addq	r17,8,r17
+	stq	r20,0(r16)
+	cmpult	r20,r28,r25		C compute cy from last add
+	subq	r19,1,r19		C decr loop cnt
+	bis	r8,r25,r25		C combine cy from the two adds
+	addq	r16,8,r16
+	bne	r19,$Loop0
+$Lend0:	addq	r0,r4,r28		C main add
+	addq	r28,r25,r20		C carry add
+	cmpult	r28,r4,r8		C compute cy from last add
+	cmpult	r20,r28,r25		C compute cy from last add
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two adds
+
+$Lret:	bis	r25,r31,r0		C return cy
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/addmul_1.asm b/third_party/gmp/mpn/alpha/addmul_1.asm
new file mode 100644
index 0000000..c4e6834
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/addmul_1.asm
@@ -0,0 +1,99 @@
+dnl Alpha mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl result to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     42
+C EV5:     18
+C EV6:      7
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  n	r18
+C  vl	r19
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	ldq	r2,0(r17)	C r2 = s1_limb
+	addq	r17,8,r17	C s1_ptr++
+	subq	r18,1,r18	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	ldq	r5,0(r16)	C r5 = *res_ptr
+	umulh	r2,r19,r0	C r0 = prod_high
+	beq	r18,$Lend1	C jump if size was == 1
+	ldq	r2,0(r17)	C r2 = s1_limb
+	addq	r17,8,r17	C s1_ptr++
+	subq	r18,1,r18	C size--
+	addq	r5,r3,r3
+	cmpult	r3,r5,r4
+	stq	r3,0(r16)
+	addq	r16,8,r16	C res_ptr++
+	beq	r18,$Lend2	C jump if size was == 2
+
+	ALIGN(8)
+$Loop:	mulq	r2,r19,r3	C r3 = prod_low
+	ldq	r5,0(r16)	C r5 = *res_ptr
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	subq	r18,1,r18	C size--
+	umulh	r2,r19,r4	C r4 = cy_limb
+	ldq	r2,0(r17)	C r2 = s1_limb
+	addq	r17,8,r17	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	addq	r5,r3,r3
+	cmpult	r3,r5,r5
+	stq	r3,0(r16)
+	addq	r16,8,r16	C res_ptr++
+	addq	r5,r0,r0	C combine carries
+	bne	r18,$Loop
+
+$Lend2:	mulq	r2,r19,r3	C r3 = prod_low
+	ldq	r5,0(r16)	C r5 = *res_ptr
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r4	C r4 = cy_limb
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	addq	r5,r3,r3
+	cmpult	r3,r5,r5
+	stq	r3,0(r16)
+	addq	r5,r0,r0	C combine carries
+	addq	r4,r0,r0	C cy_limb = prod_high + cy
+	ret	r31,(r26),1
+$Lend1:	addq	r5,r3,r3
+	cmpult	r3,r5,r5
+	stq	r3,0(r16)
+	addq	r0,r5,r0
+	ret	r31,(r26),1
+EPILOGUE(mpn_addmul_1)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/alpha-defs.m4 b/third_party/gmp/mpn/alpha/alpha-defs.m4
new file mode 100644
index 0000000..af34c92
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/alpha-defs.m4
@@ -0,0 +1,107 @@
+divert(-1)
+
+dnl  m4 macros for Alpha assembler.
+
+dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Usage: ASSERT([reg] [,code])
+dnl
+dnl  Require that the given reg is non-zero after executing the test code.
+dnl  For example,
+dnl
+dnl         ASSERT(r8,
+dnl         `       cmpult r16, r17, r8')
+dnl
+dnl  If the register argument is empty then nothing is tested, the code is
+dnl  just executed.  This can be used for setups required by later ASSERTs.
+dnl  If the code argument is omitted then the register is just tested, with
+dnl  no special setup code.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$2',,,`$2')
+ifelse(`$1',,,
+`	bne	$1, L(ASSERTok`'ASSERT_label_counter)
+	.long	0	C halt
+L(ASSERTok`'ASSERT_label_counter):
+define(`ASSERT_label_counter',eval(ASSERT_label_counter+1))
+')
+')')
+define(`ASSERT_label_counter',1)
+
+
+dnl  Usage: bigend(`code')
+dnl
+dnl  Emit the given code only for a big-endian system, like Unicos.  This
+dnl  can be used for instance for extra stuff needed by extwl.
+
+define(bigend,
+m4_assert_numargs(1)
+`ifdef(`HAVE_LIMB_BIG_ENDIAN',`$1',
+`ifdef(`HAVE_LIMB_LITTLE_ENDIAN',`',
+`m4_error(`Cannot assemble, unknown limb endianness')')')')
+
+
+dnl  Usage: bwx_available_p
+dnl
+dnl  Evaluate to 1 if the BWX byte memory instructions are available, or to
+dnl  0 if not.
+dnl
+dnl  Listing the chips which do have BWX means anything we haven't looked at
+dnl  will use safe non-BWX code.  The only targets without BWX currently are
+dnl  plain alpha (ie. ev4) and alphaev5.
+
+define(bwx_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+	`HAVE_HOST_CPU_alphaev56',
+	`HAVE_HOST_CPU_alphapca56',
+	`HAVE_HOST_CPU_alphapca57',
+	`HAVE_HOST_CPU_alphaev6',
+	`HAVE_HOST_CPU_alphaev67',
+	`HAVE_HOST_CPU_alphaev68',
+	`HAVE_HOST_CPU_alphaev69',
+	`HAVE_HOST_CPU_alphaev7',
+	`HAVE_HOST_CPU_alphaev79')')
+
+
+dnl  Usage: unop
+dnl
+dnl  The Cray Unicos assembler lacks unop, so give the equivalent ldq_u
+dnl  explicitly.
+
+define(unop,
+m4_assert_numargs(-1)
+`ldq_u	r31, 0(r30)')
+
+
+divert
diff --git a/third_party/gmp/mpn/alpha/aorslsh1_n.asm b/third_party/gmp/mpn/alpha/aorslsh1_n.asm
new file mode 100644
index 0000000..9525e66
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/aorslsh1_n.asm
@@ -0,0 +1,164 @@
+dnl  Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl  Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     6.25
+C EV6:     4.5
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADDSUB,       addq)
+  define(CARRY,       `cmpult $1,$2,$3')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADDSUB,       subq)
+  define(CARRY,       `cmpult $2,$1,$3')
+  define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	and	n, 2, cy0
+	blbs	n, L(bx1)
+L(bx0):	ldq	v1, 0(vp)
+	ldq	u1, 0(up)
+	nop
+	bne	cy0, L(b10)
+
+L(b00):	lda	vp, 48(vp)
+	lda	up, -16(up)
+	lda	rp, -8(rp)
+	br	r31, L(lo0)
+
+L(b10):	lda	vp, 32(vp)
+	lda	rp, 8(rp)
+	lda	cy0, 0(r31)
+	br	r31, L(lo2)
+
+L(bx1):	ldq	v0, 0(vp)
+	ldq	u0, 0(up)
+	lda	cy1, 0(r31)
+	beq	cy0, L(b01)
+
+L(b11):	lda	vp, 40(vp)
+	lda	up, -24(up)
+	lda	rp, 16(rp)
+	br	r31, L(lo3)
+
+L(b01):	lda	n, -4(n)
+	ble	n, L(end)
+	lda	vp, 24(vp)
+	lda	up, -8(up)
+
+	ALIGN(16)
+L(top):	addq	v0, v0, sl	C left shift vlimb
+	ldq	v1, -16(vp)
+	ADDSUB	u0, sl, ps	C ulimb + (vlimb << 1)
+	cmplt	v0, r31, cy0	C carry out #1
+	ldq	u1, 16(up)
+	ADDSUB	ps, cy1, rr	C consume carry from previous operation
+	CARRY(	ps, u0, cy)	C carry out #2
+	stq	rr, 0(rp)
+	addq	cy, cy0, cy0	C combine carry out #1 and #2
+	CARRY(	rr, ps, cy)	C carry out #3
+	addq	cy, cy0, cy0	C final carry out
+	lda	vp, 32(vp)	C bookkeeping
+L(lo0):	addq	v1, v1, sl
+	ldq	v0, -40(vp)
+	ADDSUB	u1, sl, ps
+	cmplt	v1, r31, cy1
+	ldq	u0, 24(up)
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy)
+	stq	rr, 8(rp)
+	addq	cy, cy1, cy1
+	CARRY(	rr, ps, cy)
+	addq	cy, cy1, cy1
+	lda	rp, 32(rp)	C bookkeeping
+L(lo3):	addq	v0, v0, sl
+	ldq	v1, -32(vp)
+	ADDSUB	u0, sl, ps
+	cmplt	v0, r31, cy0
+	ldq	u1, 32(up)
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy)
+	stq	rr, -16(rp)
+	addq	cy, cy0, cy0
+	CARRY(	rr, ps, cy)
+	addq	cy, cy0, cy0
+	lda	up, 32(up)	C bookkeeping
+L(lo2):	addq	v1, v1, sl
+	ldq	v0, -24(vp)
+	ADDSUB	u1, sl, ps
+	cmplt	v1, r31, cy1
+	ldq	u0, 8(up)
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy)
+	stq	rr, -8(rp)
+	addq	cy, cy1, cy1
+	CARRY(	rr, ps, cy)
+	addq	cy, cy1, cy1
+	lda	n, -4(n)	C bookkeeping
+	bgt	n, L(top)
+
+L(end):	addq	v0, v0, sl
+	ADDSUB	u0, sl, ps
+	ADDSUB	ps, cy1, rr
+	cmplt	v0, r31, cy0
+	CARRY(	ps, u0, cy)
+	stq	rr, 0(rp)
+	addq	cy, cy0, cy0
+	CARRY(	rr, ps, cy)
+	addq	cy, cy0, r0
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/aorslsh2_n.asm b/third_party/gmp/mpn/alpha/aorslsh2_n.asm
new file mode 100644
index 0000000..bdee1d6
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/aorslsh2_n.asm
@@ -0,0 +1,167 @@
+dnl  Alpha mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl  Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     6
+C EV6:     3.75
+
+C TODO
+C  * Tune to reach 3.5 c/l on ev6 and 5.75 c/l on ev5.
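+
+C  Like aorslsh1_n, except the two bits shifted out of each vlimb travel
+C  into the next step through the s4addq.  Rough C model of one limb step,
+C  add case (our sketch):
+C
+C	sl = (v << 2) + vhi;		/* vhi = previous vlimb >> 62 */
+C	ps = u + sl;
+C	rr = ps + cy_in;
+C	cy_out = (ps < u) + (rr < ps);	/* final vhi is added at the end */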
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh2_n',`
+  define(ADDSUB,       addq)
+  define(CARRY,       `cmpult $1,$2,$3')
+  define(func, mpn_addlsh2_n)
+')
+ifdef(`OPERATION_sublsh2_n',`
+  define(ADDSUB,       subq)
+  define(CARRY,       `cmpult $2,$1,$3')
+  define(func, mpn_sublsh2_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
+
+ASM_START()
+PROLOGUE(func)
+	and	n, 2, cy0
+	blbs	n, L(bx1)
+L(bx0):	ldq	v1, 0(vp)
+	ldq	u1, 0(up)
+	bis	r31, r31, r2
+	bne	cy0, L(b10)
+
+L(b00):	lda	vp, 48(vp)
+	lda	up, -16(up)
+	lda	rp, -8(rp)
+	s4addq	v1, r31, sl
+	br	r31, L(lo0)
+
+L(b10):	lda	vp, 32(vp)
+	lda	rp, 8(rp)
+	lda	cy0, 0(r31)
+	br	r31, L(lo2)
+
+L(bx1):	ldq	v0, 0(vp)
+	ldq	u0, 0(up)
+	lda	cy1, 0(r31)
+	bis	r31, r31, r3
+	nop
+	beq	cy0, L(b01)
+
+L(b11):	lda	vp, 40(vp)
+	lda	up, -24(up)
+	lda	rp, 16(rp)
+	br	r31, L(lo3)
+
+L(b01):	lda	n, -4(n)
+	ble	n, L(end)
+	lda	vp, 24(vp)
+	lda	up, -8(up)
+
+	ALIGN(16)
+L(top):	s4addq	v0, r3, sl	C combined vlimb
+	ldq	v1, -16(vp)
+	ADDSUB	u0, sl, ps	C ulimb + (vlimb << 2)
+	ldq	u1, 16(up)
+	srl	v0, 62, r2	C high v bits
+	ADDSUB	ps, cy1, rr	C consume carry from previous operation
+	CARRY(	ps, u0, cy0)	C carry out #2
+	stq	rr, 0(rp)
+	CARRY(	rr, ps, cy)	C carry out #3
+	lda	vp, 32(vp)	C bookkeeping
+	addq	cy, cy0, cy0	C final carry out
+	s4addq	v1, r2, sl
+L(lo0):	ldq	v0, -40(vp)
+	ADDSUB	u1, sl, ps
+	ldq	u0, 24(up)
+	srl	v1, 62, r3
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy1)
+	stq	rr, 8(rp)
+	CARRY(	rr, ps, cy)
+	lda	rp, 32(rp)	C bookkeeping
+	addq	cy, cy1, cy1
+L(lo3):	s4addq	v0, r3, sl
+	ldq	v1, -32(vp)
+	ADDSUB	u0, sl, ps
+	ldq	u1, 32(up)
+	srl	v0, 62, r2
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy0)
+	stq	rr, -16(rp)
+	CARRY(	rr, ps, cy)
+	lda	up, 32(up)	C bookkeeping
+	addq	cy, cy0, cy0
+L(lo2):	s4addq	v1, r2, sl
+	ldq	v0, -24(vp)
+	ADDSUB	u1, sl, ps
+	ldq	u0, 8(up)
+	srl	v1, 62, r3
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy1)
+	stq	rr, -8(rp)
+	CARRY(	rr, ps, cy)
+	lda	n, -4(n)	C bookkeeping
+	addq	cy, cy1, cy1
+	bgt	n, L(top)
+
+L(end):	s4addq	v0, r3, sl
+	ADDSUB	u0, sl, ps
+	srl	v0, 62, r2
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy0)
+	stq	rr, 0(rp)
+	CARRY(	rr, ps, cy)
+	addq	cy, cy0, cy0
+	addq	cy0, r2, r0
+
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/bdiv_dbm1c.asm b/third_party/gmp/mpn/alpha/bdiv_dbm1c.asm
new file mode 100644
index 0000000..472966c
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/bdiv_dbm1c.asm
@@ -0,0 +1,282 @@
+dnl  Alpha mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     42
+C EV5:     18
+C EV6:      3
+
+C TODO
+C  * Try less unrolling; 2-way should give the same performance.
+C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
+C    code size.
+C  * This runs optimally given the algorithm; r8 is on a 3-operation
+C    recurrence path.  We have not tried very hard to find a better
+C    algorithm.  Perhaps it would be a good task for the GNU superoptimizer.
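+
+C  Functionally (our sketch, following mpn/generic/bdiv_dbm1c.c), with h
+C  being the caller's carry argument (r20, kept in r8), each limb step is:
+C
+C	umul_ppmm (p1, p0, up[i], bd);
+C	cy = (h < p0);
+C	h = h - p0;
+C	rp[i] = h;
+C	h = h - p1 - cy;
+C
+C  and the final h is the return value.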
+
+C INPUT PARAMETERS
+define(`rp', `r16')
+define(`up', `r17')
+define(`n',  `r18')
+define(`bd', `r19')
+define(`cy', `r19')
+
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+	mov	r20, r8
+
+	ldq	r24, 0(r17)
+	and	r18, 3, r28
+	lda	r18, -4(r18)
+	beq	r28, L(b0)
+	cmpeq	r28, 1, r21
+	bne	r21, L(b1)
+	cmpeq	r28, 2, r21
+	bne	r21, L(b2)
+
+
+L(b3):	ldq	r2, 8(r17)
+	ldq	r3, 16(r17)
+	bgt	r18, L(gt3)
+
+	mulq	r24, r19, r5	C U1
+	umulh	r24, r19, r21	C U1
+	mulq	r2, r19, r6	C U1
+	umulh	r2, r19, r22	C U1
+	mulq	r3, r19, r7	C U1
+	umulh	r3, r19, r23	C U1
+	lda	r16, -32(r16)
+	br	L(cj3)
+
+L(gt3):	ldq	r0, 24(r17)
+	mulq	r24, r19, r5	C U1
+	umulh	r24, r19, r21	C U1
+	ldq	r1, 32(r17)
+	mulq	r2, r19, r6	C U1
+	umulh	r2, r19, r22	C U1
+	ldq	r2, 40(r17)
+	mulq	r3, r19, r7	C U1
+	umulh	r3, r19, r23	C U1
+	ldq	r3, 48(r17)
+	lda	r18, -4(r18)
+	lda	r17, 56(r17)
+	mulq	r0, r19, r4	C U1
+	bgt	r18, L(L3)
+
+	br	L(cj7)
+
+
+L(b2):	ldq	r3, 8(r17)
+	bgt	r18, L(gt2)
+
+	mulq	r24, r19, r6	C U1
+	umulh	r24, r19, r22	C U1
+	mulq	r3, r19, r7	C U1
+	umulh	r3, r19, r23	C U1
+	lda	r16, -40(r16)
+	br	L(cj2)
+
+L(gt2):	ldq	r0, 16(r17)
+	ldq	r1, 24(r17)
+	mulq	r24, r19, r6	C U1
+	umulh	r24, r19, r22	C U1
+	ldq	r2, 32(r17)
+	mulq	r3, r19, r7	C U1
+	umulh	r3, r19, r23	C U1
+	ldq	r3, 40(r17)
+	lda	r18, -4(r18)
+	lda	r17, 48(r17)
+	mulq	r0, r19, r4	C U1
+	umulh	r0, r19, r20	C U1
+	lda	r16, -8(r16)
+	bgt	r18, L(gt6)
+
+	mulq	r1, r19, r5	C U1
+	br	L(cj6)
+
+L(gt6):	ldq	r0, 0(r17)
+	mulq	r1, r19, r5	C U1
+	br	L(L2)
+
+
+L(b1):	bgt	r18, L(gt1)
+
+	mulq	r24, r19, r7	C U1
+	umulh	r24, r19, r23	C U1
+	lda	r16, -48(r16)
+	br	L(cj1)
+
+L(gt1):	ldq	r0, 8(r17)
+	ldq	r1, 16(r17)
+	ldq	r2, 24(r17)
+	mulq	r24, r19, r7	C U1
+	umulh	r24, r19, r23	C U1
+	ldq	r3, 32(r17)
+	lda	r18, -4(r18)
+	lda	r17, 40(r17)
+	mulq	r0, r19, r4	C U1
+	umulh	r0, r19, r20	C U1
+	lda	r16, -16(r16)
+	bgt	r18, L(gt5)
+
+	mulq	r1, r19, r5	C U1
+	umulh	r1, r19, r21	C U1
+	mulq	r2, r19, r6	C U1
+	br	L(cj5)
+
+L(gt5):	ldq	r0, 0(r17)
+	mulq	r1, r19, r5	C U1
+	umulh	r1, r19, r21	C U1
+	ldq	r1, 8(r17)
+	mulq	r2, r19, r6	C U1
+	br	L(L1)
+
+
+L(b0):	ldq	r1, 8(r17)
+	ldq	r2, 16(r17)
+	ldq	r3, 24(r17)
+	lda	r17, 32(r17)
+	lda	r16, -24(r16)
+	mulq	r24, r19, r4	C U1
+	umulh	r24, r19, r20	C U1
+	bgt	r18, L(gt4)
+
+	mulq	r1, r19, r5	C U1
+	umulh	r1, r19, r21	C U1
+	mulq	r2, r19, r6	C U1
+	umulh	r2, r19, r22	C U1
+	mulq	r3, r19, r7	C U1
+	br	L(cj4)
+
+L(gt4):	ldq	r0, 0(r17)
+	mulq	r1, r19, r5	C U1
+	umulh	r1, r19, r21	C U1
+	ldq	r1, 8(r17)
+	mulq	r2, r19, r6	C U1
+	umulh	r2, r19, r22	C U1
+	ldq	r2, 16(r17)
+	mulq	r3, r19, r7	C U1
+	br	L(L0)
+
+C *** MAIN LOOP START ***
+	ALIGN(16)
+L(top):	mulq	r0, r19, r4	C U1
+	subq	r8, r28, r8
+L(L3):	umulh	r0, r19, r20	C U1
+	cmpult	r8, r5, r28
+	ldq	r0, 0(r17)
+	subq	r8, r5, r8
+	addq	r21, r28, r28
+	stq	r8, 0(r16)
+
+	mulq	r1, r19, r5	C U1
+	subq	r8, r28, r8
+L(L2):	umulh	r1, r19, r21	C U1
+	cmpult	r8, r6, r28
+	ldq	r1, 8(r17)
+	subq	r8, r6, r8
+	addq	r22, r28, r28
+	stq	r8, 8(r16)
+
+	mulq	r2, r19, r6	C U1
+	subq	r8, r28, r8
+L(L1):	umulh	r2, r19, r22	C U1
+	cmpult	r8, r7, r28
+	ldq	r2, 16(r17)
+	subq	r8, r7, r8
+	addq	r23, r28, r28
+	stq	r8, 16(r16)
+
+	mulq	r3, r19, r7	C U1
+	subq	r8, r28, r8
+L(L0):	umulh	r3, r19, r23	C U1
+	cmpult	r8, r4, r28
+	ldq	r3, 24(r17)
+	subq	r8, r4, r8
+	addq	r20, r28, r28
+	stq	r8, 24(r16)
+
+	lda	r18, -4(r18)
+	lda	r17, 32(r17)
+	lda	r16, 32(r16)
+	bgt	r18, L(top)
+C *** MAIN LOOP END ***
+
+	mulq	r0, r19, r4	C U1
+	subq	r8, r28, r8
+L(cj7):	umulh	r0, r19, r20	C U1
+	cmpult	r8, r5, r28
+	subq	r8, r5, r8
+	addq	r21, r28, r28
+	stq	r8, 0(r16)
+	mulq	r1, r19, r5	C U1
+	subq	r8, r28, r8
+L(cj6):	umulh	r1, r19, r21	C U1
+	cmpult	r8, r6, r28
+	subq	r8, r6, r8
+	addq	r22, r28, r28
+	stq	r8, 8(r16)
+	mulq	r2, r19, r6	C U1
+	subq	r8, r28, r8
+L(cj5):	umulh	r2, r19, r22	C U1
+	cmpult	r8, r7, r28
+	subq	r8, r7, r8
+	addq	r23, r28, r28
+	stq	r8, 16(r16)
+	mulq	r3, r19, r7	C U1
+	subq	r8, r28, r8
+L(cj4):	umulh	r3, r19, r23	C U1
+	cmpult	r8, r4, r28
+	subq	r8, r4, r8
+	addq	r20, r28, r28
+	stq	r8, 24(r16)
+	subq	r8, r28, r8
+L(cj3):	cmpult	r8, r5, r28
+	subq	r8, r5, r8
+	addq	r21, r28, r28
+	stq	r8, 32(r16)
+	subq	r8, r28, r8
+L(cj2):	cmpult	r8, r6, r28
+	subq	r8, r6, r8
+	addq	r22, r28, r28
+	stq	r8, 40(r16)
+	subq	r8, r28, r8
+L(cj1):	cmpult	r8, r7, r28
+	subq	r8, r7, r8
+	addq	r23, r28, r28
+	stq	r8, 48(r16)
+	subq	r8, r28, r0
+	ret	r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/cntlz.asm b/third_party/gmp/mpn/alpha/cntlz.asm
new file mode 100644
index 0000000..25af19b
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/cntlz.asm
@@ -0,0 +1,55 @@
+dnl  Alpha auxiliary for longlong.h's count_leading_zeros
+
+dnl  Copyright 1997, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
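+C  The scheme, as we read it: cmpbge against zero yields a mask of the
+C  operand's zero bytes; a first __clz_tab lookup on the complemented,
+C  shifted mask locates the most significant nonzero byte, a second lookup
+C  counts the bits within that byte, and the result is formed as 64 minus
+C  the two contributions.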
+
+ASM_START()
+EXTERN(__clz_tab)
+PROLOGUE(mpn_count_leading_zeros,gp)
+	cmpbge	r31,  r16, r1
+	LEA(r3,__clz_tab)
+	sra	r1,   1,   r1
+	xor	r1,   127, r1
+	srl	r16,  1,   r16
+	addq	r1,   r3,  r1
+	ldq_u	r0,   0(r1)
+	lda	r2,   64
+	extbl	r0,   r1,   r0
+	s8subl	r0,   8,    r0
+	srl	r16,  r0,   r16
+	addq	r16,  r3,   r16
+	ldq_u	r1,   0(r16)
+	extbl	r1,   r16,  r1
+	subq	r2,   r1,   r2
+	subq	r2,   r0,   r0
+	ret	r31,  (r26),1
+EPILOGUE(mpn_count_leading_zeros)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/com.asm b/third_party/gmp/mpn/alpha/com.asm
new file mode 100644
index 0000000..f084ab5
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/com.asm
@@ -0,0 +1,176 @@
+dnl  Alpha mpn_com -- mpn one's complement.
+
+dnl  Copyright 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:    4.75
+C EV5:    2.0
+C EV6:    1.5
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C of 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs that means neither of the two cycles at the end
+C of the loop).
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result.  Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs, being two limbs loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C setup in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers.  For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.
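+C
+C Functionally (our sketch) the whole routine is just:
+C
+C	for (i = 0; i < size; i++)
+C	  dst[i] = ~src[i];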
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+	ALIGN(16)
+
+PROLOGUE(mpn_com,gp)
+
+	C r16	dst
+	C r17	src
+	C r18	size
+
+	lda	r30, -16(r30)		C temporary stack space
+	lda	r7, -3(r18)		C size - 3
+
+	ldq	r20, 0(r17)		C src[0]
+	srl	r7, 1, r6		C (size-3)/2
+
+	stq	r6, 8(r30)		C (size-3)/2
+	and	r7, 1, r5		C 1 if size even
+
+	LEA(	r8, L(dat))
+	s8addq	r5, r17, r17		C skip src[0] if even
+
+	ornot	r31, r20, r20		C ~src[0]
+	unop
+
+	ldt	f0, 8(r30)		C (size-3)/2
+	ldq	r24, 0(r17)		C src[0 or 1]
+
+	stq	r20, 0(r16)		C dst[0]
+	s8addq	r5, r16, r19		C skip dst[0] if even
+
+	ldt	f1, 0(r8)		C data 2.0
+	lda	r30, 16(r30)		C restore stack
+	unop
+	cvtqt	f0, f0			C (size-3)/2 as float
+
+	ornot	r31, r24, r24
+	blt	r7, L(done_1)		C if size<=2
+	unop
+	unop
+
+
+	C 16-byte alignment here
+L(top):
+	C r17	src, incrementing
+	C r19	dst, incrementing
+	C r24	dst[i] result, ready to store
+	C f0	(size-3)/2, decrementing
+	C f1	2.0
+
+	ldq	r20, 8(r17)		C src[i+1]
+	ldq	r21, 16(r17)		C src[i+2]
+	unop
+	unop
+
+	fbeq	f0, L(done_2)
+	unop
+	ldq	r22, 24(r17)		C src[i+3]
+	ldq	r23, 32(r17)		C src[i+4]
+
+	stq	r24, 0(r19)		C dst[i]
+	ornot	r31, r20, r20
+	subt	f0, f1, f0		C count -= 2
+	unop
+
+	stq	r20, 8(r19)		C dst[i+1]
+	ornot	r31, r21, r21
+	unop
+	unop
+
+	stq	r21, 16(r19)		C dst[i+2]
+	ornot	r31, r22, r22
+
+	stq	r22, 24(r19)		C dst[i+3]
+	ornot	r31, r23, r24
+
+	lda	r17, 32(r17)		C src += 4
+	lda	r19, 32(r19)		C dst += 4
+	unop
+	fbge	f0, L(top)
+
+
+L(done_1):
+	C r19	&dst[size-1]
+	C r24	result for dst[size-1]
+
+	stq	r24, 0(r19)		C dst[size-1]
+	ret	r31, (r26), 1
+
+
+L(done_2):
+	C r19	&dst[size-3]
+	C r20	src[size-2]
+	C r21	src[size-1]
+	C r24	result for dst[size-3]
+
+	stq	r24, 0(r19)		C dst[size-3]
+	ornot	r31, r20, r20
+
+	stq	r20, 8(r19)		C dst[size-2]
+	ornot	r31, r21, r21
+
+	stq	r21, 16(r19)		C dst[size-1]
+	ret	r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/copyd.asm b/third_party/gmp/mpn/alpha/copyd.asm
new file mode 100644
index 0000000..b41b536
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/copyd.asm
@@ -0,0 +1,88 @@
+dnl  Alpha mpn_copyd -- copy, decrementing.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     4
+C EV5:     1.75
+C EV6:     1
+
+C INPUT PARAMETERS
+C rp	r16
+C up	r17
+C n	r18
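+
+C  Equivalent C (our sketch); copying from the highest limb down makes this
+C  safe for overlapping operands with rp > up:
+C
+C	for (i = n - 1; i >= 0; i--)
+C	  rp[i] = up[i];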
+
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	s8addq	r18,r16,r16		C E0
+	s8addq	r18,r17,r17		C E1
+	lda	r18,-8(r18)		C E0
+	blt	r18,$Lend		C E1
+$Loop:	ldq	r0,-8(r17)		C E0
+	ldq	r1,-16(r17)		C E1
+	ldq	r2,-24(r17)		C E0
+	ldq	r3,-32(r17)		C E1
+	ldq	r4,-40(r17)		C E0
+	ldq	r5,-48(r17)		C E1
+	ldq	r6,-56(r17)		C E0
+	ldq	r7,-64(r17)		C E1
+	stq	r0,-8(r16)		C E0
+	lda	r17,-64(r17)		C E1
+	stq	r1,-16(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r2,-24(r16)		C E0
+	lda	r18,-8(r18)		C E1
+	stq	r3,-32(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r4,-40(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r5,-48(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r6,-56(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r7,-64(r16)		C E0
+	lda	r16,-64(r16)		C E1
+	bge	r18,$Loop		C E1
+$Lend:	lda	r18,7(r18)		C E0
+	blt	r18,$Lret		C E1
+	ldq	r0,-8(r17)		C E0
+	beq	r18,$Lend0		C E1
+$Loop0:	stq	r0,-8(r16)		C E0
+	lda	r16,-8(r16)		C E1
+	ldq	r0,-16(r17)		C E0
+	lda	r18,-1(r18)		C E1
+	lda	r17,-8(r17)		C E0
+	bgt	r18,$Loop0		C E1
+$Lend0:	stq	r0,-8(r16)		C E0
+$Lret:	ret	r31,(r26),1		C E1
+EPILOGUE(mpn_copyd)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/copyi.asm b/third_party/gmp/mpn/alpha/copyi.asm
new file mode 100644
index 0000000..f7e2ad6
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/copyi.asm
@@ -0,0 +1,86 @@
+dnl  Alpha mpn_copyi -- copy, incrementing.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     4
+C EV5:     1.75
+C EV6:     1
+
+C INPUT PARAMETERS
+C rp	r16
+C up	r17
+C n	r18
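+
+C  Equivalent C (our sketch); copying from the lowest limb up makes this
+C  safe for overlapping operands with rp < up:
+C
+C	for (i = 0; i < n; i++)
+C	  rp[i] = up[i];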
+
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	lda	r18,-8(r18)		C E0
+	blt	r18,$Lend		C E1
+$Loop:	ldq	r0,0(r17)		C E0
+	ldq	r1,8(r17)		C E1
+	ldq	r2,16(r17)		C E0
+	ldq	r3,24(r17)		C E1
+	ldq	r4,32(r17)		C E0
+	ldq	r5,40(r17)		C E1
+	ldq	r6,48(r17)		C E0
+	ldq	r7,56(r17)		C E1
+	stq	r0,0(r16)		C E0
+	lda	r17,64(r17)		C E1
+	stq	r1,8(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r2,16(r16)		C E0
+	lda	r18,-8(r18)		C E1
+	stq	r3,24(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r4,32(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r5,40(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r6,48(r16)		C E0
+	bis	r31, r31, r31		C E1
+	stq	r7,56(r16)		C E0
+	lda	r16,64(r16)		C E1
+	bge	r18,$Loop		C E1
+$Lend:	lda	r18,7(r18)		C E0
+	blt	r18,$Lret		C E1
+	ldq	r0,0(r17)		C E0
+	beq	r18,$Lend0		C E1
+$Loop0:	stq	r0,0(r16)		C E0
+	lda	r16,8(r16)		C E1
+	ldq	r0,8(r17)		C E0
+	lda	r18,-1(r18)		C E1
+	lda	r17,8(r17)		C E0
+	bgt	r18,$Loop0		C E1
+$Lend0:	stq	r0,0(r16)		C E0
+$Lret:	ret	r31,(r26),1		C E1
+EPILOGUE(mpn_copyi)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/default.m4 b/third_party/gmp/mpn/alpha/default.m4
new file mode 100644
index 0000000..8fe7c4e
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/default.m4
@@ -0,0 +1,127 @@
+divert(-1)
+
+dnl  m4 macros for alpha assembler (everywhere except unicos).
+
+
+dnl  Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Usage: ASM_START()
+define(`ASM_START',
+m4_assert_numargs(0)
+`	.set noreorder
+	.set noat')
+
+dnl  Usage: X(value)
+define(`X',
+m4_assert_numargs(1)
+`0x$1')
+
+dnl  Usage: FLOAT64(label,value)
+define(`FLOAT64',
+m4_assert_numargs(2)
+`	.align	3
+$1:	.t_floating $2')
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',gp,,
+`ifelse(`$2',noalign,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter
+')')')')dnl
+	.text
+ifelse(`$2',noalign,,`	ALIGN(16)')
+	.globl	$1
+	.ent	$1
+$1:
+	.frame r30,0,r26,0
+ifelse(`$2',gp,`	ldgp	r29, 0(r27)
+`$'$1..ng:')
+	.prologue ifelse(`$2',gp,1,0)')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.end	$1')
+
+
+dnl  Usage: LDGP(dst,src)
+dnl
+dnl  Emit an "ldgp dst,src", but only if the system uses a GOT.
+
+define(LDGP,
+m4_assert_numargs(2)
+`ldgp	`$1', `$2'')
+
+
+dnl  Usage: EXTERN(variable_name)
+define(`EXTERN',
+m4_assert_numargs(1)
+)
+
+dnl  Usage: r0 ... r31
+dnl         f0 ... f31
+dnl
+dnl  Map register names r0 to $0, and f0 to $f0, etc.
+dnl  This is needed on all systems but Unicos
+dnl
+dnl  defreg() is used to protect the $ in $0 (otherwise it would represent a
+dnl  macro argument).  Double quoting is used to protect the f0 in $f0
+dnl  (otherwise it would be an infinite recursion).
+
+forloop(i,0,31,`defreg(`r'i,$i)')
+forloop(i,0,31,`deflit(`f'i,``$f''i)')
+
+
+dnl  Usage: DATASTART(name,align)  or  DATASTART(name)
+dnl         DATAEND()
+
+define(`DATASTART',
+m4_assert_numargs_range(1,2)
+`	RODATA
+	ALIGN(ifelse($#,1,2,$2))
+$1:')
+define(`DATAEND',
+m4_assert_numargs(0)
+)
+
+dnl  Load a symbolic address into a register
+define(`LEA',
+m4_assert_numargs(2)
+`lda	$1, $2')
+
+dnl  Usage: ASM_END()
+define(`ASM_END',
+m4_assert_numargs(0)
+)
+
+divert
diff --git a/third_party/gmp/mpn/alpha/dive_1.c b/third_party/gmp/mpn/alpha/dive_1.c
new file mode 100644
index 0000000..349d581
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/dive_1.c
@@ -0,0 +1,114 @@
+/* Alpha mpn_divexact_1 -- mpn by limb exact division.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/*      cycles/limb
+   EV4:    47.0
+   EV5:    30.0
+   EV6:    15.0
+*/
+
+
+/* The dependent chain is as follows (the same as modexact), and this is
+   what the code runs as.
+
+       ev4    ev5   ev6
+        1      1     1    sub    y = x - h
+       23     13     7    mulq   q = y * inverse
+       23     15     7    umulh  h = high (q * d)
+       --     --    --
+       47     30    15
+
+   The time to load src[i+1] and establish x hides under the umulh latency.  */
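+
+/* Usage sketch (our example, not from the GMP sources): the divisor need
+   not be odd -- even divisors are handled by the shift setup below -- but
+   the division must be exact; e.g. mpn_divexact_1 (dst, src, size, 3) is
+   valid only when 3 divides the number {src,size}.  */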
+
+void
+mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+  mp_limb_t  inverse, lshift_mask, s, sr, s_next, c, h, x, y, q, dummy;
+  unsigned   rshift, lshift;
+
+  ASSERT (size >= 1);
+  ASSERT (divisor != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (divisor);
+
+  s_next = *src++;   /* src[0] */
+
+  rshift = 0;
+  lshift_mask = 0;
+  if ((divisor & 1) == 0)
+    {
+      count_trailing_zeros (rshift, divisor);
+      lshift_mask = MP_LIMB_T_MAX;
+      divisor >>= rshift;
+    }
+
+  binvert_limb (inverse, divisor);
+  lshift = 64 - rshift;
+
+  c = 0;
+  h = 0;
+  sr = s_next >> rshift;
+
+  size--;
+  if (LIKELY (size != 0))
+    {
+      do
+        {
+          s_next = *src++;      /* src[i+1] */
+          s = sr | ((s_next << lshift) & lshift_mask);
+          x = s - c;
+          c = s < c;
+          sr = s_next >> rshift;
+
+          y = x - h;
+          c += (x < h);
+          q = y * inverse;
+          *dst++ = q;
+          umul_ppmm (h, dummy, q, divisor);
+
+          size--;
+        }
+      while (size != 0);
+    }
+
+  x = sr - c;
+  y = x - h;
+  q = y * inverse;
+  *dst = q;         /* dst[size-1] */
+}
diff --git a/third_party/gmp/mpn/alpha/divrem_2.asm b/third_party/gmp/mpn/alpha/divrem_2.asm
new file mode 100644
index 0000000..046b246
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/divrem_2.asm
@@ -0,0 +1,177 @@
+dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		norm	frac
+C ev4
+C ev5		70	70
+C ev6		29	29
+
+C TODO
+C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
+C    any registers (thus save ~10 cycles per call).
+C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
+C    or two.
+C  * Check cluster delays (for ev6).  We very likely could save some cycles.
+C  * Use branch-free code for computing di.
+C  * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call.
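+
+C  Interface, as we understand this internal routine: {up_param, un_param}
+C  is divided by the normalized 2-limb divisor {dp, 2}, with fn extra
+C  (zero) fraction limbs of quotient developed; quotient limbs go to qp,
+C  the most significant quotient limb (0 or 1) is the return value, and
+C  the 2-limb remainder replaces the two most significant dividend limbs.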
+
+C INPUT PARAMETERS
+define(`qp',		`r16')
+define(`fn',		`r17')
+define(`up_param',	`r18')
+define(`un_param',	`r19')
+define(`dp',		`r20')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2,gp)
+	lda	r30, -80(r30)
+	stq	r26, 0(r30)
+	stq	r9, 8(r30)
+	stq	r10, 16(r30)
+	stq	r11, 24(r30)
+	stq	r12, 32(r30)
+	stq	r13, 40(r30)
+C	stq	r14, 48(r30)
+	stq	r15, 56(r30)
+	.prologue	1
+	stq	r16, 64(r30)
+	bis	r31, r17, r15
+	s8addq	r19, r18, r13
+	lda	r13, -24(r13)
+	ldq	r12, 8(r20)
+	ldq	r10, 0(r20)
+	ldq	r11, 16(r13)
+	ldq	r9, 8(r13)
+
+	bis	r31, r31, r3		C most_significant_q_limb = 0
+	cmpult	r11, r12, r1
+	bne	r1, L(L8)
+	cmpule	r11, r12, r1
+	cmpult	r9, r10, r2
+	and	r1, r2, r1
+	bne	r1, L(L8)
+	subq	r11, r12, r11
+	subq	r11, r2, r11
+	subq	r9, r10, r9
+	lda	r3, 1(r31)		C most_significant_q_limb = 1
+L(L8):	stq	r3, 72(r30)
+
+	addq	r15, r19, r19
+	lda	r19, -3(r19)
+	blt	r19, L(L10)
+	bis	r31, r12, r16
+	jsr	r26, mpn_invert_limb
+	LDGP(	r29, 0(r26))
+	mulq	r0, r12, r4		C t0 = LO(di * d1)
+	umulh	r0, r10, r2		C s1 = HI(di * d0)
+	addq	r4, r10, r4		C t0 += d0
+	cmpule	r10, r4, r7		C (t0 < d0)
+	addq	r4, r2, r4		C t0 += s1
+	cmpult	r4, r2, r1
+	subq	r1, r7, r7		C t1 (-1, 0, or 1)
+	blt	r7, L(L42)
+L(L22):
+	lda	r0, -1(r0)		C di--
+	cmpult	r4, r12, r1		C cy for: t0 -= d1 (below)
+	subq	r7, r1, r7		C t1 -= cy
+	subq	r4, r12, r4		C t0 -= d1
+	bge	r7, L(L22)
+L(L42):
+	ldq	r16, 64(r30)
+	s8addq	r19, r16, r16
+	ALIGN(16)
+L(loop):
+	mulq	r11, r0, r5		C q0 (early)
+	umulh	r11, r0, r6		C q  (early)
+	addq	r5, r9, r8		C q0 += n1
+	addq	r6, r11, r6		C q  += n2
+	cmpult	r8, r5, r1		C cy for: q0 += n1
+	addq	r6, r1, r6		C q  += cy
+	unop
+	mulq	r12, r6, r1		C LO(d1 * q)
+	umulh	r10, r6, r7		C t1 = HI(d0 * q)
+	subq	r9, r1, r9		C n1 -= LO(d1 * q)
+	mulq	r10, r6, r4		C t0 = LO(d0 * q)
+	unop
+	cmple	r15, r19, r5		C condition and n0...
+	beq	r5, L(L31)
+	ldq	r5, 0(r13)
+	lda	r13, -8(r13)
+L(L31):	subq	r9, r12, r9		C n1 -= d1
+	cmpult	r5, r10, r1		C
+	subq	r9, r1, r9		C
+	subq	r5, r10, r5		C n0 -= d0
+	subq	r9, r7, r9		C n1 -= t0
+	cmpult	r5, r4, r1		C
+	subq	r9, r1, r2		C
+	subq	r5, r4, r5		C n0 -= t1
+	cmpult	r2, r8, r1		C (n1 < q0)
+	addq	r6, r1, r6		C q += cond
+	lda	r1, -1(r1)		C -(n1 >= q0)
+	and	r1, r10, r4		C
+	addq	r5, r4, r9		C n0 += mask & d0
+	and	r1, r12, r1		C
+	cmpult	r9, r5, r11		C cy for: n0 += mask & d0
+	addq	r2, r1, r1		C n1 += mask & d1
+	addq	r1, r11, r11		C n1 += cy
+	cmpult	r11, r12, r1		C
+	beq	r1, L(fix)		C
+L(bck):	stq	r6, 0(r16)
+	lda	r16, -8(r16)
+	lda	r19, -1(r19)
+	bge	r19, L(loop)
+
+L(L10):	stq	r9, 8(r13)
+	stq	r11, 16(r13)
+	ldq	r0, 72(r30)
+	ldq	r26, 0(r30)
+	ldq	r9, 8(r30)
+	ldq	r10, 16(r30)
+	ldq	r11, 24(r30)
+	ldq	r12, 32(r30)
+	ldq	r13, 40(r30)
+C	ldq	r14, 48(r30)
+	ldq	r15, 56(r30)
+	lda	r30, 80(r30)
+	ret	r31, (r26), 1
+
+L(fix):	cmpule	r11, r12, r1
+	cmpult	r9, r10, r2
+	and	r1, r2, r1
+	bne	r1, L(bck)
+	subq	r11, r12, r11
+	subq	r11, r2, r11
+	subq	r9, r10, r9
+	lda	r6, 1(r6)
+	br	L(bck)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev5/diveby3.asm b/third_party/gmp/mpn/alpha/ev5/diveby3.asm
new file mode 100644
index 0000000..3758188
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev5/diveby3.asm
@@ -0,0 +1,332 @@
+dnl  Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
+
+dnl  Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    22
+C EV5:    11.5
+C EV6:     6.3		Note that mpn_bdiv_dbm1c is faster
+
+C TODO
+C  * Remove the unops; they benefit just ev6, which no longer uses this file.
+C  * Try prefetch for destination, using lds.
+C  * Improve feed-in code by moving the initial mulq earlier; make the
+C    initial load to u0/u0 to save some copying.
+C  * Combine u0 and u2, u1 and u3.
+
+C INPUT PARAMETERS
+define(`rp',	`r16')
+define(`up',	`r17')
+define(`n',	`r18')
+define(`cy',	`r19')
+
+ASM_START()
+
+DATASTART(L(LC),8)
+	.quad	0xAAAAAAAAAAAAAAAB
+	.quad	0x5555555555555555
+	.quad	0xAAAAAAAAAAAAAAAA
+DATAEND()
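+
+C  Our note on the constants: 0xAAAAAAAAAAAAAAAB is the multiplicative
+C  inverse of 3 mod 2^64, so mulq by it divides an exactly-divisible limb
+C  by 3; 0x5555555555555555 = (2^64-1)/3 and 0xAAAAAAAAAAAAAAAA =
+C  2*(2^64-1)/3 are the thresholds each output limb is compared against to
+C  deduce the borrows propagating into the next limb.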
+
+define(`xAAAAAAAAAAAAAAAB',	`r20')
+define(`x5555555555555555',	`r21')
+define(`xAAAAAAAAAAAAAAAA',	`r22')
+define(`u0',	`r0')	define(`u1',	`r1')
+define(`u2',	`r2')	define(`u3',	`r3')
+define(`l0',	`r25')	define(`x',	`r8')
+define(`q0',	`r4')	define(`q1',	`r5')
+define(`p6',	`r6')	define(`p7',	`r7')
+define(`t0',	`r23')	define(`t1',	`r24')
+define(`cymask',`r28')
+
+
+PROLOGUE(mpn_divexact_by3c,gp)
+
+	ldq	r28, 0(up)			C load first limb early
+
+C Put magic constants in registers
+	lda	r0, L(LC)
+	ldq	xAAAAAAAAAAAAAAAB, 0(r0)
+	ldq	x5555555555555555, 8(r0)
+	ldq	xAAAAAAAAAAAAAAAA, 16(r0)
+
+C Compute initial l0 value
+	cmpeq	cy, 1, p6
+	cmpeq	cy, 2, p7
+	negq	p6, p6
+	and	p6, x5555555555555555, l0
+	cmovne	p7, xAAAAAAAAAAAAAAAA, l0
+
+C Feed-in depending on (n mod 4)
+	and	n, 3, r8
+	lda	n, -3(n)
+	cmpeq	r8, 1, r4
+	cmpeq	r8, 2, r5
+	bne	r4, $Lb01
+	bne	r5, $Lb10
+	beq	r8, $Lb00
+
+$Lb11:	ldq	u3, 8(up)
+	lda	up, -24(up)
+	lda	rp, -24(rp)
+	mulq	r28, xAAAAAAAAAAAAAAAB, q0
+	mov	r28, u2
+	br	r31, $L11
+
+$Lb00:	ldq	u2, 8(up)
+	lda	up, -16(up)
+	lda	rp, -16(rp)
+	mulq	r28, xAAAAAAAAAAAAAAAB, q1
+	mov	r28, u1
+	br	r31, $L00
+
+$Lb01:	lda	rp, -8(rp)
+	mulq	r28, xAAAAAAAAAAAAAAAB, q0
+	mov	r28, u0
+	blt	n, $Lcj1
+	ldq	u1, 8(up)
+	lda	up, -8(up)
+	br	r31, $L01
+
+$Lb10:	ldq	u0, 8(up)
+	mulq	r28, xAAAAAAAAAAAAAAAB, q1
+	mov	r28, u3
+	blt	n, $Lend
+
+	ALIGN(16)
+$Ltop:
+C 0
+	cmpult	u3, cy, cy			C L0
+	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
+	ldq	u1, 16(up)			C L1
+	addq	q1, l0, x			C U0
+C 1
+	negq	cy, cymask			C L0
+	unop					C U1
+	unop					C L1
+	cmpult	x5555555555555555, x, p6	C U0
+C 2
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
+	unop
+	unop
+	negq	p6, t0				C L0
+C 3
+	negq	p7, t1				C L0
+	and	cymask, x5555555555555555, l0	C U1
+	addq	p6, cy, cy
+	and	t0, x5555555555555555, t0
+C 4
+	and	t1, x5555555555555555, t1
+	addq	p7, cy, cy
+	unop
+	addq	t0, l0, l0
+C 5
+	addq	t1, l0, l0
+	unop
+	stq	x, 0(rp)			C L1
+	unop
+$L01:
+C 0
+	cmpult	u0, cy, cy			C L0
+	mulq	u1, xAAAAAAAAAAAAAAAB, q1	C U1
+	ldq	u2, 24(up)			C L1
+	addq	q0, l0, x			C U0
+C 1
+	negq	cy, cymask			C L0
+	unop					C U1
+	unop					C L1
+	cmpult	x5555555555555555, x, p6	C U0
+C 2
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
+	unop
+	unop
+	negq	p6, t0				C L0
+C 3
+	negq	p7, t1				C L0
+	and	cymask, x5555555555555555, l0	C U1
+	addq	p6, cy, cy
+	and	t0, x5555555555555555, t0
+C 4
+	and	t1, x5555555555555555, t1
+	addq	p7, cy, cy
+	unop
+	addq	t0, l0, l0
+C 5
+	addq	t1, l0, l0
+	unop
+	stq	x, 8(rp)			C L1
+	unop
+$L00:
+C 0
+	cmpult	u1, cy, cy			C L0
+	mulq	u2, xAAAAAAAAAAAAAAAB, q0	C U1
+	ldq	u3, 32(up)			C L1
+	addq	q1, l0, x			C U0
+C 1
+	negq	cy, cymask			C L0
+	unop					C U1
+	unop					C L1
+	cmpult	x5555555555555555, x, p6	C U0
+C 2
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
+	unop
+	unop
+	negq	p6, t0				C L0
+C 3
+	negq	p7, t1				C L0
+	and	cymask, x5555555555555555, l0	C U1
+	addq	p6, cy, cy
+	and	t0, x5555555555555555, t0
+C 4
+	and	t1, x5555555555555555, t1
+	addq	p7, cy, cy
+	unop
+	addq	t0, l0, l0
+C 5
+	addq	t1, l0, l0
+	unop
+	stq	x, 16(rp)			C L1
+	unop
+$L11:
+C 0
+	cmpult	u2, cy, cy			C L0
+	mulq	u3, xAAAAAAAAAAAAAAAB, q1	C U1
+	ldq	u0, 40(up)			C L1
+	addq	q0, l0, x			C U0
+C 1
+	negq	cy, cymask			C L0
+	unop					C U1
+	unop					C L1
+	cmpult	x5555555555555555, x, p6	C U0
+C 2
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
+	lda	n, -4(n)			C L1 bookkeeping
+	unop
+	negq	p6, t0				C L0
+C 3
+	negq	p7, t1				C L0
+	and	cymask, x5555555555555555, l0	C U1
+	addq	p6, cy, cy
+	and	t0, x5555555555555555, t0
+C 4
+	and	t1, x5555555555555555, t1
+	addq	p7, cy, cy
+	unop
+	addq	t0, l0, l0
+C 5
+	addq	t1, l0, l0
+	unop
+	stq	x, 24(rp)			C L1
+	lda	up, 32(up)
+C
+	ldl	r31, 256(up)			C prefetch
+	unop
+	lda	rp, 32(rp)
+	bge	n, $Ltop			C U1
+C *** MAIN LOOP END ***
+$Lend:
+
+	cmpult	u3, cy, cy			C L0
+	mulq	u0, xAAAAAAAAAAAAAAAB, q0	C U1
+	unop
+	addq	q1, l0, x			C U0
+C 1
+	negq	cy, cymask			C L0
+	unop					C U1
+	unop					C L1
+	cmpult	x5555555555555555, x, p6	C U0
+C 2
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
+	unop
+	unop
+	negq	p6, t0				C L0
+C 3
+	negq	p7, t1				C L0
+	and	cymask, x5555555555555555, l0	C U1
+	addq	p6, cy, cy
+	and	t0, x5555555555555555, t0
+C 4
+	and	t1, x5555555555555555, t1
+	addq	p7, cy, cy
+	unop
+	addq	t0, l0, l0
+C 5
+	addq	t1, l0, l0
+	unop
+	stq	x, 0(rp)			C L1
+	unop
+$Lcj1:
+	cmpult	u0, cy, cy			C L0
+	addq	q0, l0, x			C U0
+	cmpult	x5555555555555555, x, p6	C U0
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7	C U1
+	addq	p6, cy, cy
+	addq	p7, cy, r0
+	stq	x, 8(rp)			C L1
+
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
+
+C This is useful for playing with various schedules.
+C Expand as: one(0)one(1)one(2)one(3)
+define(`one',`
+C 0
+	cmpult	`$'eval(($1+3)%4), cy, cy		C L0
+	mulq	`$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
+	ldq	`$'eval(($1+1)%4), eval($1*8+16)(up)	C L1
+	addq	`$'eval(4+($1+1)%2), l0, x		C U0
+C 1
+	negq	cy, cymask				C L0
+	unop						C U1
+	unop						C L1
+	cmpult	x5555555555555555, x, p6		C U0
+C 2
+	cmpult	xAAAAAAAAAAAAAAAA, x, p7		C U1
+	unop
+	unop
+	negq	p6, t0					C L0
+C 3
+	negq	p7, t1					C L0
+	and	cymask, x5555555555555555, l0		C U1
+	addq	p6, cy, cy
+	and	t0, x5555555555555555, t0
+C 4
+	and	t1, x5555555555555555, t1
+	addq	p7, cy, cy
+	unop
+	addq	t0, l0, l0
+C 5
+	addq	t1, l0, l0
+	unop
+	stq	x, eval($1*8)(rp)			C L1
+	unop
+')
diff --git a/third_party/gmp/mpn/alpha/ev5/gmp-mparam.h b/third_party/gmp/mpn/alpha/ev5/gmp-mparam.h
new file mode 100644
index 0000000..1575a28
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev5/gmp-mparam.h
@@ -0,0 +1,191 @@
+/* Alpha EV5 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 600 MHz 21164A */
+/* FFT tuning limit = 5000000 */
+/* Generated by tuneup.c, 2017-02-02, gcc 4.9 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        22
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     20
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           69
+
+#define DIV_1_VS_MUL_1_PERCENT             181
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD                50
+#define MUL_TOOM44_THRESHOLD               118
+#define MUL_TOOM6H_THRESHOLD               173
+#define MUL_TOOM8H_THRESHOLD               236
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      84
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      53
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      70
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 22
+#define SQR_TOOM3_THRESHOLD                 69
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                189
+#define SQR_TOOM8_THRESHOLD                357
+
+#define MULMID_TOOM42_THRESHOLD             18
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define MUL_FFT_MODF_THRESHOLD             284  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    284, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {      7, 5}, {     15, 6}, {     13, 7}, {      7, 6}, \
+    {     15, 7}, {      8, 6}, {     17, 7}, {     13, 8}, \
+    {      7, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     47,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     63, 8}, \
+    {    255, 7}, {    511,10}, {     71, 9}, {    143, 8}, \
+    {    287, 7}, {    575, 9}, {    159, 8}, {    319,11}, \
+    {     47,12}, {     31,11}, {     63, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287,11}, {     79,10}, \
+    {    159, 9}, {    319,10}, {    175, 9}, {    351, 8}, \
+    {    703,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415,12}, {     63,10}, {    255,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,11}, {    175,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,10}, {    415,11}, {    223,13}, \
+    {     63,11}, {    287,10}, {    575,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,12}, {    191,11}, \
+    {    415,12}, {    223,11}, {    447,10}, {    895,11}, \
+    {    479,12}, {    287,11}, {    575,12}, {    351,13}, \
+    {    191,12}, {    479,13}, {    255,12}, {    575,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    831,13}, \
+    {    447,14}, {    255,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 121
+#define MUL_FFT_THRESHOLD                 4224
+
+#define SQR_FFT_MODF_THRESHOLD             240  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    240, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     14, 5}, {     29, 7}, {      9, 6}, {     19, 7}, \
+    {     13, 6}, {     27, 8}, {      7, 7}, {     21, 8}, \
+    {     11, 7}, {     29, 8}, {     19, 9}, {     11, 8}, \
+    {     27,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255,10}, {     71, 9}, {    143, 8}, \
+    {    287,10}, {     79,11}, {     47,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,11}, {     79,10}, {    159, 9}, {    319,10}, \
+    {    175,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207, 9}, {    415,11}, {    111,10}, {    223,12}, \
+    {     63,11}, {    175,12}, {     95,11}, {    207,13}, \
+    {     63,12}, {    127,11}, {    287,12}, {    159,11}, \
+    {    351,12}, {    191,11}, {    415,12}, {    223,11}, \
+    {    447,13}, {    127,12}, {    351,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,13}, {    319,12}, {    703,13}, \
+    {    383,12}, {    831,13}, {    447,14}, {    255,13}, \
+    {    511,12}, {   1023,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 105
+#define SQR_FFT_THRESHOLD                 3968
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  50
+#define MULLO_MUL_N_THRESHOLD             5558
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  78
+#define SQRLO_SQR_THRESHOLD               3597
+
+#define DC_DIV_QR_THRESHOLD                 47
+#define DC_DIVAPPR_Q_THRESHOLD             167
+#define DC_BDIV_QR_THRESHOLD                47
+#define DC_BDIV_Q_THRESHOLD                110
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               181
+#define INV_APPR_THRESHOLD                 173
+
+#define BINV_NEWTON_THRESHOLD              182
+#define REDC_1_TO_REDC_N_THRESHOLD          47
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               90
+#define MU_BDIV_QR_THRESHOLD               748
+#define MU_BDIV_Q_THRESHOLD                979
+
+#define POWM_SEC_TABLE  1,16,90,386,2177
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               363
+#define SET_STR_PRECOMPUTE_THRESHOLD      1201
+
+#define FAC_DSC_THRESHOLD                  342
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                     105
+#define HGCD_APPR_THRESHOLD                108
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   238
+#define GCDEXT_DC_THRESHOLD                199
+#define JACOBI_BASE_METHOD                   2
diff --git a/third_party/gmp/mpn/alpha/ev6/add_n.asm b/third_party/gmp/mpn/alpha/ev6/add_n.asm
new file mode 100644
index 0000000..9261f31
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/add_n.asm
@@ -0,0 +1,283 @@
+dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl  store sum in a third limb vector.
+
+dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     5.4
+C EV6:     2.125
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  vp	r18
+C  n	r19
+C  cy	r20   (for mpn_add_nc)
+
+C TODO
+C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C   Use multi-pronged feed-in.
+C   Perform additional micro-tuning
+
+C  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C  Pair loads and stores where possible
+C  Store pairs oct-aligned where possible (didn't need it here)
+C  Stores are delayed every third cycle
+C  Loads and stores are delayed by fills
+C  U stays still, put code there where possible (note alternation of U1 and U0)
+C  L moves because of loads and stores
+C  Note dampers in L to limit damage
+
+C  This odd-looking optimization expects that we are seeing random bits in
+C  our data, so that a pure zero result is unlikely.  So we penalize the
+C  unlikely case to help the common case.
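+C
+C  Concretely (our reading): folding in the carry with a plain addq can
+C  only itself carry out when the resulting limb is exactly zero, so that
+C  rare case branches out of line to a $fix* block, which ORs the lost
+C  carry forward, keeping a second compare off the main path.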
+
+define(`u0', `r0')  define(`u1', `r3')
+define(`v0', `r1')  define(`v1', `r4')
+
+define(`cy0', `r20')  define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+	br	r31,	$entry
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	bis	r31,	r31,	cy0	C clear carry in
+$entry:	cmpult	r19,	5,	r22	C L1 move counter
+	ldq	u1,	0(r17)		C L0 get next ones
+	ldq	v1,	0(r18)		C L1
+	bne	r22,	$Lsmall
+
+	ldq	u0,	8(r17)		C L0 get next ones
+	ldq	v0,	8(r18)		C L1
+	addq	u1,	v1,	r5	C U0 add two data
+
+	cmpult	r5,	v1,	r23	C U0 did it carry
+	ldq	u1,	16(r17)		C L0 get next ones
+	ldq	v1,	16(r18)		C L1
+
+	addq	u0,	v0,	r8	C U1 add two data
+	addq	r5,	cy0,	r5	C U0 carry in
+
+	cmpult	r8,	v0,	r22	C U1 did it carry
+	beq	r5,	$fix5f		C U0 fix exact zero
+$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
+	ldq	v0,	24(r18)		C L1
+
+	addq	r8,	r23,	r8	C U1 carry from last
+	addq	u1,	v1,	r7	C U0 add two data
+
+	beq	r8,	$fix6f		C U1 fix exact zero
+$ret6f:	cmpult	r7,	v1,	r23	C U0 did it carry
+	ldq	u1,	32(r17)		C L0 get next ones
+	ldq	v1,	32(r18)		C L1
+
+	lda	r17,	40(r17)		C L0 move pointer
+	lda	r18,	40(r18)		C L1 move pointer
+
+	lda	r16,	-8(r16)
+	lda	r19,	-13(r19)	C L1 move counter
+	blt	r19,	$Lend		C U1 loop control
+
+
+C Main loop.  8-way unrolled.
+	ALIGN(16)
+$Loop:	addq	u0,	v0,	r2	C U1 add two data
+	addq	r7,	r22,	r7	C U0 add in carry
+	stq	r5,	8(r16)		C L0 put an answer
+	stq	r8,	16(r16)		C L1 pair
+
+	cmpult	r2,	v0,	cy1	C U1 did it carry
+	beq	r7,	$fix7		C U0 fix exact 0
+$ret7:	ldq	u0,	0(r17)		C L0 get next ones
+	ldq	v0,	0(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	addq	r2,	r23,	r2	C U1 carry from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	addq	u1,	v1,	r5	C U0 add two data
+
+	beq	r2,	$fix0		C U1 fix exact zero
+$ret0:	cmpult	r5,	v1,	cy0	C U0 did it carry
+	ldq	u1,	8(r17)		C L0 get next ones
+	ldq	v1,	8(r18)		C L1
+
+	addq	u0,	v0,	r8	C U1 add two data
+	addq	r5,	cy1,	r5	C U0 carry from last
+	stq	r7,	24(r16)		C L0 store pair
+	stq	r2,	32(r16)		C L1
+
+	cmpult	r8,	v0,	r22	C U1 did it carry
+	beq	r5,	$fix1		C U0 fix exact zero
+$ret1:	ldq	u0,	16(r17)		C L0 get next ones
+	ldq	v0,	16(r18)		C L1
+
+	lda	r16,	64(r16)		C L0 move pointer
+	addq	r8,	cy0,	r8	C U1 carry from last
+	lda	r19,	-8(r19)		C L1 move counter
+	addq	u1,	v1,	r7	C U0 add two data
+
+	beq	r8,	$fix2		C U1 fix exact zero
+$ret2:	cmpult	r7,	v1,	r23	C U0 did it carry
+	ldq	u1,	24(r17)		C L0 get next ones
+	ldq	v1,	24(r18)		C L1
+
+	addq	u0,	v0,	r2	C U1 add two data
+	addq	r7,	r22,	r7	C U0 add in carry
+	stq	r5,	-24(r16)	C L0 put an answer
+	stq	r8,	-16(r16)	C L1 pair
+
+	cmpult	r2,	v0,	cy1	C U1 did it carry
+	beq	r7,	$fix3		C U0 fix exact 0
+$ret3:	ldq	u0,	32(r17)		C L0 get next ones
+	ldq	v0,	32(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	addq	r2,	r23,	r2	C U1 carry from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	addq	u1,	v1,	r5	C U0 add two data
+
+	beq	r2,	$fix4		C U1 fix exact zero
+$ret4:	cmpult	r5,	v1,	cy0	C U0 did it carry
+	ldq	u1,	40(r17)		C L0 get next ones
+	ldq	v1,	40(r18)		C L1
+
+	addq	u0,	v0,	r8	C U1 add two data
+	addq	r5,	cy1,	r5	C U0 carry from last
+	stq	r7,	-8(r16)		C L0 store pair
+	stq	r2,	0(r16)		C L1
+
+	cmpult	r8,	v0,	r22	C U1 did it carry
+	beq	r5,	$fix5		C U0 fix exact zero
+$ret5:	ldq	u0,	48(r17)		C L0 get next ones
+	ldq	v0,	48(r18)		C L1
+
+	ldl	r31, 256(r17)		C L0 prefetch
+	addq	r8,	cy0,	r8	C U1 carry from last
+	ldl	r31, 256(r18)		C L1 prefetch
+	addq	u1,	v1,	r7	C U0 add two data
+
+	beq	r8,	$fix6		C U1 fix exact zero
+$ret6:	cmpult	r7,	v1,	r23	C U0 did it carry
+	ldq	u1,	56(r17)		C L0 get next ones
+	ldq	v1,	56(r18)		C L1
+
+	lda	r17,	64(r17)		C L0 move pointer
+	bis	r31,	r31,	r31	C U
+	lda	r18,	64(r18)		C L1 move pointer
+	bge	r19,	$Loop		C U1 loop control
+C ==== main loop end
+
+$Lend:	addq	u0,	v0,	r2	C U1 add two data
+	addq	r7,	r22,	r7	C U0 add in carry
+	stq	r5,	8(r16)		C L0 put an answer
+	stq	r8,	16(r16)		C L1 pair
+	cmpult	r2,	v0,	cy1	C U1 did it carry
+	beq	r7,	$fix7c		C U0 fix exact 0
+$ret7c:	addq	r2,	r23,	r2	C U1 carry from last
+	addq	u1,	v1,	r5	C U0 add two data
+	beq	r2,	$fix0c		C U1 fix exact zero
+$ret0c:	cmpult	r5,	v1,	cy0	C U0 did it carry
+	addq	r5,	cy1,	r5	C U0 carry from last
+	stq	r7,	24(r16)		C L0 store pair
+	stq	r2,	32(r16)		C L1
+	beq	r5,	$fix1c		C U0 fix exact zero
+$ret1c:	stq	r5,	40(r16)		C L0 put an answer
+	lda	r16,	48(r16)		C L0 move pointer
+
+	lda	r19,	8(r19)
+	beq	r19,	$Lret
+
+	ldq	u1,	0(r17)
+	ldq	v1,	0(r18)
+$Lsmall:
+	lda	r19,	-1(r19)
+	beq	r19,	$Lend0
+
+	ALIGN(8)
+$Loop0:	addq	u1,	v1,	r2	C main add
+	cmpult	r2,	v1,	r8	C compute cy from last add
+	ldq	u1,	8(r17)
+	ldq	v1,	8(r18)
+	addq	r2,	cy0,	r5	C carry add
+	lda	r17,	8(r17)
+	lda	r18,	8(r18)
+	stq	r5,	0(r16)
+	cmpult	r5,	r2,	cy0	C compute cy from last add
+	lda	r19,	-1(r19)		C decr loop cnt
+	bis	r8,	cy0,	cy0	C combine cy from the two adds
+	lda	r16,	8(r16)
+	bne	r19,	$Loop0
+$Lend0:	addq	u1,	v1,	r2	C main add
+	addq	r2,	cy0,	r5	C carry add
+	cmpult	r2,	v1,	r8	C compute cy from last add
+	cmpult	r5,	r2,	cy0	C compute cy from last add
+	stq	r5,	0(r16)
+	bis	r8,	cy0,	r0	C combine cy from the two adds
+	ret	r31,(r26),1
+
+	ALIGN(8)
+$Lret:	lda	r0,	0(cy0)		C copy carry into return register
+	ret	r31,(r26),1
+
+$fix5f:	bis	r23,	cy0,	r23	C bring forward carry
+	br	r31,	$ret5f
+$fix6f:	bis	r22,	r23,	r22	C bring forward carry
+	br	r31,	$ret6f
+$fix0:	bis	cy1,	r23,	cy1	C bring forward carry
+	br	r31,	$ret0
+$fix1:	bis	cy0,	cy1,	cy0	C bring forward carry
+	br	r31,	$ret1
+$fix2:	bis	r22,	cy0,	r22	C bring forward carry
+	br	r31,	$ret2
+$fix3:	bis	r23,	r22,	r23	C bring forward carry
+	br	r31,	$ret3
+$fix4:	bis	cy1,	r23,	cy1	C bring forward carry
+	br	r31,	$ret4
+$fix5:	bis	cy1,	cy0,	cy0	C bring forward carry
+	br	r31,	$ret5
+$fix6:	bis	r22,	cy0,	r22	C bring forward carry
+	br	r31,	$ret6
+$fix7:	bis	r23,	r22,	r23	C bring forward carry
+	br	r31,	$ret7
+$fix0c:	bis	cy1,	r23,	cy1	C bring forward carry
+	br	r31,	$ret0c
+$fix1c:	bis	cy0,	cy1,	cy0	C bring forward carry
+	br	r31,	$ret1c
+$fix7c:	bis	r23,	r22,	r23	C bring forward carry
+	br	r31,	$ret7c
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/aorslsh1_n.asm b/third_party/gmp/mpn/alpha/ev6/aorslsh1_n.asm
new file mode 100644
index 0000000..cb966ce
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/aorslsh1_n.asm
@@ -0,0 +1,172 @@
+dnl  Alpha mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl  Copyright 2003, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     7
+C EV6:     4
+
+C TODO
+C  * Tune to reach 3.75 c/l on ev6.
+
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n', `r19')
+
+define(`u0', `r8')
+define(`u1', `r1')
+define(`v0', `r4')
+define(`v1', `r5')
+
+define(`cy0', `r0')
+define(`cy1', `r20')
+define(`cy', `r22')
+define(`rr', `r24')
+define(`ps', `r25')
+define(`sl', `r28')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADDSUB,       addq)
+  define(CARRY,       `cmpult $1,$2,$3')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADDSUB,       subq)
+  define(CARRY,       `cmpult $2,$1,$3')
+  define(func, mpn_sublsh1_n)
+')
+
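+C  A minimal C model of the limb recurrence (an illustrative sketch, not
+C  upstream code; addlsh1 shown, sublsh1 flips ADDSUB/CARRY as defined
+C  above).  hv is the bit shifted out of one v limb into the next:
+C
+C      hv = 0; cy = 0;
+C      for (i = 0; i < n; i++) {
+C        sl = (vp[i] << 1) + hv;        /* low 64 bits of 2*vp[i], plus bit */
+C        ps = up[i] + sl;   c1 = (ps < up[i]);
+C        rr = ps + cy;      c2 = (rr < ps);
+C        rp[i] = rr;
+C        hv = vp[i] >> 63;  cy = c1 + c2;
+C      }
+C      return cy + hv;
+C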
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	and	n, 2, cy0
+	blbs	n, L(bx1)
+L(bx0):	ldq	v1, 0(vp)
+	ldq	u1, 0(up)
+	lda	r2, 0(r31)
+	bne	cy0, L(b10)
+
+L(b00):	lda	vp, 48(vp)
+	lda	up, -16(up)
+	lda	rp, -8(rp)
+	lda	cy0, 0(r31)
+	br	r31, L(lo0)
+
+L(b10):	lda	vp, 32(vp)
+	lda	rp, 8(rp)
+	lda	cy0, 0(r31)
+	br	r31, L(lo2)
+
+L(bx1):	ldq	v0, 0(vp)
+	ldq	u0, 0(up)
+	lda	r3, 0(r31)
+	beq	cy0, L(b01)
+
+L(b11):	lda	vp, 40(vp)
+	lda	up, -24(up)
+	lda	rp, 16(rp)
+	lda	cy1, 0(r31)
+	br	r31, L(lo3)
+
+L(b01):	lda	n, -4(n)
+	lda	cy1, 0(r31)
+	ble	n, L(end)
+	lda	vp, 24(vp)
+	lda	up, -8(up)
+
+	ALIGN(16)
+L(top):	addq	v0, v0, r6
+	ldq	v1, -16(vp)
+	addq	r6, r3, sl	C combined vlimb
+	ldq	u1, 16(up)
+	ADDSUB	u0, sl, ps	C ulimb + (vlimb << 1)
+	cmplt	v0, r31, r2	C high v bits
+	ADDSUB	ps, cy1, rr	C consume carry from previous operation
+	CARRY(	ps, u0, cy0)	C carry out #2
+	stq	rr, 0(rp)
+	CARRY(	rr, ps, cy)	C carry out #3
+	lda	vp, 32(vp)	C bookkeeping
+	addq	cy, cy0, cy0	C final carry out
+L(lo0):	addq	v1, v1, r7
+	ldq	v0, -40(vp)
+	addq	r7, r2, sl
+	ldq	u0, 24(up)
+	ADDSUB	u1, sl, ps
+	cmplt	v1, r31, r3
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy1)
+	stq	rr, 8(rp)
+	CARRY(	rr, ps, cy)
+	lda	rp, 32(rp)	C bookkeeping
+	addq	cy, cy1, cy1
+L(lo3):	addq	v0, v0, r6
+	ldq	v1, -32(vp)
+	addq	r6, r3, sl
+	ldq	u1, 32(up)
+	ADDSUB	u0, sl, ps
+	cmplt	v0, r31, r2
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy0)
+	stq	rr, -16(rp)
+	CARRY(	rr, ps, cy)
+	lda	up, 32(up)	C bookkeeping
+	addq	cy, cy0, cy0
+L(lo2):	addq	v1, v1, r7
+	ldq	v0, -24(vp)
+	addq	r7, r2, sl
+	ldq	u0, 8(up)
+	ADDSUB	u1, sl, ps
+	cmplt	v1, r31, r3
+	ADDSUB	ps, cy0, rr
+	CARRY(	ps, u1, cy1)
+	stq	rr, -8(rp)
+	CARRY(	rr, ps, cy)
+	lda	n, -4(n)	C bookkeeping
+	addq	cy, cy1, cy1
+	bgt	n, L(top)
+
+L(end):	addq	v0, v0, r6
+	addq	r6, r3, sl
+	ADDSUB	u0, sl, ps
+	cmplt	v0, r31, r2
+	ADDSUB	ps, cy1, rr
+	CARRY(	ps, u0, cy0)
+	stq	rr, 0(rp)
+	CARRY(	rr, ps, cy)
+	addq	cy, cy0, cy0
+	addq	cy0, r2, r0
+
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/aorsmul_1.asm b/third_party/gmp/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644
index 0000000..0e68e6e
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/aorsmul_1.asm
@@ -0,0 +1,398 @@
+dnl  Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 2000, 2003-2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     3.5
+
+C  INPUT PARAMETERS
+define(`rp',	`r16')
+define(`up',	`r17')
+define(`n',	`r18')
+define(`v0',	`r19')
+
+dnl  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl  The stores can issue a cycle late, so we have paired no-ops to 'catch'
+dnl  them, damping any further disturbance to the schedule.
+
+dnl  We couldn't pair the loads, because the entangled schedule of the carries
+dnl  has to happen on one side {0} of the machine.
+
+dnl  This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl  The lockup on U0 means that any stall can't be recovered from.  Consider an
+dnl  ldq in L1, and say that load gets stalled because it collides with a fill
+dnl  from the b_cache.  On the next cycle, this load gets priority.  It first
+dnl  looks at L0, and goes there.  The instruction we intended for L0 gets to
+dnl  look at L1, which is NOT where we want it.  It either stalls for one cycle,
+dnl  because it can't go in L0, or goes there, and causes a further instruction
+dnl  to stall.
+
+dnl  So for b_cache, we're likely going to want to put one or more cycles back
+dnl  into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl  At a place where we have an mt followed by a bookkeeping, put the
+dnl  bookkeeping in upper, and the prefetch into lower.
+
+dnl  Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
+dnl  like not to have an ldq or an stq precede a conditional branch in a
+dnl  quadpack.  The conditional branch moves the retire pointer one cycle
+dnl  later.
+
+ifdef(`OPERATION_addmul_1',`
+    define(`ADDSUB',	`addq')
+    define(`CMPCY',	`cmpult	$2,$1')
+    define(`func',	`mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+    define(`ADDSUB',	`subq')
+    define(`CMPCY',	`cmpult	$1,$2')
+    define(`func',	`mpn_submul_1')
+')
+
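+C  A minimal C model of what both entry points compute (an illustrative
+C  sketch, not upstream code), with ADDSUB/CMPCY as selected above:
+C
+C      cy = 0;
+C      for (i = 0; i < n; i++) {
+C        lo = low 64 bits of up[i] * v0;   hi = high 64 bits;
+C        s = ADDSUB (rp[i], lo);    c1 = CMPCY (rp[i], s);
+C        r = ADDSUB (s, cy);        c2 = CMPCY (s, r);
+C        rp[i] = r;  cy = hi + c1 + c2;
+C      }
+C      return cy;
+C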
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	ldq	r3,	0(up)		C
+	and	r18,	7,	r20	C
+	lda	r18,	-9(r18)		C
+	cmpeq	r20,	1,	r21	C
+	beq	r21,	$L1		C
+
+$1mod8:	ldq	r5,	0(rp)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r8	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r8,	r20,	r0	C
+	stq	r23,	0(rp)		C
+	bge	r18,	$ent1		C
+	ret	r31,	(r26),	1	C
+
+$L1:	lda	r8,	0(r31)		C zero carry reg
+	lda	r24,	0(r31)		C zero carry reg
+	cmpeq	r20,	2,	r21	C
+	bne	r21,	$2mod8		C
+	cmpeq	r20,	3,	r21	C
+	bne	r21,	$3mod8		C
+	cmpeq	r20,	4,	r21	C
+	bne	r21,	$4mod8		C
+	cmpeq	r20,	5,	r21	C
+	bne	r21,	$5mod8		C
+	cmpeq	r20,	6,	r21	C
+	bne	r21,	$6mod8		C
+	cmpeq	r20,	7,	r21	C
+	beq	r21,	$0mod8		C
+
+$7mod8:	ldq	r5,	0(rp)		C
+	lda	up,	8(up)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r24	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r24,	r20,	r24	C
+	stq	r23,	0(rp)		C
+	lda	rp,	8(rp)		C
+	ldq	r3,	0(up)		C
+$6mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r25	C
+	umulh	v0,	r3,	r3	C
+	mulq	v0,	r1,	r28	C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r8	C
+	ldq	r1,	24(up)		C
+	lda	up,	48(up)		C L1 bookkeeping
+	mulq	v0,	r0,	r2	C
+	ldq	r5,	8(rp)		C
+	lda	rp,	-32(rp)		C L1 bookkeeping
+	umulh	v0,	r0,	r6	C
+	ADDSUB	r4,	r25,	r25	C lo + acc
+	mulq	v0,	r1,	r7	C
+	br	r31,	$ent6		C
+
+$ent1:	lda	up,	8(up)		C
+	lda	rp,	8(rp)		C
+	lda	r8,	0(r0)		C
+	ldq	r3,	0(up)		C
+$0mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r2	C
+	umulh	v0,	r3,	r6	C
+	mulq	v0,	r1,	r7	C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r24	C
+	ldq	r1,	24(up)		C
+	mulq	v0,	r0,	r25	C
+	ldq	r5,	8(rp)		C
+	umulh	v0,	r0,	r3	C
+	ADDSUB	r4,	r2,	r2	C lo + acc
+	mulq	v0,	r1,	r28	C
+	lda	rp,	-16(rp)		C
+	br	r31,	$ent0		C
+
+$3mod8:	ldq	r5,	0(rp)		C
+	lda	up,	8(up)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r8	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r8,	r20,	r24	C
+	stq	r23,	0(rp)		C
+	lda	rp,	8(rp)		C
+	ldq	r3,	0(up)		C
+$2mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r25	C
+	umulh	v0,	r3,	r3	C
+	mulq	v0,	r1,	r28	C
+	ble	r18,	$n23		C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r8	C
+	ldq	r1,	24(up)		C
+	lda	up,	16(up)		C L1 bookkeeping
+	mulq	v0,	r0,	r2	C
+	ldq	r5,	8(rp)		C
+	lda	rp,	0(rp)		C L1 bookkeeping
+	umulh	v0,	r0,	r6	C
+	ADDSUB	r4,	r25,	r25	C lo + acc
+	mulq	v0,	r1,	r7	C
+	br	r31,	$ent2		C
+
+$5mod8:	ldq	r5,	0(rp)		C
+	lda	up,	8(up)		C
+	mulq	v0,	r3,	r7	C
+	umulh	v0,	r3,	r24	C
+	ADDSUB	r5,	r7,	r23	C
+	CMPCY(	r5,	r23),	r20	C
+	addq	r24,	r20,	r8	C
+	stq	r23,	0(rp)		C
+	lda	rp,	8(rp)		C
+	ldq	r3,	0(up)		C
+$4mod8:	ldq	r1,	8(up)		C
+	mulq	v0,	r3,	r2	C
+	umulh	v0,	r3,	r6	C
+	mulq	v0,	r1,	r7	C
+	ldq	r0,	16(up)		C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r24	C
+	ldq	r1,	24(up)		C
+	lda	up,	32(up)		C L1 bookkeeping
+	mulq	v0,	r0,	r25	C
+	ldq	r5,	8(rp)		C
+	lda	rp,	16(rp)		C L1 bookkeeping
+	umulh	v0,	r0,	r3	C
+	ADDSUB	r4,	r2,	r2	C lo + acc
+	mulq	v0,	r1,	r28	C
+	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
+	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
+	ble	r18,	$Lend		C
+	ALIGN(16)
+$Loop:
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
+	addq	r6,	r20,	r6	C U0 hi mul + carry
+	ldq	r0,	0(up)		C
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r7,	r7	C L0 lo + acc
+	addq	r6,	r21,	r6	C U0 hi mul + carry
+	ldq	r4,	0(rp)		C L1
+
+	umulh	v0,	r1,	r8	C U1
+	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
+	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
+	ldq	r1,	8(up)		C L1
+
+	mulq	v0,	r0,	r2	C U1
+	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
+	addq	r24,	r20,	r24	C U0 hi mul + carry
+	ldq	r5,	8(rp)		C L1
+
+	umulh	v0,	r0,	r6	C U1
+	ADDSUB	r4,	r25,	r25	C U0 lo + acc
+	stq	r22,	-16(rp)		C L0
+	stq	r23,	-8(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r7	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r24,	r21,	r24	C U0 hi mul + carry
+$ent2:
+	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
+	bis	r31,	r31,	r31	C U1 mt
+	lda	r18,	-8(r18)		C L1 bookkeeping
+	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
+
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
+	addq	r3,	r20,	r3	C U0 hi mul + carry
+	ldq	r0,	16(up)		C L1
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r28,	r28	C L0 lo + acc
+	addq	r3,	r21,	r3	C U0 hi mul + carry
+	ldq	r4,	16(rp)		C L1
+
+	umulh	v0,	r1,	r24	C U1
+	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
+	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
+	ldq	r1,	24(up)		C L1
+
+	mulq	v0,	r0,	r25	C U1
+	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
+	addq	r8,	r20,	r8	C U0 hi mul + carry
+	ldq	r5,	24(rp)		C L1
+
+	umulh	v0,	r0,	r3	C U1
+	ADDSUB	r4,	r2,	r2	C U0 lo + acc
+	stq	r22,	0(rp)		C L0
+	stq	r23,	8(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r28	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r8,	r21,	r8	C U0 hi mul + carry
+$ent0:
+	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
+	bis	r31,	r31,	r31	C U1 mt
+	lda	up,	64(up)		C L1 bookkeeping
+	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
+
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r2,	r22),	r21	C L0 hi add => carry
+	addq	r6,	r20,	r6	C U0 hi mul + carry
+	ldq	r0,	-32(up)		C L1
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r7,	r7	C L0 lo + acc
+	addq	r6,	r21,	r6	C U0 hi mul + carry
+	ldq	r4,	32(rp)		C L1
+
+	umulh	v0,	r1,	r8	C U1
+	CMPCY(	r5,	r7),	r20	C L0 lo add => carry
+	ADDSUB	r7,	r6,	r23	C U0 hi add => answer
+	ldq	r1,	-24(up)		C L1
+
+	mulq	v0,	r0,	r2	C U1
+	CMPCY(	r7,	r23),	r21	C L0 hi add => carry
+	addq	r24,	r20,	r24	C U0 hi mul + carry
+	ldq	r5,	40(rp)		C L1
+
+	umulh	v0,	r0,	r6	C U1
+	ADDSUB	r4,	r25,	r25	C U0 lo + acc
+	stq	r22,	16(rp)		C L0
+	stq	r23,	24(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r7	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r24,	r21,	r24	C U0 hi mul + carry
+$ent6:
+	CMPCY(	r4,	r25),	r20	C L0 lo add => carry
+	bis	r31,	r31,	r31	C U1 mt
+	lda	rp,	64(rp)		C L1 bookkeeping
+	ADDSUB	r25,	r24,	r22	C U0 hi add => answer
+
+	bis	r31,	r31,	r31	C U1 mt
+	CMPCY(	r25,	r22),	r21	C L0 hi add => carry
+	addq	r3,	r20,	r3	C U0 hi mul + carry
+	ldq	r0,	-16(up)		C L1
+
+	bis	r31,	r31,	r31	C U1 mt
+	ADDSUB	r5,	r28,	r28	C L0 lo + acc
+	addq	r3,	r21,	r3	C U0 hi mul + carry
+	ldq	r4,	-16(rp)		C L1
+
+	umulh	v0,	r1,	r24	C U1
+	CMPCY(	r5,	r28),	r20	C L0 lo add => carry
+	ADDSUB	r28,	r3,	r23	C U0 hi add => answer
+	ldq	r1,	-8(up)		C L1
+
+	mulq	v0,	r0,	r25	C U1
+	CMPCY(	r28,	r23),	r21	C L0 hi add => carry
+	addq	r8,	r20,	r8	C U0 hi mul + carry
+	ldq	r5,	-8(rp)		C L1
+
+	umulh	v0,	r0,	r3	C U1
+	ADDSUB	r4,	r2,	r2	C U0 lo + acc
+	stq	r22,	-32(rp)		C L0
+	stq	r23,	-24(rp)		C L1
+
+	bis	r31,	r31,	r31	C L0 st slosh
+	mulq	v0,	r1,	r28	C U1
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r8,	r21,	r8	C U0 hi mul + carry
+
+	CMPCY(	r4,	r2),	r20	C L0 lo add => carry
+	ADDSUB	r2,	r8,	r22	C U0 hi add => answer
+	ldl	r31,	256(up)		C prefetch up[]
+	bgt	r18,	$Loop		C U1 bookkeeping
+
+$Lend:	CMPCY(	r2,	r22),	r21	C
+	addq	r6,	r20,	r6	C
+	ADDSUB	r5,	r7,	r7	C
+	addq	r6,	r21,	r6	C
+	ldq	r4,	0(rp)		C
+	umulh	v0,	r1,	r8	C
+	CMPCY(	r5,	r7),	r20	C
+	ADDSUB	r7,	r6,	r23	C
+	CMPCY(r7,	r23),	r21	C
+	addq	r24,	r20,	r24	C
+	ldq	r5,	8(rp)		C
+	ADDSUB	r4,	r25,	r25	C
+	stq	r22,	-16(rp)		C
+	stq	r23,	-8(rp)		C
+	addq	r24,	r21,	r24	C
+	br	L(x)
+
+	ALIGN(16)
+$n23:	ldq	r4,	0(rp)		C
+	ldq	r5,	8(rp)		C
+	umulh	v0,	r1,	r8	C
+	ADDSUB	r4,	r25,	r25	C
+L(x):	CMPCY(	r4,	r25),	r20	C
+	ADDSUB	r25,	r24,	r22	C
+	CMPCY(	r25,	r22),	r21	C
+	addq	r3,	r20,	r3	C
+	ADDSUB	r5,	r28,	r28	C
+	addq	r3,	r21,	r3	C
+	CMPCY(	r5,	r28),	r20	C
+	ADDSUB	r28,	r3,	r23	C
+	CMPCY(	r28,	r23),	r21	C
+	addq	r8,	r20,	r8	C
+	stq	r22,	0(rp)		C
+	stq	r23,	8(rp)		C
+	addq	r8,	r21,	r0	C
+	ret	r31,	(r26),	1	C
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/gmp-mparam.h b/third_party/gmp/mpn/alpha/ev6/gmp-mparam.h
new file mode 100644
index 0000000..e51d6b0
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2008-2010, 2014 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define DIVEXACT_BY3_METHOD 0	/* override ../diveby3.asm */
+
+/* 500 MHz 21164 (agnesi.math.su.se) */
+/* FFT tuning limit = 20000000 */
+/* Generated by tuneup.c, 2014-03-14, gcc 3.3 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        21
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_1N_PI1_METHOD                 2
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD               8
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD               117
+#define MUL_TOOM44_THRESHOLD               124
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 59
+#define SQR_TOOM3_THRESHOLD                123
+#define SQR_TOOM4_THRESHOLD                163
+#define SQR_TOOM6_THRESHOLD                333
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             52
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD                5
+
+#define MUL_FFT_MODF_THRESHOLD             468  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    468, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     19, 7}, {     10, 6}, \
+    {     24, 7}, {     13, 6}, {     27, 7}, {     14, 6}, \
+    {     29, 7}, {     17, 6}, {     35, 7}, {     29, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     51, 9}, {     27, 8}, {     55, 9}, {     35, 8}, \
+    {     71, 9}, {     39,10}, {     23, 9}, {     55,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47, 9}, {     95,10}, {     55,11}, {     31,10}, \
+    {     79,11}, {     47,10}, {    103,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    167,11}, \
+    {     95,10}, {    199,11}, {    111,12}, {     63,11}, \
+    {    143,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319,12}, {     95,11}, {    191,10}, {    383,11}, \
+    {    207,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    335,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,14}, {    127,13}, \
+    {    255,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1343,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1215,14}, {    639,13}, {   1407,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1855,15}, \
+    {    511,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 151
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             412  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    412, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     27, 7}, {     14, 6}, {     29, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     36, 8}, \
+    {     19, 7}, {     39, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    167,11}, {     95,10}, {    191, 9}, \
+    {    383,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    271,11}, {    143,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    303,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    639,10}, {   1279,11}, \
+    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    703,11}, {   1407,12}, {    735,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1151,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1215,14}, {    639,13}, {   1407,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1791,15}, \
+    {    511,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 159
+#define SQR_FFT_THRESHOLD                 5056
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 100
+#define MULLO_MUL_N_THRESHOLD            11355
+
+#define DC_DIV_QR_THRESHOLD                124
+#define DC_DIVAPPR_Q_THRESHOLD             438
+#define DC_BDIV_QR_THRESHOLD               153
+#define DC_BDIV_Q_THRESHOLD                318
+
+#define INV_MULMOD_BNM1_THRESHOLD           62
+#define INV_NEWTON_THRESHOLD               384
+#define INV_APPR_THRESHOLD                 402
+
+#define BINV_NEWTON_THRESHOLD              381
+#define REDC_1_TO_REDC_N_THRESHOLD         110
+
+#define MU_DIV_QR_THRESHOLD               1752
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD              174
+#define MU_BDIV_QR_THRESHOLD              1387
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define POWM_SEC_TABLE  1,13,66,82,579
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     318
+#define HGCD_APPR_THRESHOLD                363
+#define HGCD_REDUCE_THRESHOLD             2384
+#define GCD_DC_THRESHOLD                  2504
+#define GCDEXT_DC_THRESHOLD                671
+#define JACOBI_BASE_METHOD                   3
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD              3754
+#define SET_STR_PRECOMPUTE_THRESHOLD      8097
+
+#define FAC_DSC_THRESHOLD                  951
+#define FAC_ODD_THRESHOLD                   24
diff --git a/third_party/gmp/mpn/alpha/ev6/mod_1_4.asm b/third_party/gmp/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 0000000..82c42ae
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,336 @@
+dnl  Alpha mpn_mod_1s_4p
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimise.  2.75 c/l should be possible.
+C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
+C  * Optimise feed-in code, starting the sw pipeline in switch code.
+C  * Shorten software pipeline.  The mul instructions are scheduled too far
+C    from their users.  Fixing this will allow us to use fewer registers.
+C  * If we cannot reduce register usage, perhaps write a small-n basecase.
+C  * Does this work for PIC?
+
+C      cycles/limb
+C EV4:     ?
+C EV5:    23
+C EV6:     3
+
+define(`ap',     `r16')
+define(`n',      `r17')
+define(`pl',     `r24')
+define(`ph',     `r25')
+define(`rl',     `r6')
+define(`rh',     `r7')
+define(`B1modb', `r1')
+define(`B2modb', `r2')
+define(`B3modb', `r3')
+define(`B4modb', `r4')
+define(`B5modb', `r5')
+
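+C  A sketch of the folding step implemented below (illustrative, not
+C  upstream text; B = 2^64, and the Bkmodb values hold B^k mod b in the
+C  normalized form produced by the cps routine):
+C
+C      (rh:rl) = a[i] + a[i+1]*B1modb + a[i+2]*B2modb + a[i+3]*B3modb
+C                + rl*B4modb + rh*B5modb
+C
+C  applied from the top of the vector downwards, followed by a final
+C  reduction of (rh:rl) mod b using the precomputed inverse.
+C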
+ASM_START()
+PROLOGUE(mpn_mod_1s_4p)
+	lda	r30, -64(r30)
+	stq	r9, 8(r30)
+	ldq	B1modb, 16(r19)
+	stq	r10, 16(r30)
+	ldq	B2modb, 24(r19)
+	stq	r11, 24(r30)
+	ldq	B3modb, 32(r19)
+	stq	r12, 32(r30)
+	ldq	B4modb, 40(r19)
+	stq	r13, 40(r30)
+	ldq	B5modb, 48(r19)
+	s8addq	n, ap, ap		C point ap at vector end
+
+	and	n, 3, r0
+	lda	n, -4(n)
+	beq	r0, L(b0)
+	lda	r6, -2(r0)
+	blt	r6, L(b1)
+	beq	r6, L(b2)
+
+L(b3):	ldq	r21, -16(ap)
+	ldq	r22, -8(ap)
+	ldq	r20, -24(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	addq	r8, r20, pl
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, rl
+	cmpult	rl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, rh
+	lda	ap, -56(ap)
+	br	L(com)
+
+L(b0):	ldq	r21, -24(ap)
+	ldq	r22, -16(ap)
+	ldq	r23, -8(ap)
+	ldq	r20, -32(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	mulq	r23, B3modb, r10
+	umulh	r23, B3modb, r27
+	addq	r8, r20, pl
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, ph
+	addq	r10, pl, rl
+	cmpult	rl, r10, r0
+	addq	r27, ph, ph
+	addq	r0, ph, rh
+	lda	ap, -64(ap)
+	br	L(com)
+
+L(b1):	bis	r31, r31, rh
+	ldq	rl, -8(ap)
+	lda	ap, -40(ap)
+	br	L(com)
+
+L(b2):	ldq	rh, -8(ap)
+	ldq	rl, -16(ap)
+	lda	ap, -48(ap)
+
+L(com):	ble	n, L(ed3)
+	ldq	r21, 8(ap)
+	ldq	r22, 16(ap)
+	ldq	r23, 24(ap)
+	ldq	r20, 0(ap)
+	lda	n, -4(n)
+	lda	ap, -32(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	mulq	r23, B3modb, r10
+	umulh	r23, B3modb, r27
+	mulq	rl, B4modb, r11
+	umulh	rl, B4modb, r28
+	ble	n, L(ed2)
+
+	ALIGN(16)
+L(top):	ldq	r21, 8(ap)
+	mulq	rh, B5modb, rl
+	addq	r8, r20, pl
+	ldq	r22, 16(ap)
+	cmpult	pl, r8, r0
+	umulh	rh, B5modb, rh
+	ldq	r23, 24(ap)
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	mulq	r21, B1modb, r8
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	umulh	r21, B1modb, r12
+	lda	ap, -32(ap)
+	addq	r0, ph, ph
+	addq	r10, pl, pl
+	mulq	r22, B2modb, r9
+	cmpult	pl, r10, r0
+	addq	r27, ph, ph
+	addq	r11, pl, pl
+	umulh	r22, B2modb, r13
+	addq	r0, ph, ph
+	cmpult	pl, r11, r0
+	addq	r28, ph, ph
+	mulq	r23, B3modb, r10
+	ldq	r20, 32(ap)
+	addq	pl, rl, rl
+	umulh	r23, B3modb, r27
+	addq	r0, ph, ph
+	cmpult	rl, pl, r0
+	mulq	rl, B4modb, r11
+	addq	ph, rh, rh
+	umulh	rl, B4modb, r28
+	addq	r0, rh, rh
+	lda	n, -4(n)
+	bgt	n, L(top)
+
+L(ed2):	mulq	rh, B5modb, rl
+	addq	r8, r20, pl
+	umulh	rh, B5modb, rh
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, ph
+	addq	r10, pl, pl
+	cmpult	pl, r10, r0
+	addq	r27, ph, ph
+	addq	r11, pl, pl
+	addq	r0, ph, ph
+	cmpult	pl, r11, r0
+	addq	r28, ph, ph
+	addq	pl, rl, rl
+	addq	r0, ph, ph
+	cmpult	rl, pl, r0
+	addq	ph, rh, rh
+	addq	r0, rh, rh
+
+L(ed3):	mulq	rh, B1modb, r8
+	umulh	rh, B1modb, rh
+	addq	r8, rl, rl
+	cmpult	rl, r8, r0
+	addq	r0, rh, rh
+
+	ldq	r24, 8(r19)		C cnt
+	sll	rh, r24, rh
+	subq	r31, r24, r25
+	srl	rl, r25, r2
+	sll	rl, r24, rl
+	or	r2, rh, rh
+
+	ldq	r23, 0(r19)		C bi
+	mulq	rh, r23, r8
+	umulh	rh, r23, r9
+	addq	rh, 1, r7
+	addq	r8, rl, r8		C ql
+	cmpult	r8, rl, r0
+	addq	r9, r7, r9
+	addq	r0, r9, r9		C qh
+	mulq	r9, r18, r21		C qh * b
+	subq	rl, r21, rl
+	cmpult	r8, rl, r0		C rl > ql
+	negq	r0, r0
+	and	r0, r18, r0
+	addq	rl, r0, rl
+	cmpule	r18, rl, r0		C rl >= b
+	negq	r0, r0
+	and	r0, r18, r0
+	subq	rl, r0, rl
+
+	srl	rl, r24, r0
+
+	ldq	r9, 8(r30)
+	ldq	r10, 16(r30)
+	ldq	r11, 24(r30)
+	ldq	r12, 32(r30)
+	ldq	r13, 40(r30)
+	lda	r30, 64(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
+
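+C  The cps routine below appears to compute, into cp[0..6]: the inverse of
+C  the normalized divisor (via mpn_invert_limb), the normalization shift
+C  count, and B^1..B^5 mod b in normalized form, as consumed by
+C  mpn_mod_1s_4p above.
+C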
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+	lda	r30, -32(r30)
+	stq	r26, 0(r30)
+	stq	r9, 8(r30)
+	stq	r10, 16(r30)
+	stq	r11, 24(r30)
+	mov	r16, r11
+	LEA(	r4, __clz_tab)
+	lda	r10, 65(r31)
+	cmpbge	r31, r17, r1
+	srl	r1, 1, r1
+	xor	r1, 127, r1
+	addq	r1, r4, r1
+	ldq_u	r2, 0(r1)
+	extbl	r2, r1, r2
+	s8subq	r2, 7, r2
+	srl	r17, r2, r3
+	subq	r10, r2, r10
+	addq	r3, r4, r3
+	ldq_u	r1, 0(r3)
+	extbl	r1, r3, r1
+	subq	r10, r1, r10
+	sll	r17, r10, r9
+	mov	r9, r16
+	jsr	r26, mpn_invert_limb
+	LDGP(	r29, 0(r26))
+	subq	r31, r10, r2
+	lda	r1, 1(r31)
+	sll	r1, r10, r1
+	subq	r31, r9, r3
+	srl	r0, r2, r2
+	ldq	r26, 0(r30)
+	bis	r2, r1, r2
+	stq	r0, 0(r11)
+	stq	r10, 8(r11)
+	mulq	r2, r3, r2
+	srl	r2, r10, r3
+	umulh	r2, r0, r1
+	stq	r3, 16(r11)
+	mulq	r2, r0, r3
+	ornot	r31, r1, r1
+	subq	r1, r2, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r2
+	cmpule	r1, r3, r3
+	cmoveq	r3, r2, r1
+	srl	r1, r10, r3
+	umulh	r1, r0, r2
+	stq	r3, 24(r11)
+	mulq	r1, r0, r3
+	ornot	r31, r2, r2
+	subq	r2, r1, r2
+	mulq	r2, r9, r2
+	addq	r2, r9, r1
+	cmpule	r2, r3, r3
+	cmoveq	r3, r1, r2
+	srl	r2, r10, r1
+	umulh	r2, r0, r3
+	stq	r1, 32(r11)
+	mulq	r2, r0, r1
+	ornot	r31, r3, r3
+	subq	r3, r2, r3
+	mulq	r3, r9, r3
+	addq	r3, r9, r2
+	cmpule	r3, r1, r1
+	cmoveq	r1, r2, r3
+	srl	r3, r10, r2
+	umulh	r3, r0, r1
+	stq	r2, 40(r11)
+	mulq	r3, r0, r0
+	ornot	r31, r1, r1
+	subq	r1, r3, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r9
+	cmpule	r1, r0, r0
+	cmoveq	r0, r9, r1
+	ldq	r9, 8(r30)
+	srl	r1, r10, r1
+	ldq	r10, 16(r30)
+	stq	r1, 48(r11)
+	ldq	r11, 24(r30)
+	lda	r30, 32(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
diff --git a/third_party/gmp/mpn/alpha/ev6/mul_1.asm b/third_party/gmp/mpn/alpha/ev6/mul_1.asm
new file mode 100644
index 0000000..8ee19cd
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/mul_1.asm
@@ -0,0 +1,496 @@
+dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl  result in a second limb vector.
+
+dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r16
+C s1_ptr	r17
+C size		r18
+C s2_limb	r19
+
+C This code runs at 2.25 cycles/limb on EV6.
+
+C This code was written in close cooperation with ev6 pipeline expert
+C Steve Root.  Any errors are tege's fault, though.
+
+C Code structure:
+
+C  code for n < 8
+C  code for n > 8	code for (n mod 8)
+C			code for (n div 8)	feed-in code
+C						8-way unrolled loop
+C						wind-down code
+
+C Some notes about unrolled loop:
+C
+C   r1-r8     multiplies and workup
+C   r21-r28   multiplies and workup
+C   r9-r12    loads
+C   r0       -1
+C   r20,r29,r13-r15  scramble
+C
+C   We're doing 7 of the 8 carry propagations with br fixup code and 1 with a
+C   put-the-carry-into-hi.  The idea is that these branches are very rarely
+C   taken, and since a non-taken branch consumes no resources, that is better
+C   than an addq.
+C
+C   Software pipeline: a load in cycle #09 feeds a mul in cycle #16, which
+C   feeds an add in the NEXT cycle #09, which feeds a store in the NEXT
+C   cycle #02.
+
+C The code could use some further work:
+C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
+C      faster than this for size < 3.
+C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
+C      that is too costly.
+C   3. Consider using 4-way unrolling, even if that runs slower.
+C   4. Reduce register usage.  In particular, try to avoid using r29.
+
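+C  A minimal C model of the overall computation (an illustrative sketch, not
+C  upstream code), ignoring the scheduling described above:
+C
+C      cy = 0;
+C      for (i = 0; i < size; i++) {
+C        lo = low 64 bits of s1[i] * s2_limb;   hi = high 64 bits;
+C        res[i] = lo + cy;
+C        cy = hi + (res[i] < lo);    /* carry from the add */
+C      }
+C      return cy;                    /* becomes cy_limb in r0 */
+C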
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	cmpult	r18,	8,	r1
+	beq	r1,	$Large
+$Lsmall:
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	bic	r31,r31,r4	C clear cy_limb
+	umulh	r2,r19,r0	C r0 = prod_high
+	beq	r18,$Le1a	C jump if size was == 1
+	ldq	r2,8(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	stq	r3,0(r16)
+	beq	r18,$Le2a	C jump if size was == 2
+	ALIGN(8)
+$Lopa:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	lda	r18,-1(r18)	C size--
+	umulh	r2,r19,r4	C r4 = cy_limb
+	ldq	r2,16(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	stq	r3,8(r16)
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	lda	r16,8(r16)	C res_ptr++
+	bne	r18,$Lopa
+
+$Le2a:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r4	C r4 = cy_limb
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	stq	r3,8(r16)
+	addq	r4,r0,r0	C cy_limb = prod_high + cy
+	ret	r31,(r26),1
+$Le1a:	stq	r3,0(r16)
+	ret	r31,(r26),1
+
+$Large:
+	lda	r30,	-224(r30)
+	stq	r26,	0(r30)
+	stq	r9,	8(r30)
+	stq	r10,	16(r30)
+	stq	r11,	24(r30)
+	stq	r12,	32(r30)
+	stq	r13,	40(r30)
+	stq	r14,	48(r30)
+	stq	r15,	56(r30)
+	stq	r29,	64(r30)
+
+	and	r18,	7,	r20	C count for the first loop, 0-7
+	srl	r18,	3,	r18	C count for unrolled loop
+	bis	r31,	r31,	r21
+	beq	r20,	$L_8_or_more	C skip first loop
+
+$L_9_or_more:
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	lda	r20,-1(r20)	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	umulh	r2,r19,r21	C r21 = prod_high
+	beq	r20,$Le1b	C jump if size was == 1
+	bis	r31, r31, r0	C FIXME: shouldn't need this
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	lda	r20,-1(r20)	C size--
+	stq	r3,0(r16)
+	lda	r16,8(r16)	C res_ptr++
+	beq	r20,$Le2b	C jump if size was == 2
+	ALIGN(8)
+$Lopb:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
+	lda	r20,-1(r20)	C size--
+	umulh	r2,r19,r21	C r21 = prod_high
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	stq	r3,0(r16)
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	lda	r16,8(r16)	C res_ptr++
+	bne	r20,$Lopb
+
+$Le2b:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r21,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r21	C r21 = prod_high
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	stq	r3,0(r16)
+	lda	r16,8(r16)	C res_ptr++
+	addq	r21,r0,r21	C cy_limb = prod_high + cy
+	br	r31,	$L_8_or_more
+$Le1b:	stq	r3,0(r16)
+	lda	r16,8(r16)	C res_ptr++
+
+$L_8_or_more:
+	lda	r0,	-1(r31)		C put -1 in r0, for tricky loop control
+	lda	r17,	-32(r17)	C L1 bookkeeping
+	lda	r18,	-1(r18)		C decrement count
+
+	ldq	r9,	32(r17)		C L1
+	ldq	r10,	40(r17)		C L1
+	mulq	r9,	r19,	r22	C U1 #07
+	ldq	r11,	48(r17)		C L1
+	umulh	r9,	r19,	r23	C U1 #08
+	ldq	r12,	56(r17)		C L1
+	mulq	r10,	r19,	r24	C U1 #09
+	ldq	r9,	64(r17)		C L1
+
+	lda	r17,	64(r17)		C L1 bookkeeping
+
+	umulh	r10,	r19,	r25	C U1 #11
+	mulq	r11,	r19,	r26	C U1 #12
+	umulh	r11,	r19,	r27	C U1 #13
+	mulq	r12,	r19,	r28	C U1 #14
+	ldq	r10,	8(r17)		C L1
+	umulh	r12,	r19,	r1	C U1 #15
+	ldq	r11,	16(r17)		C L1
+	mulq	r9,	r19,	r2	C U1 #16
+	ldq	r12,	24(r17)		C L1
+	umulh	r9,	r19,	r3	C U1 #17
+	addq	r21,	r22,	r13	C L1 mov
+	mulq	r10,	r19,	r4	C U1 #18
+	addq	r23,	r24,	r22	C L0 sum 2 mul's
+	cmpult	r13,	r21,	r14	C L1 carry from sum
+	bgt	r18,	$L_16_or_more
+
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	umulh	r11,	r19,	r7	C U1 #04
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	32(r16)		C L1 bookkeeping
+	addq	r13,	r31,	r13	C U0 start carry cascade
+	umulh	r12,	r19,	r21	C U1 #06
+	br	r31,	$ret0c
+
+$L_16_or_more:
+C ---------------------------------------------------------------
+	subq	r18,1,r18
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+	ldq	r9,	32(r17)		C L1
+
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	umulh	r11,	r19,	r7	C U1 #04
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	32(r16)		C L1 bookkeeping
+	addq	r13,	r31,	r13	C U0 start carry cascade
+
+	umulh	r12,	r19,	r21	C U1 #06
+C	beq	r13,	$fix0w		C U0
+$ret0w:	addq	r22,	r14,	r26	C L0
+	ldq	r10,	40(r17)		C L1
+
+	mulq	r9,	r19,	r22	C U1 #07
+	beq	r26,	$fix1w		C U0
+$ret1w:	addq	r23,	r24,	r27	C L0
+	ldq	r11,	48(r17)		C L1
+
+	umulh	r9,	r19,	r23	C U1 #08
+	beq	r27,	$fix2w		C U0
+$ret2w:	addq	r28,	r25,	r28	C L0
+	ldq	r12,	56(r17)		C L1
+
+	mulq	r10,	r19,	r24	C U1 #09
+	beq	r28,	$fix3w		C U0
+$ret3w:	addq	r1,	r2,	r20	C L0 sum 2 mul's
+	ldq	r9,	64(r17)		C L1
+
+	addq	r3,	r4,	r2	C L0 #10 2 mul's
+	lda	r17,	64(r17)		C L1 bookkeeping
+	cmpult	r20,	r1,	r29	C U0 carry from sum
+
+	umulh	r10,	r19,	r25	C U1 #11
+	cmpult	r2,	r4,	r4	C U0 carry from sum
+	stq	r13,	-32(r16)	C L0
+	stq	r26,	-24(r16)	C L1
+
+	mulq	r11,	r19,	r26	C U1 #12
+	addq	r5,	r6,	r14	C U0 sum 2 mul's
+	stq	r27,	-16(r16)	C L0
+	stq	r28,	-8(r16)		C L1
+
+	umulh	r11,	r19,	r27	C U1 #13
+	cmpult	r14,	r6,	r3	C U0 carry from sum
+C could do cross-jumping here:
+C	bra	$L_middle_of_unrolled_loop
+	mulq	r12,	r19,	r28	C U1 #14
+	addq	r7,	r3,	r5	C L0 eat carry
+	addq	r20,	r15,	r20	C U0 carry cascade
+	ldq	r10,	8(r17)		C L1
+
+	umulh	r12,	r19,	r1	C U1 #15
+	beq	r20,	$fix4		C U0
+$ret4w:	addq	r2,	r29,	r6	C L0
+	ldq	r11,	16(r17)		C L1
+
+	mulq	r9,	r19,	r2	C U1 #16
+	beq	r6,	$fix5		C U0
+$ret5w:	addq	r14,	r4,	r7	C L0
+	ldq	r12,	24(r17)		C L1
+
+	umulh	r9,	r19,	r3	C U1 #17
+	beq	r7,	$fix6		C U0
+$ret6w:	addq	r5,	r8,	r8	C L0 sum 2
+	addq	r21,	r22,	r13	C L1 sum 2 mul's
+
+	mulq	r10,	r19,	r4	C U1 #18
+	addq	r23,	r24,	r22	C L0 sum 2 mul's
+	cmpult	r13,	r21,	r14	C L1 carry from sum
+	ble	r18,	$Lend		C U0
+C ---------------------------------------------------------------
+	ALIGN(16)
+$Loop:
+	umulh	r0,	r18,	r18	C U1 #01 decrement r18!
+	cmpult	r8,	r5,	r29	C L0 carry from last bunch
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+	ldq	r9,	32(r17)		C L1
+
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	stq	r20,	0(r16)		C L0
+	stq	r6,	8(r16)		C L1
+
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	stq	r7,	16(r16)		C L0
+	stq	r8,	24(r16)		C L1
+
+	umulh	r11,	r19,	r7	C U1 #04
+	bis	r31,	r31,	r31	C L0 st slosh
+	bis	r31,	r31,	r31	C L1 st slosh
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	64(r16)		C L1 bookkeeping
+	addq	r13,	r29,	r13	C U0 start carry cascade
+
+	umulh	r12,	r19,	r21	C U1 #06
+	beq	r13,	$fix0		C U0
+$ret0:	addq	r22,	r14,	r26	C L0
+	ldq	r10,	40(r17)		C L1
+
+	mulq	r9,	r19,	r22	C U1 #07
+	beq	r26,	$fix1		C U0
+$ret1:	addq	r23,	r24,	r27	C L0
+	ldq	r11,	48(r17)		C L1
+
+	umulh	r9,	r19,	r23	C U1 #08
+	beq	r27,	$fix2		C U0
+$ret2:	addq	r28,	r25,	r28	C L0
+	ldq	r12,	56(r17)		C L1
+
+	mulq	r10,	r19,	r24	C U1 #09
+	beq	r28,	$fix3		C U0
+$ret3:	addq	r1,	r2,	r20	C L0 sum 2 mul's
+	ldq	r9,	64(r17)		C L1
+
+	addq	r3,	r4,	r2	C L0 #10 2 mul's
+	bis	r31,	r31,	r31	C U1 mul hole
+	lda	r17,	64(r17)		C L1 bookkeeping
+	cmpult	r20,	r1,	r29	C U0 carry from sum
+
+	umulh	r10,	r19,	r25	C U1 #11
+	cmpult	r2,	r4,	r4	C U0 carry from sum
+	stq	r13,	-32(r16)	C L0
+	stq	r26,	-24(r16)	C L1
+
+	mulq	r11,	r19,	r26	C U1 #12
+	addq	r5,	r6,	r14	C U0 sum 2 mul's
+	stq	r27,	-16(r16)	C L0
+	stq	r28,	-8(r16)		C L1
+
+	umulh	r11,	r19,	r27	C U1 #13
+	bis	r31,	r31,	r31	C L0 st slosh
+	bis	r31,	r31,	r31	C L1 st slosh
+	cmpult	r14,	r6,	r3	C U0 carry from sum
+$L_middle_of_unrolled_loop:
+	mulq	r12,	r19,	r28	C U1 #14
+	addq	r7,	r3,	r5	C L0 eat carry
+	addq	r20,	r15,	r20	C U0 carry cascade
+	ldq	r10,	8(r17)		C L1
+
+	umulh	r12,	r19,	r1	C U1 #15
+	beq	r20,	$fix4		C U0
+$ret4:	addq	r2,	r29,	r6	C L0
+	ldq	r11,	16(r17)		C L1
+
+	mulq	r9,	r19,	r2	C U1 #16
+	beq	r6,	$fix5		C U0
+$ret5:	addq	r14,	r4,	r7	C L0
+	ldq	r12,	24(r17)		C L1
+
+	umulh	r9,	r19,	r3	C U1 #17
+	beq	r7,	$fix6		C U0
+$ret6:	addq	r5,	r8,	r8	C L0 sum 2
+	addq	r21,	r22,	r13	C L1 sum 2 mul's
+
+	mulq	r10,	r19,	r4	C U1 #18
+	addq	r23,	r24,	r22	C L0 sum 2 mul's
+	cmpult	r13,	r21,	r14	C L1 carry from sum
+	bgt	r18,	$Loop		C U0
+C ---------------------------------------------------------------
+$Lend:
+	cmpult	r8,	r5,	r29	C L0 carry from last bunch
+	cmpult	r22,	r24,	r24	C U0 carry from sum
+
+	umulh	r10,	r19,	r5	C U1 #02
+	addq	r25,	r26,	r23	C U0 sum 2 mul's
+	stq	r20,	0(r16)		C L0
+	stq	r6,	8(r16)		C L1
+
+	mulq	r11,	r19,	r6	C U1 #03
+	cmpult	r23,	r26,	r25	C U0 carry from sum
+	stq	r7,	16(r16)		C L0
+	stq	r8,	24(r16)		C L1
+
+	umulh	r11,	r19,	r7	C U1 #04
+	addq	r27,	r28,	r28	C U0 sum 2 mul's
+
+	mulq	r12,	r19,	r8	C U1 #05
+	cmpult	r28,	r27,	r15	C L0 carry from sum
+	lda	r16,	64(r16)		C L1 bookkeeping
+	addq	r13,	r29,	r13	C U0 start carry cascade
+
+	umulh	r12,	r19,	r21	C U1 #06
+	beq	r13,	$fix0c		C U0
+$ret0c:	addq	r22,	r14,	r26	C L0
+	beq	r26,	$fix1c		C U0
+$ret1c:	addq	r23,	r24,	r27	C L0
+	beq	r27,	$fix2c		C U0
+$ret2c:	addq	r28,	r25,	r28	C L0
+	beq	r28,	$fix3c		C U0
+$ret3c:	addq	r1,	r2,	r20	C L0 sum 2 mul's
+	addq	r3,	r4,	r2	C L0 #10 2 mul's
+	lda	r17,	64(r17)		C L1 bookkeeping
+	cmpult	r20,	r1,	r29	C U0 carry from sum
+	cmpult	r2,	r4,	r4	C U0 carry from sum
+	stq	r13,	-32(r16)	C L0
+	stq	r26,	-24(r16)	C L1
+	addq	r5,	r6,	r14	C U0 sum 2 mul's
+	stq	r27,	-16(r16)	C L0
+	stq	r28,	-8(r16)		C L1
+	cmpult	r14,	r6,	r3	C U0 carry from sum
+	addq	r7,	r3,	r5	C L0 eat carry
+	addq	r20,	r15,	r20	C U0 carry cascade
+	beq	r20,	$fix4c		C U0
+$ret4c:	addq	r2,	r29,	r6	C L0
+	beq	r6,	$fix5c		C U0
+$ret5c:	addq	r14,	r4,	r7	C L0
+	beq	r7,	$fix6c		C U0
+$ret6c:	addq	r5,	r8,	r8	C L0 sum 2
+	cmpult	r8,	r5,	r29	C L0 carry from last bunch
+	stq	r20,	0(r16)		C L0
+	stq	r6,	8(r16)		C L1
+	stq	r7,	16(r16)		C L0
+	stq	r8,	24(r16)		C L1
+	addq	r29,	r21,	r0
+
+	ldq	r26,	0(r30)
+	ldq	r9,	8(r30)
+	ldq	r10,	16(r30)
+	ldq	r11,	24(r30)
+	ldq	r12,	32(r30)
+	ldq	r13,	40(r30)
+	ldq	r14,	48(r30)
+	ldq	r15,	56(r30)
+	ldq	r29,	64(r30)
+	lda	r30,	224(r30)
+	ret	r31,	(r26),	1
+
+C $fix0w:	bis	r14,	r29,	r14	C join carries
+C	br	r31,	$ret0w
+$fix1w:	bis	r24,	r14,	r24	C join carries
+	br	r31,	$ret1w
+$fix2w:	bis	r25,	r24,	r25	C join carries
+	br	r31,	$ret2w
+$fix3w:	bis	r15,	r25,	r15	C join carries
+	br	r31,	$ret3w
+$fix0:	bis	r14,	r29,	r14	C join carries
+	br	r31,	$ret0
+$fix1:	bis	r24,	r14,	r24	C join carries
+	br	r31,	$ret1
+$fix2:	bis	r25,	r24,	r25	C join carries
+	br	r31,	$ret2
+$fix3:	bis	r15,	r25,	r15	C join carries
+	br	r31,	$ret3
+$fix4:	bis	r29,	r15,	r29	C join carries
+	br	r31,	$ret4
+$fix5:	bis	r4,	r29,	r4	C join carries
+	br	r31,	$ret5
+$fix6:	addq	r5,	r4,	r5	C can't carry twice!
+	br	r31,	$ret6
+$fix0c:	bis	r14,	r29,	r14	C join carries
+	br	r31,	$ret0c
+$fix1c:	bis	r24,	r14,	r24	C join carries
+	br	r31,	$ret1c
+$fix2c:	bis	r25,	r24,	r25	C join carries
+	br	r31,	$ret2c
+$fix3c:	bis	r15,	r25,	r15	C join carries
+	br	r31,	$ret3c
+$fix4c:	bis	r29,	r15,	r29	C join carries
+	br	r31,	$ret4c
+$fix5c:	bis	r4,	r29,	r4	C join carries
+	br	r31,	$ret5c
+$fix6c:	addq	r5,	r4,	r5	C can't carry twice!
+	br	r31,	$ret6c
+
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/README b/third_party/gmp/mpn/alpha/ev6/nails/README
new file mode 100644
index 0000000..b214ac5
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/README
@@ -0,0 +1,65 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains assembly code for nails-enabled 21264.  The code is not
+very well optimized.
+
+For addmul_N, as N grows larger, we could issue multiple loads together and
+then run at about 3.3 i/c (instructions per cycle).  Ten cycles after the
+last load, we can increase to 4 i/c.  This would surely allow addmul_4 to run
+at 2 c/l, but the same should also be possible for addmul_3 and perhaps even
+addmul_2.
+
+
+		current		fair		best
+Routine		c/l  unroll	c/l  unroll	c/l  i/c
+mul_1		3.25		2.75		2.75 3.273
+addmul_1	4.0	4	3.5	4 14	3.25 3.385
+addmul_2	4.0	1	2.5	2 10	2.25 3.333
+addmul_3	3.0	1	2.33	2 14	2    3.333
+addmul_4	2.5	1	2.125	2 17	2    3.135
+
+addmul_5			2	1 10
+addmul_6			2	1 12
+addmul_7			2	1 14
+
+(The "best" column doesn't account for bookkeeping instructions and
+thereby assumes infinite unrolling.)
+
+Basecase usages:
+
+1	 addmul_1
+2	 addmul_2
+3	 addmul_3
+4	 addmul_4
+5	 addmul_3 + addmul_2	2.3998
+6	 addmul_4 + addmul_2
+7	 addmul_4 + addmul_3
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/addmul_1.asm b/third_party/gmp/mpn/alpha/ev6/nails/addmul_1.asm
new file mode 100644
index 0000000..711d4e6
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/addmul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_addmul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points; that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
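+C  A scalar model of the nails scheme used below (an illustrative sketch,
+C  not upstream code).  Pre-shifting vl0 left by NAIL_BITS aligns the umulh
+C  result to numb positions:
+C
+C      v = vl0 << NAIL_BITS;  cy = 0;
+C      for (i = 0; i < n; i++) {
+C        lo = low 64 bits of up[i] * v;   hi = high 64 bits;
+C        acc = rp[i] + (lo >> NAIL_BITS) + cy;
+C        rp[i] = acc & numb_mask;
+C        cy = (acc >> NUMB_BITS) + hi;
+C      }
+C      return cy;
+C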
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	rl1,	t0,	acc1
+	and	acc1,numb_mask,	r28
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	addq	t1,	m1b,	r0
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	addq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	addq	rl1,	acc1,	acc1		C U0
+	ldq	rl2,	0(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	addq	rl2,	acc0,	acc0		C U0
+	ldq	rl3,	8(rp)			C L1
+C
+	lda	n,	-4(n)			C L1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	addq	rl3,	acc1,	acc1		C U0
+	ldq	rl0,	16(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	addq	rl0,	acc0,	acc0		C U0
+	ldq	rl1,	24(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	addq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	addq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	addq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	addq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	addq	rl1,	acc1,	acc1
+	addq	t1,	acc1,	acc1
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	addq	t1,	m1b,	r0
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/addmul_2.asm b/third_party/gmp/mpn/alpha/ev6/nails/addmul_2.asm
new file mode 100644
index 0000000..6ff6b3a
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/addmul_2.asm
@@ -0,0 +1,146 @@
+dnl  Alpha ev6 nails mpn_addmul_2.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 4.0 cycles/limb.
+
+C We could either go for 2-way unrolling over 11 cycles, for 2.75 c/l,
+C or 4-way unrolling over 20 cycles, for 2.5 c/l.
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C  Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+
+define(`v0',`r6')
+define(`v1',`r7')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
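+C  Rough sketch of one L(top) iteration (annotation added here, not in
+C  upstream GMP); loJ/hiJ are the mulq/umulh halves of vJ*u[i]:
+C
+C      t     = nail + acc0 + (lo0 >> NAIL_BITS) + rp[i];
+C      rp[i] = t & numb_mask;     nail = t >> NUMB_BITS;
+C      acc0  = hi0 + acc1 + (lo1 >> NAIL_BITS);
+C      acc1  = hi1;
+C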
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	r19
+
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	beq	n,	L(end)			C U0
+
+	ALIGN(16)
+L(top):	bis	r31,	r31,	r31		C U1	nop
+	addq	r19,	acc0,	acc0		C U0	propagate nail
+	ldq	rlimb,	0(rp)			C L0
+	ldq	ulimb,	0(up)			C L1
+
+	lda	rp,	8(rp)			C L1
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L0
+	mulq	v0,	ulimb,	m0a		C U1
+
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L1
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L0	nop
+
+	addq	rlimb,	r19,	r19		C L1	FINAL PROD-SUM
+	srl	m1a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L0
+	mulq	v1,	ulimb,	m1a		C U1
+
+	addq	r8,	acc0,	acc0		C U0
+	bis	r31,	m1b,	acc1		C L1
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L0	extract numb part
+
+	unop
+	srl	r19,NUMB_BITS,	r19		C U1	extract nail part
+	stq	r28,	-8(rp)			C L1
+	bne	n,	L(top)			C U0
+
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	bis	r31,	m1b,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	stq	r28,	-8(rp)
+
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	r0
+
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/addmul_3.asm b/third_party/gmp/mpn/alpha/ev6/nails/addmul_3.asm
new file mode 100644
index 0000000..a1ffb68
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/addmul_3.asm
@@ -0,0 +1,169 @@
+dnl  Alpha ev6 nails mpn_addmul_3.
+
+dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 3.0 cycles/limb.
+
+C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C  Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(3-63)
+
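+C  The accumulator rotation generalizes the addmul_2 scheme by one more
+C  column (annotation added here, not in upstream GMP): per limb,
+C      acc0 = hi0 + acc1 + (lo1 >> NAIL_BITS);
+C      acc1 = hi1 + acc2 + (lo2 >> NAIL_BITS);
+C      acc2 = hi2;
+C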
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+	ldq	v2,	16(vp)
+
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	acc2		C	zero acc2
+	sll	v2,NAIL_BITS,	v2
+	bis	r31,	r31,	r19
+
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	mulq	v2,	ulimb,	m2a		C U1
+	umulh	v2,	ulimb,	m2b		C U1
+	beq	n,	L(end)			C U0
+
+	ALIGN(16)
+L(top):	ldq	rlimb,	0(rp)			C L1
+	ldq	ulimb,	0(up)			C L0
+	bis	r31,	r31,	r31		C U0	nop
+	addq	r19,	acc0,	acc0		C U1	propagate nail
+
+	lda	rp,	8(rp)			C L1
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L0
+	mulq	v0,	ulimb,	m0a		C U1
+
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L1
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L0	nop
+
+	addq	rlimb,	r19,	r19		C L1
+	srl	m1a,NAIL_BITS,	r8		C U0
+	bis	r31,	r31,	r31		C L0	nop
+	mulq	v1,	ulimb,	m1a		C U1
+
+	addq	r8,	acc0,	acc0		C U0
+	addq	m1b,	acc2,	acc1		C L1
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L0	extract numb part
+
+	bis	r31,	r31,	r31		C L1	nop
+	srl	m2a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L0
+	mulq	v2,	ulimb,	m2a		C U1
+
+	addq	r8,	acc1,	acc1		C L0
+	bis	r31,	m2b,	acc2		C L1
+	umulh	v2,	ulimb,	m2b		C U1
+	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
+
+	stq	r28,	-8(rp)			C L
+	bne	n,	L(top)			C U0
+
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	addq	m1b,	acc2,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+	srl	m2a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc1,	acc1
+	bis	r31,	m2b,	acc2
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	stq	r28,	-8(rp)
+
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	acc1
+
+	and	acc1,numb_mask,	r28
+	stq	r28,	8(rp)
+	srl	acc1,NUMB_BITS,	r19
+	addq	r19,	acc2,	m0a
+
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/addmul_4.asm b/third_party/gmp/mpn/alpha/ev6/nails/addmul_4.asm
new file mode 100644
index 0000000..77e02a4
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/addmul_4.asm
@@ -0,0 +1,210 @@
+dnl  Alpha ev6 nails mpn_addmul_4.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Runs at 2.5 cycles/limb.
+
+C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
+C to 3.24 insn/cycle.
+
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n',`r18')
+define(`vp',`r19')
+
+C  Useful register aliases
+define(`numb_mask',`r24')
+define(`ulimb',`r25')
+define(`rlimb',`r27')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r12')
+define(`m3b',`r13')
+
+define(`acc0',`r4')
+define(`acc1',`r5')
+define(`acc2',`r22')
+define(`acc3',`r14')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`v2',`r23')
+define(`v3',`r15')
+
+C Used for temps: r8 r19 r28
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+C  This declaration is munged by configure
+NAILS_SUPPORT(4-63)
+
+ASM_START()
+PROLOGUE(mpn_addmul_4)
+	lda	r30,	-240(r30)
+	stq	r12,	32(r30)
+	stq	r13,	40(r30)
+	stq	r14,	48(r30)
+	stq	r15,	56(r30)
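+C  r12-r15 are callee-saved in the Alpha calling convention, hence the
+C  stack frame above (restored just before the final ret).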
+
+	lda	numb_mask,-1(r31)
+	srl	numb_mask,NAIL_BITS,numb_mask
+
+	ldq	v0,	0(vp)
+	ldq	v1,	8(vp)
+	ldq	v2,	16(vp)
+	ldq	v3,	24(vp)
+
+	bis	r31,	r31,	acc0		C	zero acc0
+	sll	v0,NAIL_BITS,	v0
+	bis	r31,	r31,	acc1		C	zero acc1
+	sll	v1,NAIL_BITS,	v1
+	bis	r31,	r31,	acc2		C	zero acc2
+	sll	v2,NAIL_BITS,	v2
+	bis	r31,	r31,	acc3		C	zero acc3
+	sll	v3,NAIL_BITS,	v3
+	bis	r31,	r31,	r19
+
+	ldq	ulimb,	0(up)
+	lda	up,	8(up)
+	mulq	v0,	ulimb,	m0a		C U1
+	umulh	v0,	ulimb,	m0b		C U1
+	mulq	v1,	ulimb,	m1a		C U1
+	umulh	v1,	ulimb,	m1b		C U1
+	lda	n,	-1(n)
+	mulq	v2,	ulimb,	m2a		C U1
+	umulh	v2,	ulimb,	m2b		C U1
+	mulq	v3,	ulimb,	m3a		C U1
+	umulh	v3,	ulimb,	m3b		C U1
+	beq	n,	L(end)			C U0
+
+	ALIGN(16)
+L(top):	bis	r31,	r31,	r31		C U1	nop
+	ldq	rlimb,	0(rp)			C L0
+	ldq	ulimb,	0(up)			C L1
+	addq	r19,	acc0,	acc0		C U0	propagate nail
+
+	bis	r31,	r31,	r31		C L0	nop
+	bis	r31,	r31,	r31		C U1	nop
+	bis	r31,	r31,	r31		C L1	nop
+	bis	r31,	r31,	r31		C U0	nop
+
+	lda	rp,	8(rp)			C L0
+	srl	m0a,NAIL_BITS,	r8		C U0
+	lda	up,	8(up)			C L1
+	mulq	v0,	ulimb,	m0a		C U1
+
+	addq	r8,	acc0,	r19		C U0
+	addq	m0b,	acc1,	acc0		C L0
+	umulh	v0,	ulimb,	m0b		C U1
+	bis	r31,	r31,	r31		C L1	nop
+
+	addq	rlimb,	r19,	r19		C L0
+	srl	m1a,NAIL_BITS,	r8		C U0
+	bis	r31,	r31,	r31		C L1	nop
+	mulq	v1,	ulimb,	m1a		C U1
+
+	addq	r8,	acc0,	acc0		C U0
+	addq	m1b,	acc2,	acc1		C L0
+	umulh	v1,	ulimb,	m1b		C U1
+	and	r19,numb_mask,	r28		C L1	extract numb part
+
+	bis	r31,	r31,	r31		C L0	nop
+	srl	m2a,NAIL_BITS,	r8		C U0
+	lda	n,	-1(n)			C L1
+	mulq	v2,	ulimb,	m2a		C U1
+
+	addq	r8,	acc1,	acc1		C L1
+	addq	m2b,	acc3,	acc2		C L0
+	umulh	v2,	ulimb,	m2b		C U1
+	srl	r19,NUMB_BITS,	r19		C U0	extract nail part
+
+	bis	r31,	r31,	r31		C L0	nop
+	srl	m3a,NAIL_BITS,	r8		C U0
+	stq	r28,	-8(rp)			C L1
+	mulq	v3,	ulimb,	m3a		C U1
+
+	addq	r8,	acc2,	acc2		C L0
+	bis	r31,	m3b,	acc3		C L1
+	umulh	v3,	ulimb,	m3b		C U1
+	bne	n,	L(top)			C U0
+
+L(end):	ldq	rlimb,	0(rp)
+	addq	r19,	acc0,	acc0		C	propagate nail
+	lda	rp,	8(rp)			C FIXME: DELETE
+	srl	m0a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	r19
+	addq	m0b,	acc1,	acc0
+	addq	rlimb,	r19,	r19
+	srl	m1a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc0,	acc0
+	addq	m1b,	acc2,	acc1
+	and	r19,numb_mask,	r28		C extract limb
+	srl	m2a,NAIL_BITS,	r8		C U0
+	addq	r8,	acc1,	acc1
+	addq	m2b,	acc3,	acc2
+	srl	r19,NUMB_BITS,	r19		C extract nail
+	srl	m3a,NAIL_BITS,	r8		C U0
+	stq	r28,	-8(rp)
+	addq	r8,	acc2,	acc2
+	bis	r31,	m3b,	acc3
+
+	addq	r19,	acc0,	acc0		C propagate nail
+	and	acc0,numb_mask,	r28
+	stq	r28,	0(rp)
+	srl	acc0,NUMB_BITS,	r19
+	addq	r19,	acc1,	acc1
+
+	and	acc1,numb_mask,	r28
+	stq	r28,	8(rp)
+	srl	acc1,NUMB_BITS,	r19
+	addq	r19,	acc2,	acc2
+
+	and	acc2,numb_mask,	r28
+	stq	r28,	16(rp)
+	srl	acc2,NUMB_BITS,	r19
+	addq	r19,	acc3,	r0
+
+	ldq	r12,	32(r30)
+	ldq	r13,	40(r30)
+	ldq	r14,	48(r30)
+	ldq	r15,	56(r30)
+	lda	r30,	240(r30)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/aors_n.asm b/third_party/gmp/mpn/alpha/ev6/nails/aors_n.asm
new file mode 100644
index 0000000..f658677
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/aors_n.asm
@@ -0,0 +1,233 @@
+dnl  Alpha ev6 nails mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Runs at 2.5 cycles/limb.  It would be possible to reach 2.0 cycles/limb
+dnl  with 8-way unrolling.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`vp',`r18')
+define(`n',`r19')
+
+define(`rl0',`r0')
+define(`rl1',`r1')
+define(`rl2',`r2')
+define(`rl3',`r3')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r6')
+define(`ul3',`r7')
+
+define(`vl0',`r22')
+define(`vl1',`r23')
+define(`vl2',`r24')
+define(`vl3',`r25')
+
+define(`numb_mask',`r21')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`CYSH',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ifdef(`OPERATION_add_n', `
+	define(`OP',        addq)
+	define(`CYSH',`GMP_NUMB_BITS')
+	define(`func',  mpn_add_n)')
+ifdef(`OPERATION_sub_n', `
+	define(`OP',        subq)
+	define(`CYSH',63)
+	define(`func',  mpn_sub_n)')
+
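+C  Carry handling sketch (annotation added here, not in upstream GMP):
+C
+C      rl    = (ul OP vl) OP cy;         C OP is addq or subq
+C      rp[i] = rl & numb_mask;
+C      cy    = rl >> CYSH;
+C
+C  For add the carry lands in bit NUMB_BITS, so CYSH is GMP_NUMB_BITS; for
+C  sub a borrow makes rl negative, so CYSH = 63 picks up the sign bit
+C  (possible since nails >= 1 keeps ul and vl below 2^63).
+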
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+	bis	r31,	r31,	r20
+
+	and	n,	3,	r25
+	lda	n,	-4(n)
+	beq	r25,	L(ge4)
+
+L(lp0):	ldq	ul0,	0(up)
+	lda	up,	8(up)
+	ldq	vl0,	0(vp)
+	lda	vp,	8(vp)
+	lda	rp,	8(rp)
+	lda	r25,	-1(r25)
+	OP	ul0,	vl0,	rl0
+	OP	rl0,	r20,	rl0
+	and	rl0, numb_mask,	r28
+	stq	r28,	-8(rp)
+	srl	rl0,	CYSH,	r20
+	bne	r25,	L(lp0)
+
+	blt	n,	L(ret)
+
+L(ge4):	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+	lda	n,	-4(n)
+	bge	n,	L(ge8)
+
+	OP	ul0,	vl0,	rl0	C		main-add 0
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	br	r31,	L(cj0)
+
+L(ge8):	OP	ul0,	vl0,	rl0	C		main-add 0
+	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+	lda	rp,	32(rp)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+	lda	n,	-4(n)
+	blt	n,	L(end)
+
+	ALIGN(32)
+L(top):	OP	ul0,	vl0,	rl0	C		main-add 0
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	ldq	ul0,	0(up)
+	ldq	vl0,	0(vp)
+
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	and	rl3,numb_mask,	r28
+	stq	r27,	-16(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	ldq	ul1,	8(up)
+	ldq	vl1,	8(vp)
+
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	stq	r28,	-8(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	ldq	ul2,	16(up)
+	ldq	vl2,	16(vp)
+
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	bis	r31,	r31,	r31
+
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	ldq	ul3,	24(up)
+	ldq	vl3,	24(vp)
+
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+	bis	r31,	r31,	r31
+
+	bis	r31,	r31,	r31
+	lda	n,	-4(n)
+	lda	up,	32(up)
+	lda	vp,	32(vp)
+
+	bis	r31,	r31,	r31
+	bis	r31,	r31,	r31
+	lda	rp,	32(rp)
+	bge	n,	L(top)
+
+L(end):	OP	ul0,	vl0,	rl0	C		main-add 0
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	OP	rl0,	r20,	rl0	C		cy-add 0
+	and	rl3,numb_mask,	r28
+	stq	r27,	-16(rp)
+	OP	ul1,	vl1,	rl1	C		main-add 1
+	srl	rl0,	CYSH,	r20	C		gen cy 0
+	OP	rl1,	r20,	rl1	C		cy-add 1
+	and	rl0,numb_mask,	r27
+	stq	r28,	-8(rp)
+L(cj0):	OP	ul2,	vl2,	rl2	C		main-add 2
+	srl	rl1,	CYSH,	r20	C		gen cy 1
+	OP	rl2,	r20,	rl2	C		cy-add 2
+	and	rl1,numb_mask,	r28
+	stq	r27,	0(rp)
+	OP	ul3,	vl3,	rl3	C		main-add 3
+	srl	rl2,	CYSH,	r20	C		gen cy 2
+	OP	rl3,	r20,	rl3	C		cy-add 3
+	and	rl2,numb_mask,	r27
+	stq	r28,	8(rp)
+
+	srl	rl3,	CYSH,	r20	C		gen cy 3
+	and	rl3,numb_mask,	r28
+	stq	r27,	16(rp)
+	stq	r28,	24(rp)
+
+L(ret):	and	r20,	1,	r0
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/gmp-mparam.h b/third_party/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
new file mode 100644
index 0000000..7949fe8
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD             40
+#define MUL_TOOM33_THRESHOLD            236
+
+#define SQR_BASECASE_THRESHOLD            7  /* karatsuba */
+#define SQR_TOOM2_THRESHOLD               0  /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD             120
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIV_DC_THRESHOLD                 48
+#define POWM_THRESHOLD                  113
+
+#define HGCD_THRESHOLD                   78
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                392
+#define JACOBI_BASE_METHOD                1
+
+#define DIVREM_1_NORM_THRESHOLD       MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIVREM_1_UNNORM_THRESHOLD     MP_SIZE_T_MAX  /* no preinv with nails */
+#define MOD_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* no preinv with nails */
+#define MOD_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* no preinv with nails */
+#define USE_PREINV_DIVREM_1               0  /* no preinv with nails */
+#define USE_PREINV_MOD_1                  0  /* no preinv with nails */
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* no preinv with nails */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
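+
+/* The MP_SIZE_T_MAX settings above effectively disable those code paths:
+   the preinverse variants assume nail-free limbs (annotation added here,
+   not generated by tuneup.c).  */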
+
+#define GET_STR_DC_THRESHOLD             15
+#define GET_STR_PRECOMPUTE_THRESHOLD     24
+#define SET_STR_THRESHOLD              6336
+
+#define MUL_FFT_TABLE  { 688, 1440, 3648, 6400, 25600, 0 }
+#define MUL_FFT_MODF_THRESHOLD          488
+#define MUL_FFT_THRESHOLD              3712
+
+#define SQR_FFT_TABLE  { 432, 864, 3136, 6400, 25600, 0 }
+#define SQR_FFT_MODF_THRESHOLD          480
+#define SQR_FFT_THRESHOLD              2976
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/mul_1.asm b/third_party/gmp/mpn/alpha/ev6/nails/mul_1.asm
new file mode 100644
index 0000000..da2ee3d
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/mul_1.asm
@@ -0,0 +1,364 @@
+dnl  Alpha ev6 nails mpn_mul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     3.25
+
+C TODO
+C  * Reroll loop for 3.0 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use an FP loop count and multiple exit points; that would simplify feed-in
+C    lp0 and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
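+C  Same per-limb recurrence as addmul_1 in this directory but without the
+C  rp[i] term (annotation added here, not in upstream GMP):
+C      acc = (lo >> NAIL_BITS) + cy;  rp[i] = acc & numb_mask;
+C      cy  = (acc >> NUMB_BITS) + hi;
+C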
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	and	acc1,numb_mask,	r28
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	addq	t1,	m1b,	r0
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	unop					C U0
+	lda	n,	-4(n)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	srl	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	srl	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	unop					C U0
+	unop					C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	srl	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	addq	t1,	acc1,	acc1
+	srl	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	addq	t1,	m1b,	r0
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/nails/submul_1.asm b/third_party/gmp/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 0000000..f473a59
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,396 @@
+dnl  Alpha ev6 nails mpn_submul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use an FP loop count and multiple exit points; that would simplify feed-in
+C    lp0 and would work since the loop structure is really regular.
+
+C  INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
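+C  Rough sketch (annotation added here, not in upstream GMP): as addmul_1
+C  but subtracting from rp[i], with sra instead of srl so the value carried
+C  between limbs is a signed borrow:
+C
+C      acc    = rp[i] - ((lo >> NAIL_BITS) + bw_in);
+C      rp[i]  = acc & numb_mask;
+C      bw_out = hi - sra(acc, NUMB_BITS);    C sra yields 0 or -1 here
+C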
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sll	vl0, NAIL_BITS, vl0
+	lda	numb_mask, -1(r31)
+	srl	numb_mask, NAIL_BITS, numb_mask
+
+	and	n,	3,	r25
+	cmpeq	r25,	1,	r21
+	bne	r21,	L(1m4)
+	cmpeq	r25,	2,	r21
+	bne	r21,	L(2m4)
+	beq	r25,	L(0m4)
+
+L(3m4):	ldq	ul3,	0(up)
+	lda	n,	-4(n)
+	ldq	ul0,	8(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	16(up)
+	lda	up,	24(up)
+	lda	rp,	-8(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge3)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc1
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	br	r31,	L(ta3)
+
+L(ge3):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul2,	m2b
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	br	r31,	L(el3)
+
+L(0m4):	lda	n,	-8(n)
+	ldq	ul2,	0(up)
+	ldq	ul3,	8(up)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge4)
+
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta4)
+
+L(ge4):	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul1,	m1b
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	srl	m3a,NAIL_BITS,	t0
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(el0)
+
+L(2m4):	lda	n,	-4(n)
+	ldq	ul0,	0(up)
+	ldq	ul1,	8(up)
+	lda	up,	16(up)
+	lda	rp,	-16(rp)
+	mulq	vl0,	ul0,	m0a
+	umulh	vl0,	ul0,	m0b
+	bge	n,	L(ge2)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	r31,	acc0
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	br	r31,	L(ta2)
+
+L(ge2):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	rl0,	16(rp)
+	srl	m0a,NAIL_BITS,	t0
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	addq	t0,	r31,	acc0
+	umulh	vl0,	ul3,	m3b
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	bge	n,	L(el2)
+
+	br	r31,	L(ta6)
+
+L(1m4):	lda	n,	-4(n)
+	ldq	ul1,	0(up)
+	lda	up,	8(up)
+	lda	rp,	-24(rp)
+	bge	n,	L(ge1)
+
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	subq	rl1,	t0,	acc1
+	and	acc1,numb_mask,	r28
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	24(rp)
+	subq	m1b,	t1,	r0
+	ret	r31,	(r26),	1
+
+L(ge1):	ldq	ul2,	0(up)
+	mulq	vl0,	ul1,	m1a
+	umulh	vl0,	ul1,	m1b
+	ldq	ul3,	8(up)
+	lda	n,	-4(n)
+	mulq	vl0,	ul2,	m2a
+	umulh	vl0,	ul2,	m2b
+	ldq	ul0,	16(up)
+	mulq	vl0,	ul3,	m3a
+	umulh	vl0,	ul3,	m3b
+	ldq	rl1,	24(rp)
+	srl	m1a,NAIL_BITS,	t0
+	ldq	ul1,	24(up)
+	lda	up,	32(up)
+	lda	rp,	32(rp)
+	mulq	vl0,	ul0,	m0a
+	addq	t0,	r31,	acc1
+	umulh	vl0,	ul0,	m0b
+	subq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	blt	n,	L(ta5)
+
+L(ge5):	ldq	ul2,	0(up)
+	br	r31,	L(el1)
+
+	ALIGN(16)
+L(top):	mulq	vl0,	ul0,	m0a		C U1
+	addq	t0,	m0b,	acc1		C L0
+	sra	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-24(rp)			C L1
+C
+L(el2):	umulh	vl0,	ul0,	m0b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	subq	rl1,	acc1,	acc1		C U0
+	ldq	rl2,	0(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m2a,NAIL_BITS,	t0		C U0
+	ldq	ul2,	0(up)			C L1
+C
+	mulq	vl0,	ul1,	m1a		C U1
+	addq	t0,	m1b,	acc0		C L0
+	sra	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	-16(rp)			C L1
+C
+L(el1):	umulh	vl0,	ul1,	m1b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	subq	rl2,	acc0,	acc0		C U0
+	ldq	rl3,	8(rp)			C L1
+C
+	lda	n,	-4(n)			C L1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m3a,NAIL_BITS,	t0		C U0
+	ldq	ul3,	8(up)			C L1
+C
+	mulq	vl0,	ul2,	m2a		C U1
+	addq	t0,	m2b,	acc1		C L0
+	sra	acc0,NUMB_BITS,	t1		C U0
+	stq	r28,	-8(rp)			C L1
+C
+L(el0):	umulh	vl0,	ul2,	m2b		C U1
+	and	acc0,numb_mask,	r28		C L0
+	subq	rl3,	acc1,	acc1		C U0
+	ldq	rl0,	16(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc1,	acc1		C L0
+	srl	m0a,NAIL_BITS,	t0		C U0
+	ldq	ul0,	16(up)			C L1
+C
+	mulq	vl0,	ul3,	m3a		C U1
+	addq	t0,	m3b,	acc0		C L0
+	sra	acc1,NUMB_BITS,	t1		C U0
+	stq	r28,	0(rp)			C L1
+C
+L(el3):	umulh	vl0,	ul3,	m3b		C U1
+	and	acc1,numb_mask,	r28		C L0
+	subq	rl0,	acc0,	acc0		C U0
+	ldq	rl1,	24(rp)			C L1
+C
+	unop					C U1
+	addq	t1,	acc0,	acc0		C L0
+	srl	m1a,NAIL_BITS,	t0		C U0
+	ldq	ul1,	24(up)			C L1
+C
+	lda	up,	32(up)			C L0
+	unop					C U1
+	lda	rp,	32(rp)			C L1
+	bge	n,	L(top)			C U0
+
+L(end):	mulq	vl0,	ul0,	m0a
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	-24(rp)
+L(ta6):	umulh	vl0,	ul0,	m0b
+	and	acc0,numb_mask,	r28
+	subq	rl1,	acc1,	acc1
+	ldq	rl2,	0(rp)
+	addq	t1,	acc1,	acc1
+	srl	m2a,NAIL_BITS,	t0
+	mulq	vl0,	ul1,	m1a
+	addq	t0,	m1b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	-16(rp)
+L(ta5):	umulh	vl0,	ul1,	m1b
+	and	acc1,numb_mask,	r28
+	subq	rl2,	acc0,	acc0
+	ldq	rl3,	8(rp)
+	addq	t1,	acc0,	acc0
+	srl	m3a,NAIL_BITS,	t0
+	addq	t0,	m2b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	-8(rp)
+	unop
+	ALIGN(16)
+L(ta4):	and	acc0,numb_mask,	r28
+	subq	rl3,	acc1,	acc1
+	ldq	rl0,	16(rp)
+	addq	t1,	acc1,	acc1
+	srl	m0a,NAIL_BITS,	t0
+	addq	t0,	m3b,	acc0
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	0(rp)
+	unop
+	ALIGN(16)
+L(ta3):	and	acc1,numb_mask,	r28
+	subq	rl0,	acc0,	acc0
+	ldq	rl1,	24(rp)
+	addq	t1,	acc0,	acc0
+	srl	m1a,NAIL_BITS,	t0
+	addq	t0,	m0b,	acc1
+	sra	acc0,NUMB_BITS,	t1
+	stq	r28,	8(rp)
+	unop
+	ALIGN(16)
+L(ta2):	and	acc0,numb_mask,	r28
+	subq	rl1,	acc1,	acc1
+	addq	t1,	acc1,	acc1
+	sra	acc1,NUMB_BITS,	t1
+	stq	r28,	16(rp)
+	and	acc1,numb_mask,	r28
+	subq	m1b,	t1,	r0
+	stq	r28,	24(rp)
+	ret	r31,	(r26),	1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev6/slot.pl b/third_party/gmp/mpn/alpha/ev6/slot.pl
new file mode 100755
index 0000000..a4c8a36
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/slot.pl
@@ -0,0 +1,318 @@
+#!/usr/bin/perl -w
+
+# Copyright 2000, 2001, 2003-2005, 2011 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: slot.pl [filename.o]...
+#
+# Run "objdump" to produce a disassembly of the given object file(s) and
+# annotate the output with "U" or "L" slotting which Alpha EV6 will use.
+#
+# When an instruction is E (ie. either U or L), an "eU" or "eL" is shown, as
+# a reminder that it wasn't a fixed requirement that gave the U or L, but
+# the octaword slotting rules.
+#
+# If an instruction is not recognised, that octaword does not get any U/L
+# shown, only lower-case "u", "l" or "e" for the instructions which are
+# known.  Add any unknown instructions to %optable below.
+
+
+use strict;
+
+# The U or L which various instructions demand, or E if either.
+#
+my %optable =
+  (
+   'addq'   => 'E',
+   'and'    => 'E',
+   'andnot' => 'E',
+   'beq'    => 'U',
+   'bge'    => 'U',
+   'bgt'    => 'U',
+   'bic'    => 'E',
+   'bis'    => 'E',
+   'blt'    => 'U',
+   'bne'    => 'U',
+   'br'     => 'L',
+   'clr'    => 'E',
+   'cmpule' => 'E',
+   'cmpult' => 'E',
+   'cmpeq'  => 'E',
+   'cmoveq' => 'E',
+   'cmovne' => 'E',
+   'ctpop'  => 'U',
+   'ctlz'   => 'U',
+   'cttz'   => 'U',
+   'extbl'  => 'U',
+   'extlh'  => 'U',
+   'extll'  => 'U',
+   'extqh'  => 'U',
+   'extql'  => 'U',
+   'extwh'  => 'U',
+   'extwl'  => 'U',
+   'jsr'    => 'L',
+   'lda'    => 'E',
+   'ldah'   => 'E',
+   'ldbu'   => 'L',
+   'ldl'    => 'L',
+   'ldq'    => 'L',
+   'ldt'    => 'L',
+   'ret'    => 'L',
+   'mov'    => 'E',
+   'mull'   => 'U',
+   'mulq'   => 'U',
+   'negq'   => 'E',
+   'nop'    => 'E',
+   'not'    => 'E',
+   's8addq' => 'E',
+   's8subq' => 'E',
+   # 'sextb'  => ?
+   # 'sextl'  => ?
+   'sll'    => 'U',
+   'srl'    => 'U',
+   'stq'    => 'L',
+   'subq'   => 'E',
+   'umulh'  => 'U',
+   'unop'   => 'E',
+   'xor'    => 'E',
+  );
+
+# Slottings used for a given pattern of U/L/E in an octaword.  This is as
+# per the "Ebox Slotting" section of the EV6 hardware reference manual.
+#
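+# For example, 'EEUU' => 'LLUU': the two U instructions claim the upper
+# slots of the octaword, so both E's are issued in the lower slots.
+#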
+my %slottable =
+  (
+   'EEEE' => 'ULUL',
+   'EEEL' => 'ULUL',
+   'EEEU' => 'ULLU',
+   'EELE' => 'ULLU',
+   'EELL' => 'UULL',
+   'EELU' => 'ULLU',
+   'EEUE' => 'ULUL',
+   'EEUL' => 'ULUL',
+   'EEUU' => 'LLUU',
+   'ELEE' => 'ULUL',
+   'ELEL' => 'ULUL',
+   'ELEU' => 'ULLU',
+   'ELLE' => 'ULLU',
+   'ELLL' => 'ULLL',
+   'ELLU' => 'ULLU',
+   'ELUE' => 'ULUL',
+   'ELUL' => 'ULUL',
+
+   'LLLL' => 'LLLL',
+   'LLLU' => 'LLLU',
+   'LLUE' => 'LLUU',
+   'LLUL' => 'LLUL',
+   'LLUU' => 'LLUU',
+   'LUEE' => 'LULU',
+   'LUEL' => 'LUUL',
+   'LUEU' => 'LULU',
+   'LULE' => 'LULU',
+   'LULL' => 'LULL',
+   'LULU' => 'LULU',
+   'LUUE' => 'LUUL',
+   'LUUL' => 'LUUL',
+   'LUUU' => 'LUUU',
+   'UEEE' => 'ULUL',
+   'UEEL' => 'ULUL',
+   'UEEU' => 'ULLU',
+
+   'ELUU' => 'LLUU',
+   'EUEE' => 'LULU',
+   'EUEL' => 'LUUL',
+   'EUEU' => 'LULU',
+   'EULE' => 'LULU',
+   'EULL' => 'UULL',
+   'EULU' => 'LULU',
+   'EUUE' => 'LUUL',
+   'EUUL' => 'LUUL',
+   'EUUU' => 'LUUU',
+   'LEEE' => 'LULU',
+   'LEEL' => 'LUUL',
+   'LEEU' => 'LULU',
+   'LELE' => 'LULU',
+   'LELL' => 'LULL',
+   'LELU' => 'LULU',
+   'LEUE' => 'LUUL',
+   'LEUL' => 'LUUL',
+   'LEUU' => 'LLUU',
+   'LLEE' => 'LLUU',
+   'LLEL' => 'LLUL',
+   'LLEU' => 'LLUU',
+   'LLLE' => 'LLLU',
+
+   'UELE' => 'ULLU',
+   'UELL' => 'UULL',
+   'UELU' => 'ULLU',
+   'UEUE' => 'ULUL',
+   'UEUL' => 'ULUL',
+   'UEUU' => 'ULUU',
+   'ULEE' => 'ULUL',
+   'ULEL' => 'ULUL',
+   'ULEU' => 'ULLU',
+   'ULLE' => 'ULLU',
+   'ULLL' => 'ULLL',
+   'ULLU' => 'ULLU',
+   'ULUE' => 'ULUL',
+   'ULUL' => 'ULUL',
+   'ULUU' => 'ULUU',
+   'UUEE' => 'UULL',
+   'UUEL' => 'UULL',
+   'UUEU' => 'UULU',
+   'UULE' => 'UULL',
+   'UULL' => 'UULL',
+   'UULU' => 'UULU',
+   'UUUE' => 'UUUL',
+   'UUUL' => 'UUUL',
+   'UUUU' => 'UUUU',
+  );
+
+# Check all combinations of U/L/E are present in %slottable.
+sub coverage {
+  foreach my $a ('U', 'L', 'E') {
+    foreach my $b ('U', 'L', 'E') {
+      foreach my $c ('U', 'L', 'E') {
+        foreach my $d ('U', 'L', 'E') {
+          my $x = $a . $b . $c . $d;
+          if (! defined $slottable{$x}) {
+            print "slottable missing: $x\n"
+          }
+        }
+      }
+    }
+  }
+}
+
+# Certain consistency checks for %slottable.
+sub check {
+  foreach my $x (keys %slottable) {
+    my $a = substr($x,0,1);
+    my $b = substr($x,1,1);
+    my $c = substr($x,2,1);
+    my $d = substr($x,3,1);
+    my $es = ($a eq 'E') + ($b eq 'E') + ($c eq 'E') + ($d eq 'E');
+    my $ls = ($a eq 'L') + ($b eq 'L') + ($c eq 'L') + ($d eq 'L');
+    my $us = ($a eq 'U') + ($b eq 'U') + ($c eq 'U') + ($d eq 'U');
+
+    my $got = $slottable{$x};
+    my $want = $x;
+
+    if ($es == 0) {
+
+    } elsif ($es == 1) {
+      # when only one E, it's mapped to whichever of U or L is otherwise
+      # used the least
+      if ($ls > $us) {
+        $want =~ s/E/U/;
+      } else {
+        $want =~ s/E/L/;
+      }
+    } elsif ($es == 2) {
+      # when two E's and two U, then the E's map to L; vice versa for two E
+      # and two L
+      if ($ls == 2) {
+        $want =~ s/E/U/g;
+      } elsif ($us == 2) {
+        $want =~ s/E/L/g;
+      } else {
+        next;
+      }
+    } elsif ($es == 3) {
+      next;
+
+    } else { # $es == 4
+      next;
+    }
+
+    if ($want ne $got) {
+      print "slottable $x want $want got $got\n";
+    }
+  }
+}
+
+sub disassemble {
+  my ($file) = @_;
+
+  open (IN, "objdump -Srfh $file |") || die "Cannot open pipe from objdump\n";
+
+  my (%pre, %post, %type);
+  while (<IN>) {
+    my $line = $_ . "";
+
+    if ($line =~ /(^[ \t]*[0-9a-f]*([0-9a-f]):[ \t]*[0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] [0-9a-f][0-9a-f] )\t(([a-z0-9]+).*)/) {
+      my ($this_pre, $addr, $this_post, $opcode) = ($1, $2, $3, $4);
+
+      my $this_type = $optable{$opcode};
+      if (! defined ($this_type)) { $this_type = ' '; }
+
+      $pre{$addr} = $this_pre;
+      $post{$addr} = $this_post;
+      $type{$addr} = $this_type;
+
+      if ($addr eq 'c') {
+        my %slot = ('0'=>' ', '4'=>' ', '8'=>' ', 'c'=>' ');
+
+        my $str = $type{'c'} . $type{'8'} . $type{'4'} . $type{'0'};
+        $str = $slottable{$str};
+        if (defined $str) {
+          $slot{'c'} = substr($str,0,1);
+          $slot{'8'} = substr($str,1,1);
+          $slot{'4'} = substr($str,2,1);
+          $slot{'0'} = substr($str,3,1);
+        }
+
+        foreach my $i ('0', '4', '8', 'c') {
+          if ($slot{$i} eq $type{$i}) { $type{$i} = ' '; }
+          print $pre{$i}, ' ', lc($type{$i}),$slot{$i}, '  ', $post{$i}, "\n";
+        }
+
+        %pre = ();
+        %type = ();
+        %post = ();
+      }
+    }
+  }
+
+  close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+coverage();
+check();
+
+my @files;
+if ($#ARGV >= 0) {
+  @files = @ARGV;
+} else {
+  die "Usage: slot.pl [filename.o]...\n";
+}
+
+foreach (@files)  {
+    disassemble($_);
+}
diff --git a/third_party/gmp/mpn/alpha/ev6/sub_n.asm b/third_party/gmp/mpn/alpha/ev6/sub_n.asm
new file mode 100644
index 0000000..a35ba40
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev6/sub_n.asm
@@ -0,0 +1,283 @@
+dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl  and store difference in a third limb vector.
+
+dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     5.4
+C EV6:     2.125
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  vp	r18
+C  n	r19
+C  cy	r20   (for mpn_sub_nc)
+
+C TODO
+C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
+C   Use multi-pronged feed-in.
+C   Perform additional micro-tuning
+
+C  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+C  Pair loads and stores where possible
+C  Store pairs oct-aligned where possible (didn't need it here)
+C  Stores are delayed every third cycle
+C  Loads and stores are delayed by fills
+C  U stays still, put code there where possible (note alternation of U1 and U0)
+C  L moves because of loads and stores
+C  Note dampers in L to limit damage
+
+C  This odd-looking optimization expects that we have random bits in our
+C  data, so that a pure zero result is unlikely; we therefore penalize the
+C  unlikely case to help the common case.
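+C
+C  In C terms, per limb (annotation added here, not in upstream GMP):
+C
+C      diff = u - v;  bw = (u < v);
+C      out  = diff - bw_in;           C re-borrows only when diff == 0
+C      if (diff == 0)  bw |= bw_in;   C the $fixN branches below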
+
+define(`u0', `r0')  define(`u1', `r3')
+define(`v0', `r1')  define(`v1', `r4')
+
+define(`cy0', `r20')  define(`cy1', `r21')
+
+MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+	br	r31,	$entry
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	bis	r31,	r31,	cy0	C clear carry in
+$entry:	cmpult	r19,	5,	r22	C L1 move counter
+	ldq	u1,	0(r17)		C L0 get next ones
+	ldq	v1,	0(r18)		C L1
+	bne	r22,	$Lsmall
+
+	ldq	u0,	8(r17)		C L0 get next ones
+	ldq	v0,	8(r18)		C L1
+	subq	u1,	v1,	r5	C U0 sub two data
+
+	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	16(r17)		C L0 get next ones
+	ldq	v1,	16(r18)		C L1
+
+	subq	u0,	v0,	r8	C U1 sub two data
+	subq	r5,	cy0,	r24	C U0 borrow in
+
+	cmpult	u0,	v0,	r22	C U1 did it borrow
+	beq	r5,	$fix5f		C U0 fix exact zero
+$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
+	ldq	v0,	24(r18)		C L1
+
+	subq	r8,	r23,	r25	C U1 borrow from last
+	subq	u1,	v1,	r7	C U0 sub two data
+
+	beq	r8,	$fix6f		C U1 fix exact zero
+$ret6f:	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	32(r17)		C L0 get next ones
+	ldq	v1,	32(r18)		C L1
+
+	lda	r17,	40(r17)		C L0 move pointer
+	lda	r18,	40(r18)		C L1 move pointer
+
+	lda	r16,	-8(r16)
+	lda	r19,	-13(r19)	C L1 move counter
+	blt	r19,	$Lend		C U1 loop control
+
+
+C Main loop.  8-way unrolled.
+	ALIGN(16)
+$Loop:	subq	u0,	v0,	r2	C U1 sub two data
+	stq	r24,	8(r16)		C L0 put an answer
+	subq	r7,	r22,	r24	C U0 borrow from last
+	stq	r25,	16(r16)		C L1 pair
+
+	cmpult	u0,	v0,	cy1	C U1 did it borrow
+	beq	r7,	$fix7		C U0 fix exact 0
+$ret7:	ldq	u0,	0(r17)		C L0 get next ones
+	ldq	v0,	0(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	subq	r2,	r23,	r25	C U1 borrow from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	subq	u1,	v1,	r5	C U0 sub two data
+
+	beq	r2,	$fix0		C U1 fix exact zero
+$ret0:	cmpult	u1,	v1,	cy0	C U0 did it borrow
+	ldq	u1,	8(r17)		C L0 get next ones
+	ldq	v1,	8(r18)		C L1
+
+	subq	u0,	v0,	r8	C U1 sub two data
+	stq	r24,	24(r16)		C L0 store pair
+	subq	r5,	cy1,	r24	C U0 borrow from last
+	stq	r25,	32(r16)		C L1
+
+	cmpult	u0,	v0,	r22	C U1 did it borrow
+	beq	r5,	$fix1		C U0 fix exact zero
+$ret1:	ldq	u0,	16(r17)		C L0 get next ones
+	ldq	v0,	16(r18)		C L1
+
+	lda	r16,	64(r16)		C L0 move pointer
+	subq	r8,	cy0,	r25	C U1 borrow from last
+	lda	r19,	-8(r19)		C L1 move counter
+	subq	u1,	v1,	r7	C U0 sub two data
+
+	beq	r8,	$fix2		C U1 fix exact zero
+$ret2:	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	24(r17)		C L0 get next ones
+	ldq	v1,	24(r18)		C L1
+
+	subq	u0,	v0,	r2	C U1 sub two data
+	stq	r24,	-24(r16)	C L0 put an answer
+	subq	r7,	r22,	r24	C U0 borrow from last
+	stq	r25,	-16(r16)	C L1 pair
+
+	cmpult	u0,	v0,	cy1	C U1 did it borrow
+	beq	r7,	$fix3		C U0 fix exact 0
+$ret3:	ldq	u0,	32(r17)		C L0 get next ones
+	ldq	v0,	32(r18)		C L1
+
+	bis	r31,	r31,	r31	C L  damp out
+	subq	r2,	r23,	r25	C U1 borrow from last
+	bis	r31,	r31,	r31	C L  moves in L !
+	subq	u1,	v1,	r5	C U0 sub two data
+
+	beq	r2,	$fix4		C U1 fix exact zero
+$ret4:	cmpult	u1,	v1,	cy0	C U0 did it borrow
+	ldq	u1,	40(r17)		C L0 get next ones
+	ldq	v1,	40(r18)		C L1
+
+	subq	u0,	v0,	r8	C U1 sub two data
+	stq	r24,	-8(r16)		C L0 store pair
+	subq	r5,	cy1,	r24	C U0 borrow from last
+	stq	r25,	0(r16)		C L1
+
+	cmpult	u0,	v0,	r22	C U1 did it borrow
+	beq	r5,	$fix5		C U0 fix exact zero
+$ret5:	ldq	u0,	48(r17)		C L0 get next ones
+	ldq	v0,	48(r18)		C L1
+
+	ldl	r31, 256(r17)		C L0 prefetch
+	subq	r8,	cy0,	r25	C U1 borrow from last
+	ldl	r31, 256(r18)		C L1 prefetch
+	subq	u1,	v1,	r7	C U0 sub two data
+
+	beq	r8,	$fix6		C U1 fix exact zero
+$ret6:	cmpult	u1,	v1,	r23	C U0 did it borrow
+	ldq	u1,	56(r17)		C L0 get next ones
+	ldq	v1,	56(r18)		C L1
+
+	lda	r17,	64(r17)		C L0 move pointer
+	bis	r31,	r31,	r31	C U
+	lda	r18,	64(r18)		C L1 move pointer
+	bge	r19,	$Loop		C U1 loop control
+C ==== main loop end
+
+$Lend:	subq	u0,	v0,	r2	C U1 sub two data
+	stq	r24,	8(r16)		C L0 put an answer
+	subq	r7,	r22,	r24	C U0 borrow from last
+	stq	r25,	16(r16)		C L1 pair
+	cmpult	u0,	v0,	cy1	C U1 did it borrow
+	beq	r7,	$fix7c		C U0 fix exact 0
+$ret7c:	subq	r2,	r23,	r25	C U1 borrow from last
+	subq	u1,	v1,	r5	C U0 sub two data
+	beq	r2,	$fix0c		C U1 fix exact zero
+$ret0c:	cmpult	u1,	v1,	cy0	C U0 did it borrow
+	stq	r24,	24(r16)		C L0 store pair
+	subq	r5,	cy1,	r24	C U0 borrow from last
+	stq	r25,	32(r16)		C L1
+	beq	r5,	$fix1c		C U0 fix exact zero
+$ret1c:	stq	r24,	40(r16)		C L0 put an answer
+	lda	r16,	48(r16)		C L0 move pointer
+
+	lda	r19,	8(r19)
+	beq	r19,	$Lret
+
+	ldq	u1,	0(r17)
+	ldq	v1,	0(r18)
+$Lsmall:
+	lda	r19,	-1(r19)
+	beq	r19,	$Lend0
+
+	ALIGN(8)
+$Loop0:	subq	u1,	v1,	r2	C main sub
+	cmpult	u1,	v1,	r8	C compute bw from last sub
+	ldq	u1,	8(r17)
+	ldq	v1,	8(r18)
+	subq	r2,	cy0,	r5	C borrow sub
+	lda	r17,	8(r17)
+	lda	r18,	8(r18)
+	stq	r5,	0(r16)
+	cmpult	r2,	cy0,	cy0	C compute bw from last sub
+	lda	r19,	-1(r19)		C decr loop cnt
+	bis	r8,	cy0,	cy0	C combine bw from the two subs
+	lda	r16,	8(r16)
+	bne	r19,	$Loop0
+$Lend0:	subq	u1,	v1,	r2	C main sub
+	subq	r2,	cy0,	r5	C borrow sub
+	cmpult	u1,	v1,	r8	C compute bw from last sub
+	cmpult	r2,	cy0,	cy0	C compute bw from last sub
+	stq	r5,	0(r16)
+	bis	r8,	cy0,	r0	C combine bw from the two subs
+	ret	r31,(r26),1
+
+	ALIGN(8)
+$Lret:	lda	r0,	0(cy0)		C copy borrow into return register
+	ret	r31,(r26),1
+
+$fix5f:	bis	r23,	cy0,	r23	C bring forward borrow
+	br	r31,	$ret5f
+$fix6f:	bis	r22,	r23,	r22	C bring forward borrow
+	br	r31,	$ret6f
+$fix0:	bis	cy1,	r23,	cy1	C bring forward borrow
+	br	r31,	$ret0
+$fix1:	bis	cy0,	cy1,	cy0	C bring forward borrow
+	br	r31,	$ret1
+$fix2:	bis	r22,	cy0,	r22	C bring forward borrow
+	br	r31,	$ret2
+$fix3:	bis	r23,	r22,	r23	C bring forward borrow
+	br	r31,	$ret3
+$fix4:	bis	cy1,	r23,	cy1	C bring forward borrow
+	br	r31,	$ret4
+$fix5:	bis	cy1,	cy0,	cy0	C bring forward borrow
+	br	r31,	$ret5
+$fix6:	bis	r22,	cy0,	r22	C bring forward borrow
+	br	r31,	$ret6
+$fix7:	bis	r23,	r22,	r23	C bring forward borrow
+	br	r31,	$ret7
+$fix0c:	bis	cy1,	r23,	cy1	C bring forward borrow
+	br	r31,	$ret0c
+$fix1c:	bis	cy0,	cy1,	cy0	C bring forward borrow
+	br	r31,	$ret1c
+$fix7c:	bis	r23,	r22,	r23	C bring forward borrow
+	br	r31,	$ret7c
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev67/gcd_11.asm b/third_party/gmp/mpn/alpha/ev67/gcd_11.asm
new file mode 100644
index 0000000..03c234b
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev67/gcd_11.asm
@@ -0,0 +1,79 @@
+dnl  Alpha ev67 mpn_gcd_11 -- 1x1 greatest common divisor.
+
+dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 3.4 cycles/bitpair for 1x1 part
+
+
+C mp_limb_t mpn_gcd_11 (mp_limb_t x, mp_limb_t y);
+C
+C The algorithm is to change x,y to abs(x-y),min(x,y) and strip trailing
+C zeros from abs(x-y) to maintain x and y both odd.
+C
+C The trailing zeros are calculated from just x-y, since in twos-complement
+C there's the same number of trailing zeros on d or -d.  This means the cttz
+C runs in parallel with abs(x-y).
+C
+C The loop takes 5 cycles, which at 0.68 iterations per bit for two N-bit
+C operands with this algorithm gives the measured 3.4 c/l.
+C
+C The slottings shown are for SVR4 style systems; Unicos differs in the
+C initial gp setup and the LEA.
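+C
+C  A C sketch of the same iteration (an illustration only; ctz stands for
+C  count-trailing-zeros, e.g. __builtin_ctzl; x and y are both odd):
+C
+C      do
+C        {
+C          unsigned long d = x - y;           /* difference, twos complement */
+C          int lt = (x < y);                  /* test x >= y */
+C          unsigned long a = lt ? y - x : d;  /* abs(x-y) */
+C          if (lt) y = x;                     /* y = min(x,y) */
+C          if (d == 0) return y;              /* stop when d == 0 */
+C          x = a >> ctz (d);                  /* ctz(d) == ctz(-d) == ctz(a) */
+C        }
+C      while (1);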
+
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	mov	r16, r0
+	mov	r17, r1
+
+	ALIGN(16)
+L(top):	subq	r0, r1, r7		C l0  d = x - y
+	cmpult	r0, r1, r16		C u0  test x >= y
+
+	subq	r1, r0, r4		C l0  new_x = y - x
+	cttz	r7, r8			C U0  d twos
+
+	cmoveq	r16, r7, r4		C l0  new_x = d if x>=y
+	cmovne	r16, r0, r1		C u0  y = x if x<y
+	unop				C l   \ force cmoveq into l0
+	unop				C u   /
+
+	C				C cmoveq2 L0, cmovne2 U0
+
+	srl	r4, r8, r0		C U0  x = new_x >> twos
+	bne	r7, L(top)		C U1  stop when d==0
+
+
+L(end):	mov	r1, r0			C U0  return y << common_twos
+	ret	r31, (r26), 1		C L0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev67/hamdist.asm b/third_party/gmp/mpn/alpha/ev67/hamdist.asm
new file mode 100644
index 0000000..4b13e9f
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev67/hamdist.asm
@@ -0,0 +1,111 @@
+dnl  Alpha ev67 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 2.5 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C The hope was for 2.0 c/l here, but that isn't achieved.  We're limited by
+C a shortage of rename registers.  Since we need 5 instructions per limb,
+C further unrolling could approach 1.5 c/l.
+C
+C The main loop processes two limbs from each operand on each iteration.  An
+C odd size is handled by processing xp[0]^yp[0] at the start.  If the size
+C is even that result is discarded, and xp[0]^yp[0] is repeated by the
+C main loop.
+C
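+C Functionally, the whole routine is (a sketch; popcount stands for the
+C ctpop instruction, e.g. __builtin_popcountll):
+C
+C      unsigned long c = 0;
+C      for (i = 0; i < size; i++)
+C        c += popcount (xp[i] ^ yp[i]);
+C      return c;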
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+	C r16	xp
+	C r17	yp
+	C r18	size
+
+	ldq	r1, 0(r16)		C L0  xp[0]
+	ldq	r2, 0(r17)		C L1  yp[0]
+	and	r18, 1, r8		C U1  1 if size odd
+	srl	r18, 1, r18		C U0  size, limb pairs
+
+	clr	r0			C L0  initial total
+	s8addq	r8, r17, r17		C U1  yp++ if size odd
+	s8addq	r8, r16, r16		C L1  xp++ if size odd
+	clr	r6			C U0  dummy initial xor 1
+
+	xor	r1, r2, r5		C L   initial xor 0
+	beq	r18, L(one)		C U   if size==1
+
+	cmoveq	r8, r31, r5		C L   discard first limb if size even
+	unop				C U
+
+
+	ALIGN(16)
+L(top):
+	C r0	total accumulating
+	C r7	xor 0
+	C r8	xor 1
+	C r16	xp, incrementing
+	C r17	yp, incrementing
+	C r18	size, limb pairs, decrementing
+
+	ldq	r1, 0(r16)		C L
+	ldq	r2, 0(r17)		C L
+	ctpop	r5, r7			C U0
+	lda	r16, 16(r16)		C U
+
+	ldq	r3, -8(r16)		C L
+	ldq	r4, 8(r17)		C L
+	ctpop	r6, r8			C U0
+	lda	r17, 16(r17)		C U
+
+	ldl	r31, 256(r16)		C L	prefetch
+	ldl	r31, 256(r17)		C L	prefetch
+	xor	r1, r2, r5		C U
+	lda	r18, -1(r18)		C U
+
+	xor	r3, r4, r6		C U
+	addq	r0, r7, r0		C L
+	addq	r0, r8, r0		C L
+	bne	r18, L(top)		C U
+
+
+	ctpop	r6, r8			C U0
+	addq	r0, r8, r0		C L
+L(one):
+	ctpop	r5, r7			C U0
+	addq	r0, r7, r0		C L
+
+	ret	r31, (r26), 1		C L0
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/ev67/popcount.asm b/third_party/gmp/mpn/alpha/ev67/popcount.asm
new file mode 100644
index 0000000..049c1cd
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/ev67/popcount.asm
@@ -0,0 +1,101 @@
+dnl  Alpha ev67 mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 1.5 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C This schedule seems necessary for the full 1.5 c/l; the IQ can't quite hide
+C all latencies, so the addq's must be deferred to the next iteration.
+C
+C Since we need just 3 instructions per limb, further unrolling could approach
+C 1.0 c/l.
+C
+C The main loop processes two limbs at a time.  An odd size is handled by
+C processing src[0] at the start.  If the size is even that result is
+C discarded, and src[0] is repeated by the main loop.
+C
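+C In C terms the odd/even trick is (a sketch):
+C
+C      c = popcount (src[0]);
+C      if ((size & 1) == 0) c = 0;   /* even size: discard, redone below */
+C      src += size & 1;              /* src++ if size odd */
+C      for (i = 0; i < size >> 1; i++)
+C        c += popcount (src[2*i]) + popcount (src[2*i+1]);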
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	C r16	src
+	C r17	size
+
+	ldq	r0, 0(r16)		C L0  src[0]
+	and	r17, 1, r8		C U1  1 if size odd
+	srl	r17, 1, r17		C U0  size, limb pairs
+
+	s8addq	r8, r16, r16		C L1  src++ if size odd
+	ctpop	r0, r0			C U0
+	beq	r17, L(one)		C U1  if size==1
+
+	cmoveq	r8, r31, r0		C L   discard first limb if size even
+	clr	r3			C L
+
+	clr	r4			C L
+	unop				C U
+	unop				C L
+	unop				C U
+
+
+	ALIGN(16)
+L(top):
+	C r0	total accumulating
+	C r3	pop 0
+	C r4	pop 1
+	C r16	src, incrementing
+	C r17	size, decrementing
+
+	ldq	r1, 0(r16)		C L
+	ldq	r2, 8(r16)		C L
+	lda	r16, 16(r16)		C U
+	lda	r17, -1(r17)		C U
+
+	addq	r0, r3, r0		C L
+	addq	r0, r4, r0		C L
+	ctpop	r1, r3			C U0
+	ctpop	r2, r4			C U0
+
+	ldl	r31, 512(r16)		C L	prefetch
+	bne	r17, L(top)		C U
+
+
+	addq	r0, r3, r0		C L
+	addq	r0, r4, r0		C U
+L(one):
+	ret	r31, (r26), 1		C L0
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/gmp-mparam.h b/third_party/gmp/mpn/alpha/gmp-mparam.h
new file mode 100644
index 0000000..b850bd2
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/gmp-mparam.h
@@ -0,0 +1,86 @@
+/* Alpha EV4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2005, 2009 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+/* 175MHz 21064 */
+
+/* Generated by tuneup.c, 2009-01-15, gcc 3.2 */
+
+#define MUL_TOOM22_THRESHOLD             12
+#define MUL_TOOM33_THRESHOLD             69
+#define MUL_TOOM44_THRESHOLD             88
+
+#define SQR_BASECASE_THRESHOLD            4
+#define SQR_TOOM2_THRESHOLD              20
+#define SQR_TOOM3_THRESHOLD              62
+#define SQR_TOOM4_THRESHOLD             155
+
+#define MULLO_BASECASE_THRESHOLD          0  /* always */
+#define MULLO_DC_THRESHOLD               40
+#define MULLO_MUL_N_THRESHOLD           202
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* preinv always */
+#define DIV_DC_THRESHOLD                 38
+#define POWM_THRESHOLD                   60
+
+#define MATRIX22_STRASSEN_THRESHOLD      17
+#define HGCD_THRESHOLD                   80
+#define GCD_DC_THRESHOLD                237
+#define GCDEXT_DC_THRESHOLD             198
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD         0  /* always */
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1_THRESHOLD                 2
+#define MOD_1_2_THRESHOLD                 9
+#define MOD_1_4_THRESHOLD                20
+#define USE_PREINV_DIVREM_1               1  /* preinv always */
+#define USE_PREINV_MOD_1                  1  /* preinv always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             20
+#define GET_STR_PRECOMPUTE_THRESHOLD     37
+#define SET_STR_DC_THRESHOLD            746
+#define SET_STR_PRECOMPUTE_THRESHOLD   1332
+
+#define MUL_FFT_TABLE  { 240, 480, 1344, 2304, 5120, 20480, 49152, 0 }
+#define MUL_FFT_MODF_THRESHOLD          232
+#define MUL_FFT_THRESHOLD              1664
+
+#define SQR_FFT_TABLE  { 240, 480, 1216, 2304, 5120, 12288, 49152, 0 }
+#define SQR_FFT_MODF_THRESHOLD          232
+#define SQR_FFT_THRESHOLD              1408
diff --git a/third_party/gmp/mpn/alpha/invert_limb.asm b/third_party/gmp/mpn/alpha/invert_limb.asm
new file mode 100644
index 0000000..afc010f
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/invert_limb.asm
@@ -0,0 +1,95 @@
+dnl  Alpha mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Copyright 1996, 2000-2003, 2007, 2011, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:   137/140  (with BWX/without BWX)
+C EV6:    71/72   (with BWX/without BWX)
+
+C This was compiler generated, with minimal manual edits.  Surely several
+C cycles could be cut with some thought.
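+C
+C For reference: with B = 2^64 and d normalized (high bit set), the value
+C computed is floor((B*B - 1) / d) - B.  A C sketch, assuming a 128-bit
+C type is available:
+C
+C      unsigned __int128 b2m1 = ~(unsigned __int128) 0;   /* B*B - 1 */
+C      return (mp_limb_t) (b2m1 / d);   /* truncation to 64 bits subtracts B */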
+
+ASM_START()
+PROLOGUE(mpn_invert_limb,gp)
+	LEA(	r2, approx_tab)
+	srl	r16, 54, r1
+	srl	r16, 24, r4
+	and	r16, 1, r5
+	bic	r1, 1, r7
+	lda	r4, 1(r4)
+	srl	r16, 1, r3
+	addq	r7, r2, r1
+ifelse(bwx_available_p,1,`
+	ldwu	r0, -512(r1)
+',`
+	ldq_u	r0, -512(r1)
+	extwl	r0, r7, r0
+')
+	addq	r3, r5, r3
+	mull	r0, r0, r1
+	sll	r0, 11, r0
+	mulq	r1, r4, r1
+	srl	r1, 40, r1
+	subq	r0, r1, r0
+	lda	r0, -1(r0)
+	mulq	r0, r0, r2
+	sll	r0, 60, r1
+	sll	r0, 13, r0
+	mulq	r2, r4, r2
+	subq	r1, r2, r1
+	srl	r1, 47, r1
+	addq	r0, r1, r0
+	mulq	r0, r3, r3
+	srl	r0, 1, r1
+	cmoveq	r5, 0, r1
+	subq	r1, r3, r1
+	umulh	r1, r0, r3
+	sll	r0, 31, r0
+	srl	r3, 1, r1
+	addq	r0, r1, r0
+	mulq	r0, r16, r2
+	umulh	r0, r16, r3
+	addq	r2, r16, r1
+	addq	r3, r16, r16
+	cmpult	r1, r2, r1
+	addq	r16, r1, r3
+	subq	r0, r3, r0
+	ret	r31, (r26), 1
+EPILOGUE()
+DATASTART(approx_tab,8)
+forloop(i,256,512-1,dnl
+`	.word	eval(0x7fd00/i)
+')dnl
+	SIZE(approx_tab, 512)
+	TYPE(approx_tab, object)
+DATAEND()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/lshift.asm b/third_party/gmp/mpn/alpha/lshift.asm
new file mode 100644
index 0000000..c62a856
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/lshift.asm
@@ -0,0 +1,182 @@
+dnl  Alpha mpn_lshift -- Shift a number left.
+
+dnl  Copyright 1994, 1995, 2000, 2003, 2009 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     3.25
+C EV6:     1.75
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  n	r18
+C  cnt	r19
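+C
+C  What is computed, high limb first (a sketch; 1 <= cnt <= 63 assumed):
+C
+C      retval = up[n-1] >> (64 - cnt);       /* bits shifted out */
+C      for (i = n - 1; i > 0; i--)
+C        rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
+C      rp[0] = up[0] << cnt;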
+
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	s8addq	r18,r17,r17	C make r17 point at end of s1
+	ldq	r4,-8(r17)	C load first limb
+	subq	r31,r19,r20
+	s8addq	r18,r16,r16	C make r16 point at end of RES
+	subq	r18,1,r18
+	and	r18,4-1,r28	C number of limbs in first loop
+	srl	r4,r20,r0	C compute function result
+
+	beq	r28,L(L0)
+	subq	r18,r28,r18
+
+	ALIGN(8)
+L(top0):
+	ldq	r3,-16(r17)
+	subq	r16,8,r16
+	sll	r4,r19,r5
+	subq	r17,8,r17
+	subq	r28,1,r28
+	srl	r3,r20,r6
+	bis	r3,r3,r4
+	bis	r5,r6,r8
+	stq	r8,0(r16)
+	bne	r28,L(top0)
+
+L(L0):	sll	r4,r19,r24
+	beq	r18,L(end)
+C warm up phase 1
+	ldq	r1,-16(r17)
+	subq	r18,4,r18
+	ldq	r2,-24(r17)
+	ldq	r3,-32(r17)
+	ldq	r4,-40(r17)
+C warm up phase 2
+	srl	r1,r20,r7
+	sll	r1,r19,r21
+	srl	r2,r20,r8
+	beq	r18,L(end1)
+	ldq	r1,-48(r17)
+	sll	r2,r19,r22
+	ldq	r2,-56(r17)
+	srl	r3,r20,r5
+	bis	r7,r24,r7
+	sll	r3,r19,r23
+	bis	r8,r21,r8
+	srl	r4,r20,r6
+	ldq	r3,-64(r17)
+	sll	r4,r19,r24
+	ldq	r4,-72(r17)
+	subq	r18,4,r18
+	beq	r18,L(end2)
+	ALIGN(16)
+C main loop
+L(top):	stq	r7,-8(r16)
+	bis	r5,r22,r5
+	stq	r8,-16(r16)
+	bis	r6,r23,r6
+
+	srl	r1,r20,r7
+	subq	r18,4,r18
+	sll	r1,r19,r21
+	unop	C ldq	r31,-96(r17)
+
+	srl	r2,r20,r8
+	ldq	r1,-80(r17)
+	sll	r2,r19,r22
+	ldq	r2,-88(r17)
+
+	stq	r5,-24(r16)
+	bis	r7,r24,r7
+	stq	r6,-32(r16)
+	bis	r8,r21,r8
+
+	srl	r3,r20,r5
+	unop	C ldq	r31,-96(r17)
+	sll	r3,r19,r23
+	subq	r16,32,r16
+
+	srl	r4,r20,r6
+	ldq	r3,-96(r17)
+	sll	r4,r19,r24
+	ldq	r4,-104(r17)
+
+	subq	r17,32,r17
+	bne	r18,L(top)
+C cool down phase 2/1
+L(end2):
+	stq	r7,-8(r16)
+	bis	r5,r22,r5
+	stq	r8,-16(r16)
+	bis	r6,r23,r6
+	srl	r1,r20,r7
+	sll	r1,r19,r21
+	srl	r2,r20,r8
+	sll	r2,r19,r22
+	stq	r5,-24(r16)
+	bis	r7,r24,r7
+	stq	r6,-32(r16)
+	bis	r8,r21,r8
+	srl	r3,r20,r5
+	sll	r3,r19,r23
+	srl	r4,r20,r6
+	sll	r4,r19,r24
+C cool down phase 2/2
+	stq	r7,-40(r16)
+	bis	r5,r22,r5
+	stq	r8,-48(r16)
+	bis	r6,r23,r6
+	stq	r5,-56(r16)
+	stq	r6,-64(r16)
+C cool down phase 2/3
+	stq	r24,-72(r16)
+	ret	r31,(r26),1
+
+C cool down phase 1/1
+L(end1):
+	sll	r2,r19,r22
+	srl	r3,r20,r5
+	bis	r7,r24,r7
+	sll	r3,r19,r23
+	bis	r8,r21,r8
+	srl	r4,r20,r6
+	sll	r4,r19,r24
+C cool down phase 1/2
+	stq	r7,-8(r16)
+	bis	r5,r22,r5
+	stq	r8,-16(r16)
+	bis	r6,r23,r6
+	stq	r5,-24(r16)
+	stq	r6,-32(r16)
+	stq	r24,-40(r16)
+	ret	r31,(r26),1
+
+L(end):	stq	r24,-8(r16)
+	ret	r31,(r26),1
+EPILOGUE(mpn_lshift)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/mod_34lsub1.asm b/third_party/gmp/mpn/alpha/mod_34lsub1.asm
new file mode 100644
index 0000000..1b03b63
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/mod_34lsub1.asm
@@ -0,0 +1,164 @@
+dnl  Alpha mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     4 (?)
+C EV5:     2.67
+C EV6:     1.67
+
+
+dnl  INPUT PARAMETERS
+dnl  up		r16
+dnl  n		r17
+
+define(`l0',`r18')
+define(`l1',`r19')
+define(`l2',`r20')
+define(`a0',`r21')
+define(`a1',`r22')
+define(`a2',`r23')
+define(`c0',`r24')
+define(`c1',`r5')
+define(`c2',`r6')
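+
+dnl  The idea (a sketch): with B = 2^64 and m = 2^48 - 1 we have B == 2^16,
+dnl  B^2 == 2^32 and B^3 == 1 (mod m), so the limbs are summed into the
+dnl  three accumulators by position mod 3, with c0,c1,c2 counting the
+dnl  carry-outs, roughly
+dnl
+dnl      a[i % 3] += up[i];
+dnl      c[i % 3] += carry out of that addq;
+dnl
+dnl  before the pieces are realigned on 16-bit boundaries at $L_0 and summed.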
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	bis	r31, r31, c0
+	bis	r31, r31, c1
+	bis	r31, r31, c2
+
+	lda	r17, -3(r17)
+	bge	r17, $L_3_or_more
+	bis	r31, r31, a0
+	bis	r31, r31, a1
+	bis	r31, r31, a2
+	br	r31, $L_012
+
+$L_3_or_more:
+	ldq	a0, 0(r16)
+	ldq	a1, 8(r16)
+	ldq	a2, 16(r16)
+	lda	r16, 24(r16)
+	lda	r17, -3(r17)
+	blt	r17, $L_012
+
+$L_6_or_more:
+	ldq	l0, 0(r16)
+	ldq	l1, 8(r16)
+	ldq	l2, 16(r16)
+	addq	l0, a0, a0
+
+	lda	r16, 24(r16)
+	lda	r17, -3(r17)
+	blt	r17, $L_end
+
+	ALIGN(16)
+C Main loop
+$L_9_or_more:
+$Loop:	cmpult	a0, l0, r0
+	ldq	l0, 0(r16)
+	addq	r0, c0, c0
+	addq	l1, a1, a1
+	cmpult	a1, l1, r0
+	ldq	l1, 8(r16)
+	addq	r0, c1, c1
+	addq	l2, a2, a2
+	cmpult	a2, l2, r0
+	ldq	l2, 16(r16)
+	addq	r0, c2, c2
+	addq	l0, a0, a0
+	lda	r16, 24(r16)
+	lda	r17, -3(r17)
+	bge	r17, $Loop
+
+$L_end:	cmpult	a0, l0, r0
+	addq	r0, c0, c0
+	addq	l1, a1, a1
+	cmpult	a1, l1, r0
+	addq	r0, c1, c1
+	addq	l2, a2, a2
+	cmpult	a2, l2, r0
+	addq	r0, c2, c2
+
+C Handle the last (n mod 3) limbs
+$L_012:	lda	r17, 2(r17)
+	blt	r17, $L_0
+	ldq	l0, 0(r16)
+	addq	l0, a0, a0
+	cmpult	a0, l0, r0
+	addq	r0, c0, c0
+	beq	r17, $L_0
+	ldq	l1, 8(r16)
+	addq	l1, a1, a1
+	cmpult	a1, l1, r0
+	addq	r0, c1, c1
+
+C Align and sum our 3 main accumulators and 3 carry accumulators
+$L_0:	srl	a0, 48, r2
+	srl	a1, 32, r4
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	insll	a1, 2, r1',		C (a1 & 0xffffffff) << 16
+`	zapnot	a1, 15, r25
+	sll	r25, 16, r1')
+	zapnot	a0, 63, r0		C a0 & 0xffffffffffff
+	srl	a2, 16, a1
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	inswl	a2, 4, r3',		C (a2 & 0xffff) << 32
+`	zapnot	a2, 3, r25
+	sll	r25, 32, r3')
+	addq	r1, r4, r1
+	addq	r0, r2, r0
+	srl	c0, 32, a2
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	insll	c0, 2, r4',		C (c0 & 0xffffffff) << 16
+`	zapnot	c0, 15, r25
+	sll	r25, 16, r4')
+	addq	r0, r1, r0
+	addq	r3, a1, r3
+	addq	r0, r3, r0
+	srl	c1, 16, c0
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	inswl	c1, 4, r2',		C (c1 & 0xffff) << 32
+`	zapnot	c1, 3, r25
+	sll	r25, 32, r2')
+	addq	r4, a2, r4
+C	srl	c2, 48, r3		C This will be 0 in practice
+	zapnot	c2, 63, r1		C r1 = c2 & 0xffffffffffff
+	addq	r0, r4, r0
+	addq	r2, c0, r2
+	addq	r0, r2, r0
+C	addq	r1, r3, r1
+	addq	r0, r1, r0
+
+	ret	r31, (r26), 1
+EPILOGUE(mpn_mod_34lsub1)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/mode1o.asm b/third_party/gmp/mpn/alpha/mode1o.asm
new file mode 100644
index 0000000..96dccc7
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/mode1o.asm
@@ -0,0 +1,209 @@
+dnl  Alpha mpn_modexact_1c_odd -- mpn exact remainder
+
+dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:    47
+C EV5:    30
+C EV6:    15
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
+C                                mp_limb_t c)
+C
+C This code follows the "alternate" code in mpn/generic/mode1o.c,
+C eliminating cbit+climb from the dependent chain.  This leaves,
+C
+C        ev4   ev5   ev6
+C         1     3     1    subq   y = x - h
+C        23    13     7    mulq   q = y * inverse
+C        23    14     7    umulh  h = high (q * d)
+C        --    --    --
+C        47    30    15
+C
+C In each case, the load latency, loop control, and extra carry bit handling
+C hide under the multiply latencies.  Those latencies are long enough that
+C we don't need to worry about alignment or pairing to squeeze out
+C performance.
+C
+C For the first limb, some of the loop code is broken out and scheduled back
+C since it can be done earlier.
+C
+C   - The first ldq src[0] is near the start of the routine, for maximum
+C     time from memory.
+C
+C   - The subq y=x-climb can be done without waiting for the inverse.
+C
+C   - The mulq y*inverse is replicated after the final subq for the inverse,
+C     instead of branching to the mulq in the main loop.  On ev4 a branch
+C     there would cost cycles, but we can hide them under the mulq latency.
+C
+C For the last limb, high<divisor is tested and if that's true a subtract
+C and addback is done, as per the main mpn/generic/mode1o.c code.  This is a
+C data-dependent branch, but we're waiting for umulh so any penalty should
+C hide there.  The multiplies saved would be worth the cost anyway.
+C
+C Enhancements:
+C
+C For size==1, a plain division (done bitwise say) might be faster than
+C calculating an inverse, the latter taking about 130 cycles on ev4 or 70 on
+C ev5.  A call to gcc __remqu might be a possibility.
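+C
+C Put together, one pass over the limbs is (a sketch, ignoring the final
+C high<divisor shortcut; inv is the computed inverse, d * inv == 1 mod 2^64,
+C and umulh is the high 64 bits of a 64x64 product):
+C
+C      cbit = 0;  climb = c;
+C      for (i = 0; i < size; i++)
+C        {
+C          s = src[i];
+C          x = s - cbit;    cbit  = (s < cbit);
+C          y = x - climb;   cbit += (x < climb);
+C          q = y * inv;                 /* exact quotient limb, low 64 bits */
+C          climb = umulh (q, d);
+C        }
+C      return climb + cbit;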
+
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd,gp)
+
+	C r16	src
+	C r17	size
+	C r18	d
+	C r19	c
+
+	LEA(r0, binvert_limb_table)
+	srl	r18, 1, r20		C d >> 1
+
+	and	r20, 127, r20		C idx = d>>1 & 0x7F
+
+	addq	r0, r20, r21		C table + idx
+
+ifelse(bwx_available_p,1,
+`	ldbu	r20, 0(r21)		C table[idx], inverse 8 bits
+',`
+	ldq_u	r20, 0(r21)		C table[idx] qword
+	extbl	r20, r21, r20		C table[idx], inverse 8 bits
+')
+
+	mull	r20, r20, r7		C i*i
+	addq	r20, r20, r20		C 2*i
+
+	ldq	r2, 0(r16)		C x = s = src[0]
+	lda	r17, -1(r17)		C size--
+	clr	r0			C initial cbit=0
+
+	mull	r7, r18, r7		C i*i*d
+
+	subq	r20, r7, r20		C 2*i-i*i*d, inverse 16 bits
+
+	mull	r20, r20, r7		C i*i
+	addq	r20, r20, r20		C 2*i
+
+	mull	r7, r18, r7		C i*i*d
+
+	subq	r20, r7, r20		C 2*i-i*i*d, inverse 32 bits
+
+	mulq	r20, r20, r7		C i*i
+	addq	r20, r20, r20		C 2*i
+
+	mulq	r7, r18, r7		C i*i*d
+	subq	r2, r19, r3		C y = x - climb
+
+	subq	r20, r7, r20		C inv = 2*i-i*i*d, inverse 64 bits
+
+ASSERT(r7, C should have d*inv==1 mod 2^64
+`	mulq	r18, r20, r7
+	cmpeq	r7, 1, r7')
+
+	mulq	r3, r20, r4		C first q = y * inv
+
+	beq	r17, L(one)		C if size==1
+	br	L(entry)
+
+
+L(top):
+	C r0	cbit
+	C r16	src, incrementing
+	C r17	size, decrementing
+	C r18	d
+	C r19	climb
+	C r20	inv
+
+	ldq	r1, 0(r16)		C s = src[i]
+	subq	r1, r0, r2		C x = s - cbit
+	cmpult	r1, r0, r0		C new cbit = s < cbit
+
+	subq	r2, r19, r3		C y = x - climb
+
+	mulq	r3, r20, r4		C q = y * inv
+L(entry):
+	cmpult	r2, r19, r5		C cbit2 = x < climb
+	addq	r5, r0, r0		C cbit += cbit2
+	lda	r16, 8(r16)		C src++
+	lda	r17, -1(r17)		C size--
+
+	umulh	r4, r18, r19		C climb = q * d
+	bne	r17, L(top)		C while 2 or more limbs left
+
+
+
+	C r0	cbit
+	C r18	d
+	C r19	climb
+	C r20	inv
+
+	ldq	r1, 0(r16)		C s = src[size-1] high limb
+
+	cmpult	r1, r18, r2		C test high<divisor
+	bne	r2, L(skip)		C skip if so
+
+	C can't skip a division, repeat loop code
+
+	subq	r1, r0, r2		C x = s - cbit
+	cmpult	r1, r0, r0		C new cbit = s < cbit
+
+	subq	r2, r19, r3		C y = x - climb
+
+	mulq	r3, r20, r4		C q = y * inv
+L(one):
+	cmpult	r2, r19, r5		C cbit2 = x < climb
+	addq	r5, r0, r0		C cbit += cbit2
+
+	umulh	r4, r18, r19		C climb = q * d
+
+	addq	r19, r0, r0		C return climb + cbit
+	ret	r31, (r26), 1
+
+
+	ALIGN(8)
+L(skip):
+	C with high<divisor, the final step can be just (cbit+climb)-s and
+	C an addback of d if that underflows
+
+	addq	r19, r0, r19		C c = climb + cbit
+
+	subq	r19, r1, r2		C c - s
+	cmpult	r19, r1, r3		C c < s
+
+	addq	r2, r18, r0		C return c-s + divisor
+
+	cmoveq	r3, r2, r0		C return c-s if no underflow
+	ret	r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/mul_1.asm b/third_party/gmp/mpn/alpha/mul_1.asm
new file mode 100644
index 0000000..a7cdbcf
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/mul_1.asm
@@ -0,0 +1,102 @@
+dnl  Alpha mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     42
+C EV5:     18
+C EV6:      7
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  n	r18
+C  vl	r19
+C  cl	r20
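+C
+C  mpn_mul_1 starts with cy = 0, mpn_mul_1c with cy = cl; per limb the
+C  recurrence is (a sketch, using a 128-bit intermediate as an assumption):
+C
+C      p = (unsigned __int128) up[i] * vl + cy;
+C      rp[i] = (mp_limb_t) p;
+C      cy = (mp_limb_t) (p >> 64);
+C
+C  and the final cy is the return value.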
+
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	umulh	r2,r19,r4	C r4 = prod_high
+	beq	r18,$Le1c	C jump if size was == 1
+	ldq	r2,8(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	addq	r3,r20,r3	C r3 = cy_limb + cl
+	stq	r3,0(r16)
+	cmpult	r3,r20,r0	C r0 = carry from (cy_limb + cl)
+	bne	r18,$Loop	C jump if size was == 2
+	br	r31,$Le2
+$Le1c:	addq	r3,r20,r3	C r3 = cy_limb + cl
+	cmpult	r3,r20,r0	C r0 = carry from (cy_limb + cl)
+$Le1:	stq	r3,0(r16)
+	addq	r4,r0,r0
+	ret	r31,(r26),1
+EPILOGUE(mpn_mul_1c)
+
+PROLOGUE(mpn_mul_1)
+	ldq	r2,0(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	bic	r31,r31,r0	C clear cy_limb
+	umulh	r2,r19,r4	C r4 = prod_high
+	beq	r18,$Le1	C jump if size was == 1
+	ldq	r2,8(r17)	C r2 = s1_limb
+	lda	r18,-1(r18)	C size--
+	stq	r3,0(r16)
+	beq	r18,$Le2	C jump if size was == 2
+
+	ALIGN(8)
+$Loop:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	lda	r18,-1(r18)	C size--
+	umulh	r2,r19,r4	C r4 = prod_high
+	ldq	r2,16(r17)	C r2 = s1_limb
+	lda	r17,8(r17)	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	stq	r3,8(r16)
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	lda	r16,8(r16)	C res_ptr++
+	bne	r18,$Loop
+
+$Le2:	mulq	r2,r19,r3	C r3 = prod_low
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r4	C r4 = prod_high
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	stq	r3,8(r16)
+	addq	r4,r0,r0	C cy_limb = prod_high + cy
+	ret	r31,(r26),1
+EPILOGUE(mpn_mul_1)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/rshift.asm b/third_party/gmp/mpn/alpha/rshift.asm
new file mode 100644
index 0000000..6e1e214
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/rshift.asm
@@ -0,0 +1,180 @@
+dnl  Alpha mpn_rshift -- Shift a number right.
+
+dnl  Copyright 1994, 1995, 2000, 2009 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     3.25
+C EV6:     1.75
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  n	r18
+C  cnt	r19
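+C
+C  Mirror image of mpn_lshift, low limb first (a sketch; 1 <= cnt <= 63):
+C
+C      retval = up[0] << (64 - cnt);         /* bits shifted out */
+C      for (i = 0; i < n - 1; i++)
+C        rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt));
+C      rp[n-1] = up[n-1] >> cnt;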
+
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	ldq	r4,0(r17)	C load first limb
+	subq	r31,r19,r20
+	subq	r18,1,r18
+	and	r18,4-1,r28	C number of limbs in first loop
+	sll	r4,r20,r0	C compute function result
+
+	beq	r28,L(L0)
+	subq	r18,r28,r18
+
+	ALIGN(8)
+L(top0):
+	ldq	r3,8(r17)
+	addq	r16,8,r16
+	srl	r4,r19,r5
+	addq	r17,8,r17
+	subq	r28,1,r28
+	sll	r3,r20,r6
+	bis	r3,r3,r4
+	bis	r5,r6,r8
+	stq	r8,-8(r16)
+	bne	r28,L(top0)
+
+L(L0):	srl	r4,r19,r24
+	beq	r18,L(end)
+C warm up phase 1
+	ldq	r1,8(r17)
+	subq	r18,4,r18
+	ldq	r2,16(r17)
+	ldq	r3,24(r17)
+	ldq	r4,32(r17)
+C warm up phase 2
+	sll	r1,r20,r7
+	srl	r1,r19,r21
+	sll	r2,r20,r8
+	beq	r18,L(end1)
+	ldq	r1,40(r17)
+	srl	r2,r19,r22
+	ldq	r2,48(r17)
+	sll	r3,r20,r5
+	bis	r7,r24,r7
+	srl	r3,r19,r23
+	bis	r8,r21,r8
+	sll	r4,r20,r6
+	ldq	r3,56(r17)
+	srl	r4,r19,r24
+	ldq	r4,64(r17)
+	subq	r18,4,r18
+	beq	r18,L(end2)
+	ALIGN(16)
+C main loop
+L(top):	stq	r7,0(r16)
+	bis	r5,r22,r5
+	stq	r8,8(r16)
+	bis	r6,r23,r6
+
+	sll	r1,r20,r7
+	subq	r18,4,r18
+	srl	r1,r19,r21
+	unop	C ldq	r31,-96(r17)
+
+	sll	r2,r20,r8
+	ldq	r1,72(r17)
+	srl	r2,r19,r22
+	ldq	r2,80(r17)
+
+	stq	r5,16(r16)
+	bis	r7,r24,r7
+	stq	r6,24(r16)
+	bis	r8,r21,r8
+
+	sll	r3,r20,r5
+	unop	C ldq	r31,-96(r17)
+	srl	r3,r19,r23
+	addq	r16,32,r16
+
+	sll	r4,r20,r6
+	ldq	r3,88(r17)
+	srl	r4,r19,r24
+	ldq	r4,96(r17)
+
+	addq	r17,32,r17
+	bne	r18,L(top)
+C cool down phase 2/1
+L(end2):
+	stq	r7,0(r16)
+	bis	r5,r22,r5
+	stq	r8,8(r16)
+	bis	r6,r23,r6
+	sll	r1,r20,r7
+	srl	r1,r19,r21
+	sll	r2,r20,r8
+	srl	r2,r19,r22
+	stq	r5,16(r16)
+	bis	r7,r24,r7
+	stq	r6,24(r16)
+	bis	r8,r21,r8
+	sll	r3,r20,r5
+	srl	r3,r19,r23
+	sll	r4,r20,r6
+	srl	r4,r19,r24
+C cool down phase 2/2
+	stq	r7,32(r16)
+	bis	r5,r22,r5
+	stq	r8,40(r16)
+	bis	r6,r23,r6
+	stq	r5,48(r16)
+	stq	r6,56(r16)
+C cool down phase 2/3
+	stq	r24,64(r16)
+	ret	r31,(r26),1
+
+C cool down phase 1/1
+L(end1):
+	srl	r2,r19,r22
+	sll	r3,r20,r5
+	bis	r7,r24,r7
+	srl	r3,r19,r23
+	bis	r8,r21,r8
+	sll	r4,r20,r6
+	srl	r4,r19,r24
+C cool down phase 1/2
+	stq	r7,0(r16)
+	bis	r5,r22,r5
+	stq	r8,8(r16)
+	bis	r6,r23,r6
+	stq	r5,16(r16)
+	stq	r6,24(r16)
+	stq	r24,32(r16)
+	ret	r31,(r26),1
+
+L(end):	stq	r24,0(r16)
+	ret	r31,(r26),1
+EPILOGUE(mpn_rshift)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/sec_tabselect.asm b/third_party/gmp/mpn/alpha/sec_tabselect.asm
new file mode 100644
index 0000000..679b169
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/sec_tabselect.asm
@@ -0,0 +1,137 @@
+dnl  Alpha mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:      ?
+C EV5:      2.25
+C EV6:      1.64
+
+define(`rp',     `r16')
+define(`tp',     `r17')
+define(`n',      `r18')
+define(`nents',  `r19')
+define(`which',  `r20')
+
+define(`i',      `r21')
+define(`j',      `r22')
+define(`stride', `r23')
+define(`mask',   `r24')
+define(`k',      `r25')
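+
+C  The selection is branch-free, so timing does not depend on which entry
+C  is read: every entry is ANDed with a mask that is all ones only for the
+C  wanted one.  A sketch in C (the asm compares a down-counter against
+C  nents - which, selecting the same entry):
+C
+C      for (j = 0; j < n; j++) rp[j] = 0;
+C      for (e = 0; e < nents; e++)
+C        {
+C          mask = -(mp_limb_t) (e == which);    /* 0 or all ones */
+C          for (j = 0; j < n; j++)
+C            rp[j] |= tp[e * n + j] & mask;
+C        }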
+
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	subq	n, 4, j			C outer loop induction variable
+
+	blt	j, L(outer_end)
+L(outer_top):
+	mov	tp, r8
+	lda	r0, 0(r31)
+	lda	r1, 0(r31)
+	lda	r2, 0(r31)
+	lda	r3, 0(r31)
+	subq	j, 4, j			C outer loop induction variable
+	subq	nents, which, k
+	mov	nents, i
+
+	ALIGN(16)
+L(top):	ldq	r4, 0(tp)
+	ldq	r5, 8(tp)
+	cmpeq	k, i, mask
+	subq	i, 1, i
+	subq	r31, mask, mask
+	ldq	r6, 16(tp)
+	ldq	r7, 24(tp)
+	and	r4, mask, r4
+	and	r5, mask, r5
+	or	r0, r4, r0
+	or	r1, r5, r1
+	and	r6, mask, r6
+	and	r7, mask, r7
+	or	r2, r6, r2
+	or	r3, r7, r3
+	s8addq	n, tp, tp
+	bne	i, L(top)
+
+	stq	r0, 0(rp)
+	stq	r1, 8(rp)
+	stq	r2, 16(rp)
+	stq	r3, 24(rp)
+	addq	r8, 32, tp
+	addq	rp, 32, rp
+	bge	j, L(outer_top)
+L(outer_end):
+
+	and	n, 2, r0
+	beq	r0, L(b0x)
+L(b1x):	mov	tp, r8
+	lda	r0, 0(r31)
+	lda	r1, 0(r31)
+	subq	nents, which, k
+	mov	nents, i
+	ALIGN(16)
+L(tp2):	ldq	r4, 0(tp)
+	ldq	r5, 8(tp)
+	cmpeq	k, i, mask
+	subq	i, 1, i
+	subq	r31, mask, mask
+	and	r4, mask, r4
+	and	r5, mask, r5
+	or	r0, r4, r0
+	or	r1, r5, r1
+	s8addq	n, tp, tp
+	bne	i, L(tp2)
+	stq	r0, 0(rp)
+	stq	r1, 8(rp)
+	addq	r8, 16, tp
+	addq	rp, 16, rp
+
+L(b0x):	and	n, 1, r0
+	beq	r0, L(b00)
+L(b01):	lda	r0, 0(r31)
+	subq	nents, which, k
+	mov	nents, i
+	ALIGN(16)
+L(tp1):	ldq	r4, 0(tp)
+	cmpeq	k, i, mask
+	subq	i, 1, i
+	subq	r31, mask, mask
+	and	r4, mask, r4
+	or	r0, r4, r0
+	s8addq	n, tp, tp
+	bne	i, L(tp1)
+	stq	r0, 0(rp)
+
+L(b00):	ret	r31, (r26), 1
+EPILOGUE()
diff --git a/third_party/gmp/mpn/alpha/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/alpha/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..ee219ef
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/sqr_diag_addlsh1.asm
@@ -0,0 +1,93 @@
+dnl  Alpha mpn_sqr_diag_addlsh1.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:      ?
+C EV5:     10.2
+C EV6:      4.5
+
+C Ideally, one-way code could run at 9 c/l (limited by mulq+umulh) on ev5 and
+C about 3.75 c/l on ev6.  Two-way code could run at about 3.25 c/l on ev6.
+
+C Algorithm: We allow ourselves to propagate carry to a product high word
+C without worrying about carry out, since (B-1)^2 = B^2-2B+1 has a high word
+C of B-2, i.e., will not spill.  We can propagate carry similarly to a
+C product low word, since the problem value B-1 is a quadratic non-residue
+C mod B and our products are squares.
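+C
+C For instance, with B = 2^64 the largest square is (B-1)^2, whose high word
+C is B-2 = 0xFF...FE, so adding a carry bit cannot overflow it; and a low
+C word of B-1 would need x^2 == -1 (mod B), which has no solution.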
+
+define(`rp',	`r16')
+define(`tp',	`r17')
+define(`up',	`r18')
+define(`n',	`r19')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	ldq	r0, 0(up)
+	bis	r31, r31, r21
+	bis	r31, r31, r3
+	mulq	r0, r0, r7
+	stq	r7, 0(rp)
+	umulh	r0, r0, r6
+	lda	n, -1(n)
+
+	ALIGN(16)
+L(top):	ldq	r0, 8(up)
+	lda	up, 8(up)
+	ldq	r8, 0(tp)
+	ldq	r20, 8(tp)
+	mulq	r0, r0, r7
+	lda	tp, 16(tp)
+	sll	r8, 1, r23
+	srl	r8, 63, r22
+	or	r21, r23, r23
+	sll	r20, 1, r24
+	addq	r3, r6, r6		C cannot carry per comment above
+	or	r22, r24, r24
+	addq	r23, r6, r21
+	umulh	r0, r0, r6
+	cmpult	r21, r23, r1
+	addq	r1, r7, r7		C cannot carry per comment above
+	stq	r21, 8(rp)
+	addq	r24, r7, r22
+	stq	r22, 16(rp)
+	lda	n, -1(n)
+	cmpult	r22, r7, r3
+	srl	r20, 63, r21
+	lda	rp, 16(rp)
+	bne	n, L(top)
+
+	addq	r3, r6, r6		C cannot carry per comment above
+	addq	r21, r6, r21
+	stq	r21, 8(rp)
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/sub_n.asm b/third_party/gmp/mpn/alpha/sub_n.asm
new file mode 100644
index 0000000..1bb7226
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/sub_n.asm
@@ -0,0 +1,164 @@
+dnl  Alpha mpn_sub_n -- Subtract two limb vectors of the same length > 0
+dnl  and store difference in a third limb vector.
+
+dnl  Copyright 1995, 1999, 2000, 2005, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     ?
+C EV5:     4.75
+C EV6:     3
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r16
+dnl  s1_ptr	r17
+dnl  s2_ptr	r18
+dnl  size	r19
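+
+dnl  Per limb, the scheme the loop comments describe is (a sketch)
+dnl
+dnl      d  = u - v;                  main subtract
+dnl      r  = d - cy;                 carry subtract
+dnl      cy = (u < v) | (d < cy);     combine cy from the two subtracts
+dnl
+dnl  An OR (the bis instructions) suffices for the combine because the two
+dnl  conditions cannot both hold: d < cy needs d == 0, hence u == v.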
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+	bis	r31,r20,r25
+	br	L(com)
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	bis	r31,r31,r25		C clear cy
+L(com):	subq	r19,4,r19		C decr loop cnt
+	blt	r19,$Lend2		C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+	ldq	r0,0(r18)
+	ldq	r4,0(r17)
+	ldq	r1,8(r18)
+	ldq	r5,8(r17)
+	addq	r17,32,r17		C update s1_ptr
+	subq	r4,r0,r28		C 1st main subtract
+	ldq	r2,16(r18)
+	subq	r28,r25,r20		C 1st carry subtract
+	ldq	r3,24(r18)
+	cmpult	r4,r0,r8		C compute cy from last subtract
+	ldq	r6,-16(r17)
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	ldq	r7,-8(r17)
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	subq	r19,4,r19		C decr loop cnt
+	subq	r5,r1,r28		C 2nd main subtract
+	addq	r18,32,r18		C update s2_ptr
+	subq	r28,r25,r21		C 2nd carry subtract
+	cmpult	r5,r1,r8		C compute cy from last subtract
+	blt	r19,$Lend1		C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+	ALIGN(16)
+$Loop:	cmpult	r28,r25,r25		C compute cy from last subtract
+	ldq	r0,0(r18)
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	ldq	r1,8(r18)
+	subq	r6,r2,r28		C 3rd main subtract
+	ldq	r4,0(r17)
+	subq	r28,r25,r22		C 3rd carry subtract
+	ldq	r5,8(r17)
+	cmpult	r6,r2,r8		C compute cy from last subtract
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	stq	r21,8(r16)
+	subq	r7,r3,r28		C 4th main subtract
+	subq	r28,r25,r23		C 4th carry subtract
+	cmpult	r7,r3,r8		C compute cy from last subtract
+	cmpult	r28,r25,r25		C compute cy from last subtract
+		addq	r17,32,r17		C update s1_ptr
+	bis	r8,r25,r25		C combine cy from the two subtracts
+		addq	r16,32,r16		C update res_ptr
+	subq	r4,r0,r28		C 1st main subtract
+	ldq	r2,16(r18)
+	subq	r28,r25,r20		C 1st carry subtract
+	ldq	r3,24(r18)
+	cmpult	r4,r0,r8		C compute cy from last subtract
+	ldq	r6,-16(r17)
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	ldq	r7,-8(r17)
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	subq	r19,4,r19		C decr loop cnt
+	stq	r22,-16(r16)
+	subq	r5,r1,r28		C 2nd main subtract
+	stq	r23,-8(r16)
+	subq	r28,r25,r21		C 2nd carry subtract
+		addq	r18,32,r18		C update s2_ptr
+	cmpult	r5,r1,r8		C compute cy from last subtract
+	bge	r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1:	cmpult	r28,r25,r25		C compute cy from last subtract
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	subq	r6,r2,r28		C cy add
+	subq	r28,r25,r22		C 3rd main subtract
+	cmpult	r6,r2,r8		C compute cy from last subtract
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	stq	r21,8(r16)
+	subq	r7,r3,r28		C cy add
+	subq	r28,r25,r23		C 4th main subtract
+	cmpult	r7,r3,r8		C compute cy from last subtract
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	addq	r16,32,r16		C update res_ptr
+	stq	r22,-16(r16)
+	stq	r23,-8(r16)
+$Lend2:	addq	r19,4,r19		C restore loop cnt
+	beq	r19,$Lret
+C Start software pipeline for 2nd loop
+	ldq	r0,0(r18)
+	ldq	r4,0(r17)
+	subq	r19,1,r19
+	beq	r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+	ALIGN(16)
+$Loop0:	subq	r4,r0,r28		C main subtract
+	cmpult	r4,r0,r8		C compute cy from last subtract
+	ldq	r0,8(r18)
+	ldq	r4,8(r17)
+	subq	r28,r25,r20		C carry subtract
+	addq	r18,8,r18
+	addq	r17,8,r17
+	stq	r20,0(r16)
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	subq	r19,1,r19		C decr loop cnt
+	bis	r8,r25,r25		C combine cy from the two subtracts
+	addq	r16,8,r16
+	bne	r19,$Loop0
+$Lend0:	subq	r4,r0,r28		C main subtract
+	subq	r28,r25,r20		C carry subtract
+	cmpult	r4,r0,r8		C compute cy from last subtract
+	cmpult	r28,r25,r25		C compute cy from last subtract
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two subtracts
+
+$Lret:	bis	r25,r31,r0		C return cy
+	ret	r31,(r26),1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/submul_1.asm b/third_party/gmp/mpn/alpha/submul_1.asm
new file mode 100644
index 0000000..2b63b52
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/submul_1.asm
@@ -0,0 +1,99 @@
+dnl  Alpha mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl  the result from a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     42
+C EV5:     18
+C EV6:      7
+
+C  INPUT PARAMETERS
+C  rp	r16
+C  up	r17
+C  n	r18
+C  limb	r19
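+C
+C  Per limb (a sketch, using a 128-bit intermediate as an assumption):
+C
+C      p  = (unsigned __int128) up[i] * limb + cy;
+C      lo = (mp_limb_t) p;
+C      cy = (mp_limb_t) (p >> 64) + (rp[i] < lo);
+C      rp[i] = rp[i] - lo;
+C
+C  with the final cy returned.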
+
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	ldq	r2,0(r17)	C r2 = s1_limb
+	addq	r17,8,r17	C s1_ptr++
+	subq	r18,1,r18	C size--
+	mulq	r2,r19,r3	C r3 = prod_low
+	ldq	r5,0(r16)	C r5 = *res_ptr
+	umulh	r2,r19,r0	C r0 = prod_high
+	beq	r18,$Lend1	C jump if size was == 1
+	ldq	r2,0(r17)	C r2 = s1_limb
+	addq	r17,8,r17	C s1_ptr++
+	subq	r18,1,r18	C size--
+	subq	r5,r3,r3
+	cmpult	r5,r3,r4
+	stq	r3,0(r16)
+	addq	r16,8,r16	C res_ptr++
+	beq	r18,$Lend2	C jump if size was == 2
+
+	ALIGN(8)
+$Loop:	mulq	r2,r19,r3	C r3 = prod_low
+	ldq	r5,0(r16)	C r5 = *res_ptr
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	subq	r18,1,r18	C size--
+	umulh	r2,r19,r4	C r4 = cy_limb
+	ldq	r2,0(r17)	C r2 = s1_limb
+	addq	r17,8,r17	C s1_ptr++
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	subq	r5,r3,r3
+	cmpult	r5,r3,r5
+	stq	r3,0(r16)
+	addq	r16,8,r16	C res_ptr++
+	addq	r5,r0,r0	C combine carries
+	bne	r18,$Loop
+
+$Lend2:	mulq	r2,r19,r3	C r3 = prod_low
+	ldq	r5,0(r16)	C r5 = *res_ptr
+	addq	r4,r0,r0	C cy_limb = cy_limb + 'cy'
+	umulh	r2,r19,r4	C r4 = cy_limb
+	addq	r3,r0,r3	C r3 = cy_limb + prod_low
+	cmpult	r3,r0,r0	C r0 = carry from (cy_limb + prod_low)
+	subq	r5,r3,r3
+	cmpult	r5,r3,r5
+	stq	r3,0(r16)
+	addq	r5,r0,r0	C combine carries
+	addq	r4,r0,r0	C cy_limb = prod_high + cy
+	ret	r31,(r26),1
+$Lend1:	subq	r5,r3,r3
+	cmpult	r5,r3,r5
+	stq	r3,0(r16)
+	addq	r0,r5,r0
+	ret	r31,(r26),1
+EPILOGUE(mpn_submul_1)
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/umul.asm b/third_party/gmp/mpn/alpha/umul.asm
new file mode 100644
index 0000000..039081e
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/umul.asm
@@ -0,0 +1,44 @@
+dnl  mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
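+C Equivalent to (a sketch, assuming a 128-bit type):
+C
+C      p = (unsigned __int128) m1 * m2;
+C      *lowptr = (mp_limb_t) p;
+C      return (mp_limb_t) (p >> 64);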
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	mulq	r17, r18, r1
+	umulh	r17, r18, r0
+	stq	r1, 0(r16)
+	ret	r31, (r26), 1
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/alpha/unicos.m4 b/third_party/gmp/mpn/alpha/unicos.m4
new file mode 100644
index 0000000..e05cf5c
--- /dev/null
+++ b/third_party/gmp/mpn/alpha/unicos.m4
@@ -0,0 +1,131 @@
+divert(-1)
+
+dnl  m4 macros for alpha assembler on unicos.
+
+
+dnl  Copyright 2000, 2002-2004, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Note that none of the standard GMP_ASM_ autoconf tests are done for
+dnl  unicos, so none of the config.m4 results can be used here.
+
+dnl  No underscores on unicos
+define(`GSYM_PREFIX')
+
+define(`ASM_START',
+m4_assert_numargs(0)
+`	.ident	dummy')
+
+define(`X',
+m4_assert_numargs(1)
+`^X$1')
+
+define(`FLOAT64',
+m4_assert_numargs(2)
+`	.psect	$1@crud,data
+$1:	.t_floating $2
+	.endp')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,gp|noalign])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',gp,,
+`ifelse(`$2',noalign,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter
+')')')')dnl
+	.stack	192		; What does this mean?  Only Cray knows.
+	.psect	$1@code,code,cache
+$1::')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.endp')
+
+
+dnl  Usage: LDGP(dst,src)
+dnl
+dnl  Emit an "ldgp dst,src", but only on systems using a GOT (which unicos
+dnl  doesn't).
+
+define(LDGP,
+m4_assert_numargs(2)
+)
+
+
+dnl  Usage: EXTERN(variable_name)
+define(`EXTERN',
+m4_assert_numargs(1)
+`	.extern	$1')
+
+define(`DATASTART',
+m4_assert_numargs_range(1,2)
+`	.psect	$1@crud,data
+	ALIGN(ifelse($#,1,2,$2))
+$1:')
+
+define(`DATAEND',
+m4_assert_numargs(0)
+`	.endp')
+
+define(`ASM_END',
+m4_assert_numargs(0)
+`	.end')
+
+define(`cvttqc',
+m4_assert_numargs(-1)
+`cvttq/c')
+
+dnl  Load a symbolic address into a register
+define(`LEA',
+m4_assert_numargs(2)
+	`laum	$1,  $2(r31)
+	sll	$1,  32,   $1
+	lalm	$1,  $2($1)
+	lal	$1,  $2($1)')
+
+
+dnl  Usage: ALIGN(bytes)
+dnl
+dnl  Unicos assembler .align emits zeros, even in code segments, so disable
+dnl  aligning.
+dnl
+dnl  GCC uses a macro emitting nops until the desired alignment is reached
+dnl  (see unicosmk_file_start in alpha.c).  Could do something like that if
+dnl  we cared.  The maximum desired alignment must be established at the
+dnl  start of the section though, since of course emitting nops only
+dnl  advances relative to the section beginning.
+
+define(`ALIGN',
+m4_assert_numargs(1)
+)
+
+
+divert
diff --git a/third_party/gmp/mpn/arm/README b/third_party/gmp/mpn/arm/README
new file mode 100644
index 0000000..53c7214
--- /dev/null
+++ b/third_party/gmp/mpn/arm/README
@@ -0,0 +1,35 @@
+Copyright 2002, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for ARM processors.  It has been
+optimised mainly for Cortex-A9 and Cortex-A15, but the code in the top-level
+directory should run on all ARM processors at architecture level v4 or later.
diff --git a/third_party/gmp/mpn/arm/aors_n.asm b/third_party/gmp/mpn/arm/aors_n.asm
new file mode 100644
index 0000000..fdad9f7
--- /dev/null
+++ b/third_party/gmp/mpn/arm/aors_n.asm
@@ -0,0 +1,112 @@
+dnl  ARM mpn_add_n and mpn_sub_n
+
+dnl  Contributed to the GNU project by Robert Harley.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.5	slightly fluctuating
+C Cortex-A15	 2.25
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`CLRCY',	`cmn	r0, #0')
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`func',	mpn_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`CLRCY',	`cmp	r0, r0')
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`func',	mpn_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
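+C  Functional sketch for OPERATION_add_n (illustration only, assuming
+C  32-bit limbs; the real loop below is unrolled four ways):
+C
+C      mp_limb_t mpn_add_n (mp_limb_t *rp, const mp_limb_t *up,
+C                           const mp_limb_t *vp, mp_size_t n)
+C      {
+C        mp_limb_t cy = 0;
+C        for (mp_size_t i = 0; i < n; i++) {
+C          mp_limb_t t = up[i] + cy;     /* add incoming carry */
+C          mp_limb_t c1 = t < cy;
+C          rp[i] = t + vp[i];
+C          cy = c1 | (rp[i] < t);        /* carry out of this limb */
+C        }
+C        return cy;
+C      }
+C
+C  mpn_sub_n is identical with subtraction and borrow in place of carry.
+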
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	ldr	r12, [sp, #0]
+	stmfd	sp!, { r8, r9, lr }
+	SETCY(	r12)
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(func)
+	stmfd	sp!, { r8, r9, lr }
+	CLRCY(	r12)
+L(ent):	tst	n, #1
+	beq	L(skip1)
+	ldr	r12, [up], #4
+	ldr	lr, [vp], #4
+	ADDSUBC	r12, r12, lr
+	str	r12, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	up!, { r8, r9 }
+	ldmia	vp!, { r12, lr }
+	ADDSUBC	r8, r8, r12
+	ADDSUBC	r9, r9, lr
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+L(top):	ldmia	up!, { r4, r5, r6, r7 }
+	ldmia	vp!, { r8, r9, r12, lr }
+	ADDSUBC	r4, r4, r8
+	sub	n, n, #4
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r12
+	ADDSUBC	r7, r7, lr
+	stmia	rp!, { r4, r5, r6, r7 }
+	teq	n, #0
+	bne	L(top)
+
+	ldmfd	sp!, { r4, r5, r6, r7 }
+
+L(rtn):	RETVAL
+	ldmfd	sp!, { r8, r9, pc }
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/aorslsh1_n.asm b/third_party/gmp/mpn/arm/aorslsh1_n.asm
new file mode 100644
index 0000000..889e654
--- /dev/null
+++ b/third_party/gmp/mpn/arm/aorslsh1_n.asm
@@ -0,0 +1,167 @@
+dnl  ARM mpn_addlsh1_n and mpn_sublsh1_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	      addlsh1_n       sublsh1_n
+C	     cycles/limb     cycles/limb
+C StrongARM	 ?		 ?
+C XScale	 ?		 ?
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3.12		 3.7
+C Cortex-A15	 ?		 ?
+
+C TODO
+C  * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
+C    The sublsh1_n code could surely be tweaked; its REVCY slows things
+C    down considerably.  If two insns are really needed, it might help to
+C    separate them for better micro-parallelism.
+
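+C  Functional sketch of mpn_addlsh1_n, i.e. {rp,n} = {up,n} + 2*{vp,n}
+C  (illustration only; the real code juggles two carry-save registers):
+C
+C      mp_limb_t mpn_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up,
+C                               const mp_limb_t *vp, mp_size_t n)
+C      {
+C        mp_limb_t cys = 0, cya = 0;      /* shift carry, add carry */
+C        for (mp_size_t i = 0; i < n; i++) {
+C          mp_limb_t v2 = (vp[i] << 1) | cys;
+C          cys = vp[i] >> 31;             /* bit shifted out */
+C          mp_limb_t t = up[i] + cya;
+C          mp_limb_t c1 = t < cya;
+C          rp[i] = t + v2;
+C          cya = c1 | (rp[i] < t);
+C        }
+C        return cys + cya;                /* 0, 1 or 2 */
+C      }
+C
+C  mpn_sublsh1_n is the analogous subtraction.
+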
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_addlsh1_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, $1, #2')
+  define(`SAVECY',	`sbc	$1, $2, #0')
+  define(`RESTCY',	`cmn	$1, #1')
+  define(`REVCY',	`')
+  define(`INICYR',	`mov	$1, #0')
+  define(`r10r11',	`r11')
+  define(`func',	mpn_addlsh1_n)
+  define(`func_nc',	mpn_addlsh1_nc)')
+ifdef(`OPERATION_sublsh1_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`adc	r0, $1, #1')
+  define(`SAVECY',	`sbc	$1, $1, $1')
+  define(`RESTCY',	`cmn	$1, #1')
+  define(`REVCY',	`sbc	$1, $1, $1
+			cmn	$1, #1')
+  define(`INICYR',	`mvn	$1, #0')
+  define(`r10r11',	`r10')
+  define(`func',	mpn_sublsh1_n)
+  define(`func_nc',	mpn_sublsh1_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r10r11, r14}
+
+ifdef(`OPERATION_addlsh1_n', `
+	mvn	r11, #0
+')
+	INICYR(	r14)
+	subs	n, n, #3
+	blt	L(le2)			C carry clear on branch path
+
+	cmn	r0, #0			C clear carry
+	ldmia	vp!, {r8, r9, r10}
+	b	L(mid)
+
+L(top):	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	ldmia	vp!, {r8, r9, r10}
+	stmia	rp!, {r4, r5, r6}
+	REVCY(r14)
+	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	adcs	r10, r10, r10
+	ldmia	up!, {r4, r5, r6}
+	SAVECY(	r14, r11)
+	subs	n, n, #3
+	blt	L(exi)
+	RESTCY(	r12)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	ldmia	vp!, {r8, r9, r10}
+	stmia	rp!, {r4, r5, r6}
+	REVCY(r12)
+L(mid):	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	adcs	r10, r10, r10
+	ldmia	up!, {r4, r5, r6}
+	SAVECY(	r12, r11)
+	subs	n, n, #3
+	bge	L(top)
+
+	mov	r7, r12			C swap alternating...
+	mov	r12, r14		C ...carry-save...
+	mov	r14, r7			C ...registers
+
+L(exi):	RESTCY(	r12)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	ADDSUBC	r6, r6, r10
+	stmia	rp!, {r4, r5, r6}
+
+	REVCY(r12)
+L(le2):	tst	n, #1			C n = {-1,-2,-3} map to [2], [1], [0]
+	beq	L(e1)
+
+L(e02):	tst	n, #2
+	beq	L(rt0)
+	ldm	vp, {r8, r9}
+	adcs	r8, r8, r8
+	adcs	r9, r9, r9
+	ldm	up, {r4, r5}
+	SAVECY(	r12, r11)
+	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	ADDSUBC	r5, r5, r9
+	stm	rp, {r4, r5}
+	b	L(rt1)
+
+L(e1):	ldr	r8, [vp]
+	adcs	r8, r8, r8
+	ldr	r4, [up]
+	SAVECY(	r12, r11)
+	RESTCY(	r14)
+	ADDSUBC	r4, r4, r8
+	str	r4, [rp]
+
+L(rt1):	mov	r14, r12
+	REVCY(r12)
+L(rt0):	RETVAL(	r14)
+	pop	{r4-r10r11, r14}
+	return	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/aorsmul_1.asm b/third_party/gmp/mpn/arm/aorsmul_1.asm
new file mode 100644
index 0000000..b02fbb3
--- /dev/null
+++ b/third_party/gmp/mpn/arm/aorsmul_1.asm
@@ -0,0 +1,135 @@
+dnl  ARM mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:     ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25
+C Cortex-A15	 4
+
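+C  Functional sketch (illustration only, assuming 32-bit limbs):
+C
+C      mp_limb_t mpn_addmul_1 (mp_limb_t *rp, const mp_limb_t *up,
+C                              mp_size_t n, mp_limb_t vl)
+C      {
+C        mp_limb_t cy = 0;
+C        for (mp_size_t i = 0; i < n; i++) {
+C          unsigned long long p = (unsigned long long) up[i] * vl
+C                                 + rp[i] + cy;   /* fits in 64 bits */
+C          rp[i] = (mp_limb_t) p;
+C          cy = (mp_limb_t) (p >> 32);
+C        }
+C        return cy;
+C      }
+C
+C  mpn_submul_1 subtracts the products instead and returns the borrow.
+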
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`vl', `r3')
+define(`rl', `r12')
+define(`ul', `r6')
+define(`r',  `lr')
+
+ifdef(`OPERATION_addmul_1', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`CLRRCY',	`mov	$1, #0
+			adds	r0, r0, #0')
+  define(`RETVAL',	`adc	r0, r4, #0')
+  define(`func',	mpn_addmul_1)')
+ifdef(`OPERATION_submul_1', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`CLRRCY',	`subs	$1, r0, r0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			sub	r0, $1, r0')
+  define(`func',	mpn_submul_1)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	stmfd	sp!, { r4-r6, lr }
+	CLRRCY(	r4)
+	tst	n, #1
+	beq	L(skip1)
+	ldr	ul, [up], #4
+	ldr	rl, [rp, #0]
+	umull	r5, r4, ul, vl
+	ADDSUB	r, rl, r5
+	str	r, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldr	ul, [up], #4
+	ldr	rl, [rp, #0]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r4
+	ldr	rl, [rp, #4]
+	mov	r4, #0
+	umlal	r5, r4, ul, vl
+	str	r, [rp], #4
+	ADDSUBC	r, rl, r5
+	str	r, [rp], #4
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+
+	ldr	ul, [up], #4
+	ldr	rl, [rp, #0]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	b	L(in)
+
+L(top):	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r5
+	ldr	rl, [rp, #4]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	str	r, [rp], #4
+L(in):	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r4
+	ldr	rl, [rp, #4]
+	mov	r4, #0
+	umlal	r5, r4, ul, vl
+	str	r, [rp], #4
+	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r5
+	ldr	rl, [rp, #4]
+	mov	r5, #0
+	umlal	r4, r5, ul, vl
+	str	r, [rp], #4
+	ldr	ul, [up], #4
+	ADDSUBC	r, rl, r4
+	ldr	rl, [rp, #4]
+	mov	r4, #0
+	umlal	r5, r4, ul, vl
+	sub	n, n, #4
+	tst	n, n
+	str	r, [rp], #4
+	bne	L(top)
+
+	ADDSUBC	r, rl, r5
+	str	r, [rp]
+
+L(rtn):	RETVAL(	r4)
+	ldmfd	sp!, { r4-r6, pc }
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/arm-defs.m4 b/third_party/gmp/mpn/arm/arm-defs.m4
new file mode 100644
index 0000000..4b4fa0b
--- /dev/null
+++ b/third_party/gmp/mpn/arm/arm-defs.m4
@@ -0,0 +1,100 @@
+divert(-1)
+
+dnl  m4 macros for ARM assembler.
+
+dnl  Copyright 2001, 2012-2016, 2018-2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Standard commenting is with @; the default m4 comment character # is
+dnl  used by ARM syntax for constants, and we do not want to disable macro
+dnl  expansion in or after those.
+
+changecom(@&*$)
+
+define(`ASM_START',
+m4_assert_numargs_range(0,1)
+`ifelse($1,`neon',`.fpu	neon',
+        $1,,`',
+        1,1,`m4_error(`$0 got invalid argument $1')')')
+
+dnl  APCS register names.
+
+deflit(a1,r0)
+deflit(a2,r1)
+deflit(a3,r2)
+deflit(a4,r3)
+dnl deflit(v1,r4)
+dnl deflit(v2,r5)
+dnl deflit(v3,r6)
+dnl deflit(v4,r7)
+dnl deflit(v5,r8)
+dnl deflit(v6,r9)
+deflit(sb,r9)
+dnl deflit(v7,r10)
+deflit(sl,r10)
+deflit(fp,r11)
+deflit(ip,r12)
+dnl deflit(sp,r13)
+deflit(lr,r14)
+deflit(pc,r15)
+
+
+define(`lea_list', `')
+define(`lea_num',0)
+
+dnl  LEA(reg,gmp_symbol)
+dnl
+dnl  Load the address of gmp_symbol into a register.  The gmp_symbol must be
+dnl  either local or protected/hidden, since we assume it has a fixed distance
+dnl  from the point of use.
+
+define(`LEA',`dnl
+ldr	$1, L(ptr`'lea_num)
+ifdef(`PIC',dnl
+`dnl
+L(bas`'lea_num):dnl
+	add	$1, $1, pc`'dnl
+	m4append(`lea_list',`
+L(ptr'lea_num`):	.word	GSYM_PREFIX`'$2-L(bas'lea_num`)-8')
+	define(`lea_num', eval(lea_num+1))dnl
+',`dnl
+	m4append(`lea_list',`
+L(ptr'lea_num`):	.word	GSYM_PREFIX`'$2')
+	define(`lea_num', eval(lea_num+1))dnl
+')dnl
+')
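+
+dnl  For example (illustrative expansion, exact label numbering aside),
+dnl  LEA(r0,foo) in non-PIC mode becomes roughly
+dnl
+dnl          ldr     r0, L(ptr0)
+dnl          ...
+dnl  L(ptr0): .word  foo
+dnl
+dnl  with the pool word emitted at the next EPILOGUE.  In PIC mode the pool
+dnl  word instead holds foo-L(bas0)-8 and pc is added at L(bas0); the -8
+dnl  accounts for the two-instruction pc read-ahead of the ARM pipeline.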
+
+define(`return',`ifdef(`NOTHUMB',`mov	pc, ',`bx')')
+
+
+define(`EPILOGUE_cpu',
+`lea_list
+	SIZE(`$1',.-`$1')'
+`define(`lea_list', `')')
+
+divert
diff --git a/third_party/gmp/mpn/arm/bdiv_dbm1c.asm b/third_party/gmp/mpn/arm/bdiv_dbm1c.asm
new file mode 100644
index 0000000..b919dc4
--- /dev/null
+++ b/third_party/gmp/mpn/arm/bdiv_dbm1c.asm
@@ -0,0 +1,113 @@
+dnl  ARM mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.25
+C Cortex-A15	 2.5
+
+C TODO
+C  * Try using umlal or umaal.
+C  * Try using ldm/stm.
+
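+C  The main loop is unrolled four ways; the entry points L(fi0)..L(fi3)
+C  select where to enter it according to n mod 4, so every residue class
+C  shares the single software-pipelined loop body.
+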
+define(`qp',	  `r0')
+define(`up',	  `r1')
+define(`n',	  `r2')
+define(`bd',	  `r3')
+define(`cy',	  `sp,#0')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	push	{r4, r5, r6, r7, r8}
+	ldr	r4, [up], #4
+	ldr	r5, [sp, #20]
+	ands	r12, n, #3
+	beq	L(fi0)
+	cmp	r12, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	umull	r8, r12, r4, bd
+	ldr	r4, [up], #4
+	b	L(lo3)
+
+L(fi0):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	subs	n, n, #1
+	umull	r8, r12, r4, bd
+	bls	L(wd1)
+	ldr	r4, [up], #4
+	b	L(lo1)
+
+L(fi2):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+L(top):	ldr	r4, [up], #4
+	subs	r5, r5, r6
+	str	r5, [qp], #4
+	sbc	r5, r5, r7
+L(lo1):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	subs	r5, r5, r8
+	str	r5, [qp], #4
+	sbc	r5, r5, r12
+L(lo0):	umull	r8, r12, r4, bd
+	ldr	r4, [up], #4
+	subs	r5, r5, r6
+	str	r5, [qp], #4
+	sbc	r5, r5, r7
+L(lo3):	umull	r6, r7, r4, bd
+	ldr	r4, [up], #4
+	subs	r5, r5, r8
+	str	r5, [qp], #4
+	sbc	r5, r5, r12
+L(lo2):	subs	n, n, #4
+	umull	r8, r12, r4, bd
+	bhi	L(top)
+
+L(wd2):	subs	r5, r5, r6
+	str	r5, [qp], #4
+	sbc	r5, r5, r7
+L(wd1):	subs	r5, r5, r8
+	str	r5, [qp]
+	sbc	r0, r5, r12
+	pop	{r4, r5, r6, r7, r8}
+	return	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/bdiv_q_1.asm b/third_party/gmp/mpn/arm/bdiv_q_1.asm
new file mode 100644
index 0000000..ae395d1
--- /dev/null
+++ b/third_party/gmp/mpn/arm/bdiv_q_1.asm
@@ -0,0 +1,162 @@
+dnl  ARM v4 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb
+C               norm   unorm
+C ARM1176	13	18
+C Cortex-A5	 8	12
+C Cortex-A7	10.5	18
+C Cortex-A8	14	15
+C Cortex-A9	10	12		not measured since latest edits
+C Cortex-A15	 9	 9
+C Cortex-A53	14	20
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	-
+C v6	-
+C v6t2	-
+C v7a	-
+
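+C  Hensel (exact) division: with dinv = 1/d mod 2^32 for odd d, each
+C  iteration computes (sketch of the recurrence used in L(tpn)/L(tpu)):
+C
+C      s  = up[i] - cy            (borrow propagated via sbcs)
+C      q  = s * dinv mod 2^32     (quotient limb, stored to rp)
+C      cy = high 32 bits of q * d (via umull)
+C
+C  Divisors with trailing zeros are first reduced to the odd case using
+C  the count computed from ctz_tab.
+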
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`d',   `r3')
+define(`di_arg',  `sp[0]')		C	just mpn_pi1_bdiv_q_1
+define(`cnt_arg', `sp[4]')		C	just mpn_pi1_bdiv_q_1
+
+define(`cy',  `r7')
+define(`cnt', `r6')
+define(`tnc', `r8')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_q_1)
+	tst	d, #1
+	push	{r6-r11}
+	mov	cnt, #0
+	bne	L(inv)
+
+C count trailing zeros
+	movs	r10, d, lsl #16
+	moveq	d, d, lsr #16
+	moveq	cnt, #16
+	tst	d, #0xff
+	moveq	d, d, lsr #8
+	addeq	cnt, cnt, #8
+	LEA(	r10, ctz_tab)
+	and	r11, d, #0xff
+	ldrb	r10, [r10, r11]
+	mov	d, d, lsr r10
+	add	cnt, cnt, r10
+
+C binvert limb
+L(inv):	LEA(	r10, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r10, [r10, r12, lsr #1]
+	mul	r12, r10, r10
+	mul	r12, d, r12
+	rsb	r12, r12, r10, lsl #1
+	mul	r10, r12, r12
+	mul	r10, d, r10
+	rsb	r10, r10, r12, lsl #1	C r10 = inverse
+	b	L(pi1)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	push	{r6-r11}
+
+	ldr	cnt, [sp, #28]
+	ldr	r10, [sp, #24]
+
+L(pi1):	ldr	r11, [up], #4		C up[0]
+	cmp	cnt, #0
+	mov	cy, #0
+	bne	L(unorm)
+
+L(norm):
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(edn)
+
+	ALIGN(16)
+L(tpn):	sbcs	cy, r11, cy
+	ldr	r11, [up], #4
+	sub	n, n, #1
+	mul	r9, r10, cy
+	tst	n, n
+	umull	r12, cy, d, r9
+	str	r9, [rp], #4
+	bne	L(tpn)
+
+L(edn):	sbc	cy, r11, cy
+	mul	r9, r10, cy
+	str	r9, [rp]
+	pop	{r6-r11}
+	return	r14
+
+L(unorm):
+	rsb	tnc, cnt, #32
+	mov	r11, r11, lsr cnt
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	sbcs	cy, r9, cy		C critical path ->cy->cy->
+	sub	n, n, #1
+	mul	r9, r10, cy		C critical path ->cy->r9->
+	tst	n, n
+	umull	r12, cy, d, r9		C critical path ->r9->cy->
+	str	r9, [rp], #4
+	bne	L(tpu)
+
+L(edu):	sbc	cy, r11, cy
+	mul	r9, r10, cy
+	str	r9, [rp]
+	pop	{r6-r11}
+	return	r14
+EPILOGUE()
+
+	RODATA
+ctz_tab:
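+C  Each byte of ctz_tab holds the trailing-zero count of its index (with
+C  ctz_tab[0] = 8); the counting code earlier disposes of 16 and then 8
+C  zero bits with explicit tests and finishes with a single byte lookup.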
+	.byte	8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
diff --git a/third_party/gmp/mpn/arm/cnd_aors_n.asm b/third_party/gmp/mpn/arm/cnd_aors_n.asm
new file mode 100644
index 0000000..0479f0d
--- /dev/null
+++ b/third_party/gmp/mpn/arm/cnd_aors_n.asm
@@ -0,0 +1,134 @@
+dnl  ARM mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3
+C Cortex-A15	 2.5
+
+define(`cnd',	`r0')
+define(`rp',	`r1')
+define(`up',	`r2')
+define(`vp',	`r3')
+
+define(`n',	`r12')
+
+
+ifdef(`OPERATION_cnd_add_n', `
+	define(`ADDSUB',      adds)
+	define(`ADDSUBC',      adcs)
+	define(`INITCY',      `cmn	r0, #0')
+	define(`RETVAL',      `adc	r0, n, #0')
+	define(func,	      mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+	define(`ADDSUB',      subs)
+	define(`ADDSUBC',      sbcs)
+	define(`INITCY',      `cmp	r0, #0')
+	define(`RETVAL',      `adc	r0, n, #0
+			      rsb	r0, r0, #1')
+	define(func,	      mpn_cnd_sub_n)')
+
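+C  The condition is first widened to a full-word mask (all zeros when cnd
+C  is nonzero, all ones when it is zero) and every vp limb is cleared with
+C  bic under that mask, so the same instruction sequence executes whether
+C  or not the operation takes effect -- sketch:
+C
+C      mask  = (cnd != 0) ? 0 : ~(mp_limb_t) 0;
+C      vlimb = vp[i] & ~mask;      /* vp[i] if cnd, else 0 */
+C
+C  This data-independent control flow is what makes the function suitable
+C  for side-channel-sensitive uses.
+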
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r11}
+	ldr	n, [sp, #32]
+
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd		C conditionally set to 0xffffffff
+
+	INITCY				C really only needed for n = 0 (mod 4)
+
+	ands	r4, n, #3
+	beq	L(top)
+	cmp	r4, #2
+	bcc	L(b1)
+	beq	L(b2)
+
+L(b3):	ldm	vp!, {r4,r5,r6}
+	ldm	up!, {r8,r9,r10}
+	bic	r4, r4, cnd
+	bic	r5, r5, cnd
+	bic	r6, r6, cnd
+	ADDSUB	r8, r8, r4
+	ADDSUBC	r9, r9, r5
+	ADDSUBC	r10, r10, r6
+	stm	rp!, {r8,r9,r10}
+	sub	n, n, #3
+	teq	n, #0
+	bne	L(top)
+	b	L(end)
+
+L(b2):	ldm	vp!, {r4,r5}
+	ldm	up!, {r8,r9}
+	bic	r4, r4, cnd
+	bic	r5, r5, cnd
+	ADDSUB	r8, r8, r4
+	ADDSUBC	r9, r9, r5
+	stm	rp!, {r8,r9}
+	sub	n, n, #2
+	teq	n, #0
+	bne	L(top)
+	b	L(end)
+
+L(b1):	ldr	r4, [vp], #4
+	ldr	r8, [up], #4
+	bic	r4, r4, cnd
+	ADDSUB	r8, r8, r4
+	str	r8, [rp], #4
+	sub	n, n, #1
+	teq	n, #0
+	beq	L(end)
+
+L(top):	ldm	vp!, {r4,r5,r6,r7}
+	ldm	up!, {r8,r9,r10,r11}
+	bic	r4, r4, cnd
+	bic	r5, r5, cnd
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	ADDSUBC	r8, r8, r4
+	ADDSUBC	r9, r9, r5
+	ADDSUBC	r10, r10, r6
+	ADDSUBC	r11, r11, r7
+	sub	n, n, #4
+	stm	rp!, {r8,r9,r10,r11}
+	teq	n, #0
+	bne	L(top)
+
+L(end):	RETVAL
+	pop	{r4-r11}
+	return	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/com.asm b/third_party/gmp/mpn/arm/com.asm
new file mode 100644
index 0000000..850b10a
--- /dev/null
+++ b/third_party/gmp/mpn/arm/com.asm
@@ -0,0 +1,75 @@
+dnl  ARM mpn_com.
+
+dnl  Copyright 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.0
+C Cortex-A15	 1.75
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r3, [up], #4
+	mvn	r3, r3
+	str	r3, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	up!, { r3, r12 }		C load 2 limbs
+	mvn	r3, r3
+	mvn	r12, r12
+	stmia	rp!, { r3, r12 }		C store 2 limbs
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	stmfd	sp!, { r7, r8, r9 }		C save regs on stack
+
+L(top):	ldmia	up!, { r3, r8, r9, r12 }	C load 4 limbs
+	subs	n, n, #4
+	mvn	r3, r3
+	mvn	r8, r8
+	mvn	r9, r9
+	mvn	r12, r12
+	stmia	rp!, { r3, r8, r9, r12 }	C store 4 limbs
+	bne	L(top)
+
+	ldmfd	sp!, { r7, r8, r9 }		C restore regs from stack
+L(rtn):	return	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/copyd.asm b/third_party/gmp/mpn/arm/copyd.asm
new file mode 100644
index 0000000..bcad98d
--- /dev/null
+++ b/third_party/gmp/mpn/arm/copyd.asm
@@ -0,0 +1,84 @@
+dnl  ARM mpn_copyd.
+
+dnl  Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
+
+dnl  Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.25-1.5
+C Cortex-A15	 1.25
+
+C TODO
+C  * Consider wider unrolling.  Analogous 8-way code runs 10% faster on both A9
+C    and A15.  But it probably slows things down for 8 <= n < a few dozen.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	mov	r12, n, lsl #2
+	sub	r12, r12, #4
+	add	rp, rp, r12
+	add	up, up, r12
+
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r3, [up], #-4
+	str	r3, [rp], #-4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmda	up!, { r3,r12 }
+	stmda	rp!, { r3,r12 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+
+	push	{ r4-r5 }
+	subs	n, n, #4
+	ldmda	up!, { r3,r4,r5,r12 }
+	beq	L(end)
+
+L(top):	subs	n, n, #4
+	stmda	rp!, { r3,r4,r5,r12 }
+	ldmda	up!, { r3,r4,r5,r12 }
+	bne	L(top)
+
+L(end):	stmda	rp, { r3,r4,r5,r12 }
+	pop	{ r4-r5 }
+L(rtn):	return	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/copyi.asm b/third_party/gmp/mpn/arm/copyi.asm
new file mode 100644
index 0000000..421930f
--- /dev/null
+++ b/third_party/gmp/mpn/arm/copyi.asm
@@ -0,0 +1,79 @@
+dnl  ARM mpn_copyi.
+
+dnl  Contributed to the GNU project by Robert Harley and Torbjörn Granlund.
+
+dnl  Copyright 2003, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.25-1.5
+C Cortex-A15	 1.25
+
+C TODO
+C  * Consider wider unrolling.  Analogous 8-way code runs 10% faster on both A9
+C    and A15.  But it probably slows things down for 8 <= n < a few dozen.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r3, [up], #4
+	str	r3, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	up!, { r3,r12 }
+	stmia	rp!, { r3,r12 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+
+	push	{ r4-r5 }
+	subs	n, n, #4
+	ldmia	up!, { r3,r4,r5,r12 }
+	beq	L(end)
+
+L(top):	subs	n, n, #4
+	stmia	rp!, { r3,r4,r5,r12 }
+	ldmia	up!, { r3,r4,r5,r12 }
+	bne	L(top)
+
+L(end):	stm	rp, { r3,r4,r5,r12 }
+	pop	{ r4-r5 }
+L(rtn):	return	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/dive_1.asm b/third_party/gmp/mpn/arm/dive_1.asm
new file mode 100644
index 0000000..8bffb0c
--- /dev/null
+++ b/third_party/gmp/mpn/arm/dive_1.asm
@@ -0,0 +1,151 @@
+dnl  ARM v4 mpn_divexact_1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb       cycles/limb
+C               norm    unorm    modexact_1c_odd
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	10	12
+C Cortex-A15	 9	 9
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	-
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`d',  `r3')
+
+define(`cy', `r7')
+define(`cnt', `r6')
+define(`tnc', `r8')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+	tst	d, #1
+	push	{r4-r9}
+	mov	cnt, #0
+	bne	L(inv)
+
+C count trailing zeros
+	movs	r4, d, lsl #16
+	moveq	d, d, lsr #16
+	moveq	cnt, #16
+	tst	d, #0xff
+	moveq	d, d, lsr #8
+	addeq	cnt, cnt, #8
+	LEA(	r4, ctz_tab)
+	and	r5, d, #0xff
+	ldrb	r4, [r4, r5]
+	mov	d, d, lsr r4
+	add	cnt, cnt, r4
+
+C binvert limb
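+C  (Newton iteration for the inverse mod 2^32: starting from an 8-bit
+C  inverse taken from binvert_limb_table, each step x <- 2*x - x*x*d
+C  doubles the number of correct low bits, 8 -> 16 -> 32.  Sketch:
+C      x = binvert_limb_table[(d & 254) >> 1];
+C      x = 2*x - x*x*d;
+C      x = 2*x - x*x*d;            /* now x*d == 1 mod 2^32 */
+C  This matches the two mul/mul/rsb groups below.)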
+L(inv):	LEA(	r4, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, lsl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, lsl #1	C r4 = inverse
+
+	tst	cnt, cnt
+	ldr	r5, [up], #4		C up[0]
+	mov	cy, #0
+	bne	L(unnorm)
+
+L(norm):
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(edn)
+
+	ALIGN(16)
+L(tpn):	sbcs	cy, r5, cy
+	ldr	r5, [up], #4
+	sub	n, n, #1
+	mul	r9, r4, cy
+	tst	n, n
+	umull	r12, cy, d, r9
+	str	r9, [rp], #4
+	bne	L(tpn)
+
+L(edn):	sbc	cy, r5, cy
+	mul	r9, r4, cy
+	str	r9, [rp]
+	pop	{r4-r9}
+	return	r14
+
+L(unnorm):
+	rsb	tnc, cnt, #32
+	mov	r5, r5, lsr cnt
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r5, r12, lsl tnc
+	mov	r5, r12, lsr cnt
+	sbcs	cy, r9, cy		C critical path ->cy->cy->
+	sub	n, n, #1
+	mul	r9, r4, cy		C critical path ->cy->r9->
+	tst	n, n
+	umull	r12, cy, d, r9		C critical path ->r9->cy->
+	str	r9, [rp], #4
+	bne	L(tpu)
+
+L(edu):	sbc	cy, r5, cy
+	mul	r9, r4, cy
+	str	r9, [rp]
+	pop	{r4-r9}
+	return	r14
+EPILOGUE()
+
+	RODATA
+ctz_tab:
+	.byte	8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
+	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
diff --git a/third_party/gmp/mpn/arm/gmp-mparam.h b/third_party/gmp/mpn/arm/gmp-mparam.h
new file mode 100644
index 0000000..87eec3a
--- /dev/null
+++ b/third_party/gmp/mpn/arm/gmp-mparam.h
@@ -0,0 +1,127 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1193MHz ARM (gcc55.fsffrance.org) */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         56
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     71
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIVREM_2_THRESHOLD                   0  /* preinv always */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           41
+
+#define MUL_TOOM22_THRESHOLD                36
+#define MUL_TOOM33_THRESHOLD               125
+#define MUL_TOOM44_THRESHOLD               193
+#define MUL_TOOM6H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               418
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     125
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     176
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+
+#define SQR_BASECASE_THRESHOLD              12
+#define SQR_TOOM2_THRESHOLD                 78
+#define SQR_TOOM3_THRESHOLD                137
+#define SQR_TOOM4_THRESHOLD                212
+#define SQR_TOOM6_THRESHOLD                306
+#define SQR_TOOM8_THRESHOLD                422
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               26
+
+#define MUL_FFT_MODF_THRESHOLD             436  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    436, 5}, {     27, 6}, {     28, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {    256, 9}, {    512,10}, {   1024,11}, {   2048,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 28
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    404, 5}, {     13, 4}, {     27, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {    512,10}, \
+    {   1024,11}, {   2048,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 26
+#define SQR_FFT_THRESHOLD                 3776
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 137
+#define MULLO_MUL_N_THRESHOLD            11479
+
+#define DC_DIV_QR_THRESHOLD                150
+#define DC_DIVAPPR_Q_THRESHOLD             494
+#define DC_BDIV_QR_THRESHOLD               148
+#define DC_BDIV_Q_THRESHOLD                345
+
+#define INV_MULMOD_BNM1_THRESHOLD           70
+#define INV_NEWTON_THRESHOLD               474
+#define INV_APPR_THRESHOLD                 478
+
+#define BINV_NEWTON_THRESHOLD              542
+#define REDC_1_TO_REDC_N_THRESHOLD         117
+
+#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIVAPPR_Q_THRESHOLD            2172
+#define MUPI_DIV_QR_THRESHOLD              225
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               2089
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     197
+#define GCD_DC_THRESHOLD                   902
+#define GCDEXT_DC_THRESHOLD                650
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                20
+#define GET_STR_PRECOMPUTE_THRESHOLD        39
+#define SET_STR_DC_THRESHOLD              1045
+#define SET_STR_PRECOMPUTE_THRESHOLD      2147
diff --git a/third_party/gmp/mpn/arm/invert_limb.asm b/third_party/gmp/mpn/arm/invert_limb.asm
new file mode 100644
index 0000000..af7502d
--- /dev/null
+++ b/third_party/gmp/mpn/arm/invert_limb.asm
@@ -0,0 +1,93 @@
+dnl  ARM mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Copyright 2001, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
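+C  mpn_invert_limb computes floor((B^2 - 1)/d) - B for a normalized divisor
+C  d (top bit set), with B = 2^32 -- the reciprocal used by the preinverted
+C  division code.  approx_tab supplies roughly 9 correct bits from the top
+C  bits of d, and the arithmetic below refines them Newton-style to a full
+C  limb.  (Summary comment added for orientation; the per-step bit counts
+C  are approximate.)
+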
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+	LEA(	r2, approx_tab-512)
+	mov	r3, r0, lsr #23
+	mov	r3, r3, asl #1
+	ldrh	r3, [r3, r2]
+	mov	r1, r3, asl #17
+	mul	r12, r3, r3
+	umull	r3, r2, r12, r0
+	sub	r1, r1, r2, asl #1
+	umull	r3, r2, r1, r1
+	umull	r12, r3, r0, r3
+	umull	r2, r12, r0, r2
+	adds	r2, r2, r3
+	adc	r12, r12, #0
+	rsb	r1, r12, r1
+	mvn	r2, r2, lsr #30
+	add	r2, r2, r1, asl #2
+	umull	r12, r3, r0, r2
+	adds	r1, r12, r0
+	adc	r3, r3, r0
+	rsb	r0, r3, r2
+	return	lr
+EPILOGUE()
+
+	RODATA
+	ALIGN(2)
+approx_tab:
+	.short    0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900
+	.short    0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180
+	.short    0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0
+	.short    0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400
+	.short    0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00
+	.short    0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800
+	.short    0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280
+	.short    0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40
+	.short    0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840
+	.short    0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380
+	.short    0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0
+	.short    0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80
+	.short    0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640
+	.short    0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240
+	.short    0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80
+	.short    0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0
+	.short    0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740
+	.short    0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400
+	.short    0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0
+	.short    0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0
+	.short    0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0
+	.short    0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0
+	.short    0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500
+	.short    0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240
+	.short    0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0
+	.short    0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40
+	.short    0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00
+	.short    0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880
+	.short    0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640
+	.short    0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440
+	.short    0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200
+	.short    0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000
+ASM_END()
diff --git a/third_party/gmp/mpn/arm/logops_n.asm b/third_party/gmp/mpn/arm/logops_n.asm
new file mode 100644
index 0000000..7e04165
--- /dev/null
+++ b/third_party/gmp/mpn/arm/logops_n.asm
@@ -0,0 +1,139 @@
+dnl  ARM mpn_and_n, mpn_andn_n, mpn_nand_n, etc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb             cycles/limb
+C          and andn ior xor         nand iorn nior xnor
+C StrongARM	 ?			 ?
+C XScale	 ?			 ?
+C Cortex-A7	 ?			 ?
+C Cortex-A8	 ?			 ?
+C Cortex-A9	2.5-2.72		2.75-3
+C Cortex-A15	2.25			2.75
+
+C TODO
+C  * It seems that 2.25 c/l and 2.75 c/l are possible for A9.
+C  * Debug popping issue, see comment below.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+define(`POSTOP')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',    `mpn_and_n')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',    `mpn_andn_n')
+  define(`LOGOP',   `bic	$1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',    `mpn_nand_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',    `mpn_ior_n')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',    `mpn_iorn_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `bic	$1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',    `mpn_nior_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',    `mpn_xor_n')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',    `mpn_xnor_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+
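+C  Each variant is one ARM data-processing instruction plus an optional
+C  final mvn (POSTOP).  Operand order matters for the mixed ops; e.g. for
+C  iorn (u | ~v) there is no single instruction, so LOGOP computes
+C  bic(v,u) = v & ~u and POSTOP complements it: ~(v & ~u) = u | ~v.
+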
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{ r8, r9, r10 }
+	tst	n, #1
+	beq	L(skip1)
+	ldr	r10, [vp], #4
+	ldr	r12, [up], #4
+	LOGOP(	r12, r12, r10)
+	POSTOP(	r12)
+	str	r12, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	ldmia	vp!, { r10, r12 }
+	ldmia	up!, { r8, r9 }
+	LOGOP(	r8, r8, r10)
+	LOGOP(	r9, r9, r12)
+	POSTOP(	r8)
+	POSTOP(	r9)
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	push	{ r4, r5, r6, r7 }
+
+	ldmia	vp!, { r8, r9, r10, r12 }
+	b	L(mid)
+
+L(top):	ldmia	vp!, { r8, r9, r10, r12 }
+	POSTOP(	r4)
+	POSTOP(	r5)
+	POSTOP(	r6)
+	POSTOP(	r7)
+	stmia	rp!, { r4, r5, r6, r7 }
+L(mid):	sub	n, n, #4
+	ldmia	up!, { r4, r5, r6, r7 }
+	teq	n, #0
+	LOGOP(	r4, r4, r8)
+	LOGOP(	r5, r5, r9)
+	LOGOP(	r6, r6, r10)
+	LOGOP(	r7, r7, r12)
+	bne	L(top)
+
+	POSTOP(	r4)
+	POSTOP(	r5)
+	POSTOP(	r6)
+	POSTOP(	r7)
+	stmia	rp!, { r4, r5, r6, r7 }
+
+	pop	{ r4, r5, r6, r7 }	C popping r8-r10 here strangely fails
+
+L(rtn):	pop	{ r8, r9, r10 }
+	return	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/lshift.asm b/third_party/gmp/mpn/arm/lshift.asm
new file mode 100644
index 0000000..1d5ce0a
--- /dev/null
+++ b/third_party/gmp/mpn/arm/lshift.asm
@@ -0,0 +1,88 @@
+dnl  ARM mpn_lshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.5
+C Cortex-A15	 ?
+
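+C  Functional sketch for 0 < cnt < 32 (the code works from the top limb
+C  down, so overlap with rp >= up is allowed):
+C
+C      retval = up[n-1] >> (32 - cnt);   /* bits shifted out */
+C      for (i = n-1; i > 0; i--)
+C        rp[i] = (up[i] << cnt) | (up[i-1] >> (32 - cnt));
+C      rp[0] = up[0] << cnt;
+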
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	add	up, up, n, lsl #2
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up, #-4]!
+	add	rp, rp, n, lsl #2
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsl cnt
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #-4]!
+	b	L(mid)
+
+L(evn):	ldr	r6, [up, #-4]!
+	subs	n, n, #2
+	beq	L(end)
+
+L(top):	ldr	r8, [up, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(mid):	ldr	r6, [up, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r8, lsl cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(1):	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	return	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/lshiftc.asm b/third_party/gmp/mpn/arm/lshiftc.asm
new file mode 100644
index 0000000..e5b52df
--- /dev/null
+++ b/third_party/gmp/mpn/arm/lshiftc.asm
@@ -0,0 +1,95 @@
+dnl  ARM mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.0
+C Cortex-A15	 ?
+
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	add	up, up, n, lsl #2
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up, #-4]!
+	add	rp, rp, n, lsl #2
+	rsb	tnc, cnt, #32
+	mvn	r6, r4
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #-4]!
+	mvn	r8, r8
+	b	L(mid)
+
+L(evn):	ldr	r6, [up, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(end)
+
+L(top):	ldr	r8, [up, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(mid):	ldr	r6, [up, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(1):	mvn	r6, #0
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	return	r14
+EPILOGUE()
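
mpn_lshiftc is the same shift with the stored limbs complemented; a sketch under the same assumptions. The return value is not complemented, and the vacated low bits of rp[0] come out as ones, matching the final mvn r6, #0 / orr sequence above:

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t ref_lshiftc(uint32_t *rp, const uint32_t *up,
                                size_t n, unsigned cnt)
    {
        unsigned tnc = 32 - cnt;
        uint32_t retval = up[n - 1] >> tnc;   /* uncomplemented, like r4 */
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
        rp[0] = ~(up[0] << cnt);              /* low cnt bits become ones */
        return retval;
    }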
diff --git a/third_party/gmp/mpn/arm/mod_34lsub1.asm b/third_party/gmp/mpn/arm/mod_34lsub1.asm
new file mode 100644
index 0000000..596cd3c
--- /dev/null
+++ b/third_party/gmp/mpn/arm/mod_34lsub1.asm
@@ -0,0 +1,124 @@
+dnl  ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A5	 2.67
+C Cortex-A7	 2.35
+C Cortex-A8	 2.0
+C Cortex-A9	 1.33
+C Cortex-A15	 1.33
+C Cortex-A17	 3.34
+C Cortex-A53	 2.0
+
+define(`ap',	r0)
+define(`n',	r1)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * Write cleverer summation code.
+C  * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	push	{ r4, r5, r6, r7 }
+
+	subs	n, n, #3
+	mov	r7, #0
+	blt	L(le2)			C n <= 2
+
+	ldmia	ap!, { r2, r3, r12 }
+	subs	n, n, #3
+	blt	L(sum)			C n <= 5
+	cmn	r0, #0			C clear carry
+	sub	n, n, #3
+	b	L(mid)
+
+L(top):	adcs	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+L(mid):	ldmia	ap!, { r4, r5, r6 }
+	tst	n, n
+	sub	n, n, #3
+	bpl	L(top)
+
+	add	n, n, #3
+
+	adcs	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	movcs	r7, #1			C r7 <= 1
+
+L(sum):	cmn	n, #2
+	movlo	r4, #0
+	ldrhs	r4, [ap], #4
+	movls	r5, #0
+	ldrhi	r5, [ap], #4
+
+	adds	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, #0
+	adc	r7, r7, #0		C r7 <= 2
+
+L(sum2):
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	add	r0, r0, r7
+
+	mov	r7, r3, lsl #8
+	bic	r1, r7, #0xff000000
+	add	r0, r0, r1
+	add	r0, r0, r3, lsr #16
+
+	mov	r7, r12, lsl #16
+	bic	r1, r7, #0xff000000
+	add	r0, r0, r1
+	add	r0, r0, r12, lsr #8
+
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+
+L(le2):	cmn	n, #1
+	bne	L(1)
+	ldmia	ap!, { r2, r3 }
+	mov	r12, #0
+	b	L(sum2)
+L(1):	ldr	r2, [ap]
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+EPILOGUE()
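
The "3/4 limb" idea behind the code above: with B = 2^32 and m = 2^24 - 1, B == 2^8 (mod m), so limb i carries weight 2^(8*(i mod 3)); the three accumulators r2/r3/r12 collect the three residue classes and L(sum2) does the byte-wise folding. A C sketch of the same computation (illustrative name; like the asm it returns a value merely congruent to {ap,n} mod m, not fully reduced):

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t ref_mod_34lsub1(const uint32_t *ap, size_t n)
    {
        uint64_t r = 0;
        for (size_t i = 0; i < n; i++) {
            r += (uint64_t) ap[i] << (8 * (i % 3)); /* weight 2^(8*(i%3)) */
            r = (r & 0xffffff) + (r >> 24);         /* fold: 2^24 == 1 mod m */
        }
        return (uint32_t) r;
    }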
diff --git a/third_party/gmp/mpn/arm/mode1o.asm b/third_party/gmp/mpn/arm/mode1o.asm
new file mode 100644
index 0000000..63a7f36
--- /dev/null
+++ b/third_party/gmp/mpn/arm/mode1o.asm
@@ -0,0 +1,92 @@
+dnl  ARM mpn_modexact_1c_odd
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	10
+C Cortex-A15	 9
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	-
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`up', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cy', `r3')
+
+	.protected	binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+	stmfd	sp!, {r4, r5}
+
+	LEA(	r4, binvert_limb_table)
+
+	ldr	r5, [up], #4		C up[0]
+
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, asl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, asl #1	C r4 = inverse
+
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(end)
+
+L(top):	sbcs	cy, r5, cy
+	ldr	r5, [up], #4
+	sub	n, n, #1
+	mul	r12, r4, cy
+	tst	n, n
+	umull	r12, cy, d, r12
+	bne	L(top)
+
+L(end):	sbcs	cy, r5, cy
+	mul	r12, r4, cy
+	umull	r12, r0, d, r12
+	addcc	r0, r0, #1
+
+	ldmfd	sp!, {r4, r5}
+	return	r14
+EPILOGUE()
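
The prologue above builds d^-1 mod 2^32 from an 8-bit binvert_limb_table entry plus two Newton steps, each step doubling the number of correct low bits via x' = 2x - d*x^2. A C sketch of the same iteration; it seeds with the branch-free (3*d)^2 trick (5 correct bits, hence three steps instead of the table's two), and binvert32 is an illustrative name:

    #include <stdint.h>

    /* Inverse of an odd d modulo 2^32 by Newton iteration. */
    static uint32_t binvert32(uint32_t d)
    {
        uint32_t x = (3 * d) ^ 2;   /*  5 correct low bits */
        x = 2 * x - d * x * x;      /* 10 bits */
        x = 2 * x - d * x * x;      /* 20 bits */
        x = 2 * x - d * x * x;      /* 40 >= 32 bits: done */
        return x;                   /* x * d == 1 (mod 2^32) */
    }

The main loop then multiplies each borrow-adjusted limb by this inverse and keeps only the high half of d times that product (the umull above) as the next carry.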
diff --git a/third_party/gmp/mpn/arm/mul_1.asm b/third_party/gmp/mpn/arm/mul_1.asm
new file mode 100644
index 0000000..f7bc1bc
--- /dev/null
+++ b/third_party/gmp/mpn/arm/mul_1.asm
@@ -0,0 +1,94 @@
+dnl  ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+dnl  in a second limb vector.
+dnl  Contributed by Robert Harley.
+
+dnl  Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	6-8
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.75
+C Cortex-A15	 ?
+
+C We should rewrite this along the lines of addmul_1.asm.  That should save a
+C cycle on StrongARM, and several cycles on XScale.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
+define(`vl',`r3')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmfd	sp!, { r8, r9, lr }
+	ands	r12, n, #1
+	beq	L(skip1)
+	ldr	lr, [up], #4
+	umull	r9, r12, lr, vl
+	str	r9, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	mov	r8, r12
+	ldmia	up!, { r12, lr }
+	mov	r9, #0
+	umlal	r8, r9, r12, vl
+	mov	r12, #0
+	umlal	r9, r12, lr, vl
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3
+	beq	L(rtn)
+	stmfd	sp!, { r6, r7 }
+
+L(top):	mov	r6, r12
+	ldmia	up!, { r8, r9, r12, lr }
+	ldr	r7, [rp, #12]			C cache allocate
+	mov	r7, #0
+	umlal	r6, r7, r8, vl
+	mov	r8, #0
+	umlal	r7, r8, r9, vl
+	mov	r9, #0
+	umlal	r8, r9, r12, vl
+	mov	r12, #0
+	umlal	r9, r12, lr, vl
+	subs	n, n, #4
+	stmia	rp!, { r6, r7, r8, r9 }
+	bne	L(top)
+
+	ldmfd	sp!, { r6, r7 }
+
+L(rtn):	mov	r0, r12
+	ldmfd	sp!, { r8, r9, pc }
+EPILOGUE()
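
Reference semantics for mpn_mul_1, which the umull/umlal unrolling above must reproduce (illustrative name, 32-bit limbs assumed):

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t ref_mul_1(uint32_t *rp, const uint32_t *up,
                              size_t n, uint32_t vl)
    {
        uint32_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t t = (uint64_t) up[i] * vl + cy;  /* one umlal */
            rp[i] = (uint32_t) t;
            cy = (uint32_t) (t >> 32);
        }
        return cy;                  /* final carry limb, returned in r0 */
    }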
diff --git a/third_party/gmp/mpn/arm/neon/README b/third_party/gmp/mpn/arm/neon/README
new file mode 100644
index 0000000..79e3b48
--- /dev/null
+++ b/third_party/gmp/mpn/arm/neon/README
@@ -0,0 +1,2 @@
+This directory contains Neon code that runs and is efficient on all
+ARM CPUs that support Neon.
diff --git a/third_party/gmp/mpn/arm/neon/hamdist.asm b/third_party/gmp/mpn/arm/neon/hamdist.asm
new file mode 100644
index 0000000..2320896
--- /dev/null
+++ b/third_party/gmp/mpn/arm/neon/hamdist.asm
@@ -0,0 +1,194 @@
+dnl  ARM Neon mpn_hamdist -- mpn bit hamming distance.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.89
+C Cortex-A15	 0.95
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
+C    64 bits at a time, which will mess up in big-endian mode.  That does
+C    not matter for the bit counting itself, except perhaps for the edge
+C    loads.)
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`bp', r1)
+define(`n',  r2)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8*(2^16-1) bits, or
+C 8*(2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but one
+C which can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
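+C Spelling the bound out: 8 lanes * (2^16-1) bits = 524280 bits, and
+C 524280/32 = 16383.75, so at most 0x3fff limbs per chunk.  0x3f80 =
+C 0xfe << 6, which fits ARM's rotated 8-bit immediate encoding.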
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64   q8, #0		C clear summation register
+	vmov.i64   q9, #0		C clear summation register
+
+	tst	   n, #1
+	beq	   L(xxx0)
+	vmov.i64   d0, #0
+	vmov.i64   d20, #0
+	sub	   n, n, #1
+	vld1.32   {d0[0]}, [ap]!	C load 1 limb
+	vld1.32   {d20[0]}, [bp]!	C load 1 limb
+	veor	   d0, d0, d20
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat
+
+L(xxx0):tst	   n, #2
+	beq	   L(xx00)
+	sub	   n, n, #2
+	vld1.32    {d0}, [ap]!		C load 2 limbs
+	vld1.32    {d20}, [bp]!		C load 2 limbs
+	veor	   d0, d0, d20
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24
+
+L(xx00):tst	   n, #4
+	beq	   L(x000)
+	sub	   n, n, #4
+	vld1.32    {q0}, [ap]!		C load 4 limbs
+	vld1.32    {q10}, [bp]!		C load 4 limbs
+	veor	   q0, q0, q10
+	vcnt.8	   q12, q0
+	vpadal.u8  q8, q12
+
+L(x000):tst	   n, #8
+	beq	   L(0000)
+
+	subs	   n, n, #8
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
+	bls	   L(sum)
+
+L(gt8):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
+	veor	   q0, q0, q10
+	veor	   q1, q1, q11
+	sub	   n, n, #8
+	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	b	   L(mid)
+
+L(0000):subs	   n, n, #16
+	blo	   L(e0)
+
+	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
+	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
+	veor	   q2, q2, q14
+	veor	   q3, q3, q15
+	vcnt.8	   q12, q2
+	vcnt.8	   q13, q3
+	subs	   n, n, #16
+	blo	   L(end)
+
+L(top):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
+	veor	   q0, q0, q10
+	veor	   q1, q1, q11
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q0
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q1
+L(mid):	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
+	veor	   q2, q2, q14
+	veor	   q3, q3, q15
+	subs	   n, n, #16
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q2
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q3
+	bhs	   L(top)
+
+L(end):	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+L(sum):	veor	   q0, q0, q10
+	veor	   q1, q1, q11
+	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+	vadd.i16   q8, q8, q9
+					C we have 8 16-bit counts
+L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
+	vpaddl.u32 q8, q8		C we have 2 64-bit counts
+	vmov.32    r0, d16[0]
+	vmov.32    r1, d17[0]
+	add	   r0, r0, r1
+	bx	lr
+
+C Code for large count.  Splits operand and calls above code.
+define(`ap2', r5)
+define(`bp2', r6)
+L(gt16k):
+	push	{r4,r5,r6,r14}
+	mov	ap2, ap
+	mov	bp2, bp
+	mov	r3, n			C full count
+	mov	r4, #0			C total sum
+
+1:	mov	n, #chunksize		C count for this invocation
+	bl	L(lt16k)		C could jump deep inside code
+	add	ap2, ap2, #chunksize*4	C point at next chunk
+	add	bp2, bp2, #chunksize*4	C point at next chunk
+	add	r4, r4, r0
+	mov	ap, ap2			C put chunk pointer in place for call
+	mov	bp, bp2			C put chunk pointer in place for call
+	sub	r3, r3, #chunksize
+	cmp	r3, #chunksize
+	bhi	1b
+
+	mov	n, r3			C count for final invocation
+	bl	L(lt16k)
+	add	r0, r4, r0
+	pop	{r4,r5,r6,pc}
+EPILOGUE()
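
What the whole file computes, as plain C (ref_hamdist is an illustrative name; __builtin_popcount assumes GCC or Clang):

    #include <stddef.h>
    #include <stdint.h>

    /* Hamming distance of {ap,n} and {bp,n}: bit count of their xor. */
    static uint64_t ref_hamdist(const uint32_t *ap, const uint32_t *bp,
                                size_t n)
    {
        uint64_t sum = 0;
        for (size_t i = 0; i < n; i++)
            sum += (unsigned) __builtin_popcount(ap[i] ^ bp[i]);
        return sum;
    }

The L(gt16k) path simply applies this in 0x3f80-limb slices so the 16-bit Neon counter lanes cannot saturate.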
diff --git a/third_party/gmp/mpn/arm/neon/lorrshift.asm b/third_party/gmp/mpn/arm/neon/lorrshift.asm
new file mode 100644
index 0000000..7ebc780
--- /dev/null
+++ b/third_party/gmp/mpn/arm/neon/lorrshift.asm
@@ -0,0 +1,279 @@
+dnl  ARM Neon mpn_lshift and mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C StrongARM	 -		 -
+C XScale	 -		 -
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3		 3				Y
+C Cortex-A15	 1.5		 1.5				Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses.  All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands.  Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
+C    which might make it tricky.
+C  * Clean up and simplify.
+C  * Consider sharing most of the code for lshift and rshift, since the feed-in
+C    code, the loop, and most of the wind-down code are identical.
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+ifdef(`OPERATION_lshift',`
+	define(`IFLSH', `$1')
+	define(`IFRSH', `')
+	define(`X',`0')
+	define(`Y',`1')
+	define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+	define(`IFLSH', `')
+	define(`IFRSH', `$1')
+	define(`X',`1')
+	define(`Y',`0')
+	define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START(neon)
+	TEXT
+	ALIGN(64)
+PROLOGUE(func)
+IFLSH(`	mov	r12, n, lsl #2	')
+IFLSH(`	add	rp, rp, r12	')
+IFLSH(`	add	ap, ap, r12	')
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+ifdef(`OPERATION_lshift',`
+	vdup.32	d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	d7, r3
+	mov	r12, #-8')		C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+	rsb	r3, r3, #0		C right shift count is negative
+	vdup.32	d6, r3
+	add	r3, r3, #64		C left shift count is positive
+	vdup.32	d7, r3
+	mov	r12, #8')		C rshift pointer update offset
+
+IFLSH(`	sub	ap, ap, #8	')
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
+IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+IFLSH(`	sub	 rp, rp, #4	')
+	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
+	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(`	sub	rp, rp, #8	')
+	subs	n, n, #6
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	 n, #1
+	beq	 L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d16, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vorr	 d2, d5, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	veor	 d16, d16, d16
+IFLSH(`	add	 ap, ap, #4	')
+	vld1.32	 {d16[Y]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+IFLSH(`	add	 rp, rp, #4	')
+	vst1.32	 {d5[Y]}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vshl.u64 d1, d16, d7
+	vshl.u64 d16, d16, d6
+	vorr	 d2, d4, d1
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d16}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+')
+ifdef(`OPERATION_rshift',`
+	ldr	r4, [ap]
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsr cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+
+L(tp):	ldr	r8, [ap, #4]!
+	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(md):	ldr	r6, [ap, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+
+L(ed):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(ed1):	str	r7, [rp], #4
+	mov	r0, r4, lsl tnc
+')
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
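
The rshift half of the file is the mirror image of lshift; its C semantics (illustrative name, 32-bit limbs):

    #include <stddef.h>
    #include <stdint.h>

    /* Shift {ap,n} right by cnt bits (1 <= cnt <= 31) into {rp,n}, low
       limb first; return the bits shifted out at the bottom, left-
       justified in a limb. */
    static uint32_t ref_rshift(uint32_t *rp, const uint32_t *ap,
                               size_t n, unsigned cnt)
    {
        unsigned tnc = 32 - cnt;
        uint32_t retval = ap[0] << tnc;
        for (size_t i = 0; i + 1 < n; i++)
            rp[i] = (ap[i] >> cnt) | (ap[i + 1] << tnc);
        rp[n - 1] = ap[n - 1] >> cnt;
        return retval;
    }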
diff --git a/third_party/gmp/mpn/arm/neon/lshiftc.asm b/third_party/gmp/mpn/arm/neon/lshiftc.asm
new file mode 100644
index 0000000..f1bf0de
--- /dev/null
+++ b/third_party/gmp/mpn/arm/neon/lshiftc.asm
@@ -0,0 +1,242 @@
+dnl  ARM Neon mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C StrongARM	 -		 -
+C XScale	 -		 -
+C Cortex-A7	 ?		 ?
+C Cortex-A8	 ?		 ?
+C Cortex-A9	 3.5		 3.5				Y
+C Cortex-A15	 1.75		 1.75				Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses.  All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands.  Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
+C    which might make it tricky.
+C  * Clean up and simplify.
+C  * Consider sharing most of the code for lshift and rshift, since the feed-in
+C    code, the loop, and most of the wind-down code are identical.
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+ASM_START(neon)
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+	mov	r12, n, lsl #2
+	add	rp, rp, r12
+	add	ap, ap, r12
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+	vdup.32	d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	d7, r3
+	mov	r12, #-8		C lshift pointer update offset
+
+	sub	ap, ap, #8
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+	vmvn	 d19, d19
+	add	ap, ap, #4		C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+	sub	 rp, rp, #4
+	vst1.32	 {d4[1]}, [rp]		C store first limb, rp gets aligned
+	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+	sub	rp, rp, #8
+	subs	n, n, #6
+	vmvn	 d19, d19
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	 n, #1
+	beq	 L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vmvn	 d17, d17
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vmov.u8	 d17, #255
+	vorr	 d2, d5, d0
+	vshl.u64 d0, d17, d7
+	vorr	 d3, d4, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	vmov.u8	 d16, #0
+	add	 ap, ap, #4
+	vmvn	 d17, d17
+	vld1.32	 {d16[1]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vmvn	 d16, d16
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	add	 rp, rp, #4
+	vst1.32	 {d5[1]}, [rp]
+	vmov.32	 r0, d18[0]
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vmov.u8	 d17, #255
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vorr	 d3, d5, d0
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d3}, [rp]
+	vmov.32	 r0, d18[0]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+	mvn	r6, r4
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	mvn	r8, r8
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	mvn	r6, #0
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/neon/popcount.asm b/third_party/gmp/mpn/arm/neon/popcount.asm
new file mode 100644
index 0000000..2f8f9af
--- /dev/null
+++ b/third_party/gmp/mpn/arm/neon/popcount.asm
@@ -0,0 +1,166 @@
+dnl  ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.125
+C Cortex-A15	 0.56
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
+C    64 bits at a time, which will mess up in big-endian mode.  That does
+C    not matter for the bit counting itself, except perhaps for the edge
+C    loads.)
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`n',  r1)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8*(2^16-1) bits, or
+C 8*(2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but one
+C which can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
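+C Same bound as in neon/hamdist.asm: 8*(2^16-1)/32 = 16383.75, so at
+C most 0x3fff limbs per chunk; 0x3f80 = 0xfe << 6 is encodable as an
+C ARM rotated 8-bit immediate.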
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64   q8, #0		C clear summation register
+	vmov.i64   q9, #0		C clear summation register
+
+	tst	   n, #1
+	beq	   L(xxx0)
+	vmov.i64   d0, #0
+	sub	   n, n, #1
+	vld1.32   {d0[0]}, [ap]!	C load 1 limb
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat
+
+L(xxx0):tst	   n, #2
+	beq	   L(xx00)
+	sub	   n, n, #2
+	vld1.32    {d0}, [ap]!		C load 2 limbs
+	vcnt.8	   d24, d0
+	vpadal.u8  d16, d24
+
+L(xx00):tst	   n, #4
+	beq	   L(x000)
+	sub	   n, n, #4
+	vld1.32    {q0}, [ap]!		C load 4 limbs
+	vcnt.8	   q12, q0
+	vpadal.u8  q8, q12
+
+L(x000):tst	   n, #8
+	beq	   L(0000)
+
+	subs	   n, n, #8
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	bls	   L(sum)
+
+L(gt8):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	sub	   n, n, #8
+	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	b	   L(mid)
+
+L(0000):subs	   n, n, #16
+	blo	   L(e0)
+
+	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	vcnt.8	   q12, q2
+	vcnt.8	   q13, q3
+	subs	   n, n, #16
+	blo	   L(end)
+
+L(top):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q0
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q1
+L(mid):	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
+	subs	   n, n, #16
+	vpadal.u8  q8, q12
+	vcnt.8	   q12, q2
+	vpadal.u8  q9, q13
+	vcnt.8	   q13, q3
+	bhs	   L(top)
+
+L(end):	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+L(sum):	vcnt.8	   q12, q0
+	vcnt.8	   q13, q1
+	vpadal.u8  q8, q12
+	vpadal.u8  q9, q13
+	vadd.i16   q8, q8, q9
+					C we have 8 16-bit counts
+L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
+	vpaddl.u32 q8, q8		C we have 2 64-bit counts
+	vmov.32    r0, d16[0]
+	vmov.32    r1, d17[0]
+	add	   r0, r0, r1
+	bx	lr
+
+C Code for large count.  Splits operand and calls above code.
+define(`ap2', r2)			C caller-saves reg not used above
+L(gt16k):
+	push	{r4,r14}
+	mov	ap2, ap
+	mov	r3, n			C full count
+	mov	r4, #0			C total sum
+
+1:	mov	n, #chunksize		C count for this invocation
+	bl	L(lt16k)		C could jump deep inside code
+	add	ap2, ap2, #chunksize*4	C point at next chunk
+	add	r4, r4, r0
+	mov	ap, ap2			C put chunk pointer in place for call
+	sub	r3, r3, #chunksize
+	cmp	r3, #chunksize
+	bhi	1b
+
+	mov	n, r3			C count for final invocation
+	bl	L(lt16k)
+	add	r0, r4, r0
+	pop	{r4,pc}
+EPILOGUE()
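
A C rendering of the chunked structure, useful when following the register comments above (illustrative name; __builtin_popcount assumes GCC or Clang):

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t ref_popcount(const uint32_t *ap, size_t n)
    {
        const size_t chunk = 0x3f80;        /* the asm's chunksize */
        uint64_t total = 0;
        while (n > chunk) {                 /* mirrors L(gt16k) */
            uint32_t part = 0;              /* per-chunk sum cannot overflow */
            for (size_t i = 0; i < chunk; i++)
                part += (unsigned) __builtin_popcount(ap[i]);
            total += part;
            ap += chunk;
            n -= chunk;
        }
        for (size_t i = 0; i < n; i++)      /* final partial chunk */
            total += (unsigned) __builtin_popcount(ap[i]);
        return total;
    }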
diff --git a/third_party/gmp/mpn/arm/neon/sec_tabselect.asm b/third_party/gmp/mpn/arm/neon/sec_tabselect.asm
new file mode 100644
index 0000000..69fceb0
--- /dev/null
+++ b/third_party/gmp/mpn/arm/neon/sec_tabselect.asm
@@ -0,0 +1,140 @@
+dnl  ARM Neon mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.15
+C Cortex-A15	 0.65
+
+define(`rp',     `r0')
+define(`tp',     `r1')
+define(`n',      `r2')
+define(`nents',  `r3')
+C define(`which',  on stack)
+
+define(`i',      `r4')
+define(`j',      `r5')
+
+define(`maskq',  `q10')
+define(`maskd',  `d20')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r5}
+
+	add	  r4, sp, #8
+	vld1.32	  {d30[], d31[]}, [r4]	C 4 `which' copies
+	vmov.i32  q14, #1		C 4 copies of 1
+
+	subs	j, n, #8
+	bmi	L(outer_end)
+
+L(outer_top):
+	mov	  i, nents
+	mov	  r12, tp		C preserve tp
+	veor	  q13, q13, q13		C 4 counter copies
+	veor	  q2, q2, q2
+	veor	  q3, q3, q3
+	ALIGN(16)
+L(top):	vceq.i32  maskq, q13, q15	C compare idx copies to `which' copies
+	vld1.32	  {q0,q1}, [tp]
+	vadd.i32  q13, q13, q14
+	vbit	  q2, q0, maskq
+	vbit	  q3, q1, maskq
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(top)
+	vst1.32	  {q2,q3}, [rp]!
+	add	  tp, r12, #32		C restore tp, point to next slice
+	subs	  j, j, #8
+	bpl	  L(outer_top)
+L(outer_end):
+
+	tst	  n, #4
+	beq	  L(b0xx)
+L(b1xx):mov	  i, nents
+	mov	  r12, tp
+	veor	  q13, q13, q13
+	veor	  q2, q2, q2
+	ALIGN(16)
+L(tp4):	vceq.i32  maskq, q13, q15
+	vld1.32	  {q0}, [tp]
+	vadd.i32  q13, q13, q14
+	vbit	  q2, q0, maskq
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp4)
+	vst1.32	  {q2}, [rp]!
+	add	  tp, r12, #16
+
+L(b0xx):tst	  n, #2
+	beq	  L(b00x)
+L(b01x):mov	  i, nents
+	mov	  r12, tp
+	veor	  d26, d26, d26
+	veor	  d4, d4, d4
+	ALIGN(16)
+L(tp2):	vceq.i32  maskd, d26, d30
+	vld1.32	  {d0}, [tp]
+	vadd.i32  d26, d26, d28
+	vbit	  d4, d0, maskd
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp2)
+	vst1.32	  {d4}, [rp]!
+	add	  tp, r12, #8
+
+L(b00x):tst	  n, #1
+	beq	  L(b000)
+L(b001):mov	  i, nents
+	mov	  r12, tp
+	veor	  d26, d26, d26
+	veor	  d4, d4, d4
+	ALIGN(16)
+L(tp1):	vceq.i32  maskd, d26, d30
+	vld1.32	  {d0[0]}, [tp]
+	vadd.i32  d26, d26, d28
+	vbit	  d4, d0, maskd
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp1)
+	vst1.32	  {d4[0]}, [rp]
+
+L(b000):pop	{r4-r5}
+	bx	r14
+EPILOGUE()
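
Both sec_tabselect implementations (this Neon one and the plain ARM one later in the patch) compute the following, the point being that every table entry is read whatever `which' is. A hedged C sketch; the asm builds the mask branch-free by construction with vceq/vbit (or subs/sbc), whereas a C compiler is merely likely to do so:

    #include <stddef.h>
    #include <stdint.h>

    /* Copy entry `which' of a table of nents entries, n limbs each, into
       {rp,n}, with a memory access pattern independent of `which'. */
    static void ref_sec_tabselect(uint32_t *rp, const uint32_t *tp,
                                  size_t n, size_t nents, size_t which)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = 0;
        for (size_t k = 0; k < nents; k++) {
            uint32_t mask = 0u - (uint32_t) (k == which); /* all-ones iff hit */
            for (size_t i = 0; i < n; i++)
                rp[i] |= tp[k * n + i] & mask;
        }
    }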
diff --git a/third_party/gmp/mpn/arm/rsh1aors_n.asm b/third_party/gmp/mpn/arm/rsh1aors_n.asm
new file mode 100644
index 0000000..f2e3006
--- /dev/null
+++ b/third_party/gmp/mpn/arm/rsh1aors_n.asm
@@ -0,0 +1,124 @@
+dnl  ARM mpn_rsh1add_n and mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	3.64-3.7
+C Cortex-A15	 2.5
+
+C TODO
+C  * Not optimised.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`RSTCY',	`cmn	$1, $1')
+  define(`func',	mpn_rsh1add_n)
+  define(`func_nc',	mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`RSTCY',
+	`mvn	$2, #0x80000000
+	cmp	$2, $1')
+  define(`func',	mpn_rsh1sub_n)
+  define(`func_nc',	mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r11}
+	ldr	r4, [up], #4
+	ldr	r8, [vp], #4
+	ADDSUB	r4, r4, r8
+	movs	r12, r7, rrx
+	and	r11, r4, #1	C return value
+	subs	n, n, #4
+	blo	L(end)
+
+L(top):	ldmia	up!, {r5,r6,r7}
+	ldmia	vp!, {r8,r9,r10}
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	ADDSUBC	r6, r6, r9
+	ADDSUBC	r7, r7, r10
+	movs	r12, r7, rrx
+	movs	r6, r6, rrx
+	movs	r5, r5, rrx
+	movs	r4, r4, rrx
+	subs	n, n, #3
+	stmia	rp!, {r4,r5,r6}
+	mov	r4, r7
+	bhs	L(top)
+
+L(end):	cmn	n, #2
+	bls	L(e2)
+	ldm	up, {r5,r6}
+	ldm	vp, {r8,r9}
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	ADDSUBC	r6, r6, r9
+	movs	r12, r6, rrx
+	movs	r5, r5, rrx
+	movs	r4, r4, rrx
+	stmia	rp!, {r4,r5}
+	mov	r4, r6
+	b	L(e1)
+
+L(e2):	bne	L(e1)
+	ldr	r5, [up, #0]
+	ldr	r8, [vp, #0]
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	movs	r12, r5, rrx
+	movs	r4, r4, rrx
+	str	r4, [rp], #4
+	mov	r4, r5
+
+L(e1):	RSTCY(	r12, r1)
+	mov	r4, r4, rrx
+	str	r4, [rp, #0]
+	mov	r0, r11
+	pop	{r4-r11}
+	return	r14
+EPILOGUE()
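
Semantics of the add variant in C; the sub variant replaces the addition with a borrow-propagating subtraction (illustrative name, 32-bit limbs):

    #include <stddef.h>
    #include <stdint.h>

    /* {rp,n} = ({up,n} + {vp,n}) >> 1; return the bit shifted out at the
       bottom.  The carry out of the addition becomes the top bit of the
       result, which is what the rrx (rotate right through carry) chain
       above implements. */
    static uint32_t ref_rsh1add_n(uint32_t *rp, const uint32_t *up,
                                  const uint32_t *vp, size_t n)
    {
        uint64_t s = (uint64_t) up[0] + vp[0];
        uint32_t retval = (uint32_t) s & 1;
        uint32_t prev = (uint32_t) s;
        uint32_t cy = (uint32_t) (s >> 32);
        for (size_t i = 1; i < n; i++) {
            s = (uint64_t) up[i] + vp[i] + cy;
            cy = (uint32_t) (s >> 32);
            uint32_t cur = (uint32_t) s;
            rp[i - 1] = (prev >> 1) | (cur << 31);
            prev = cur;
        }
        rp[n - 1] = (prev >> 1) | (cy << 31);
        return retval;
    }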
diff --git a/third_party/gmp/mpn/arm/rshift.asm b/third_party/gmp/mpn/arm/rshift.asm
new file mode 100644
index 0000000..9ddbc2e
--- /dev/null
+++ b/third_party/gmp/mpn/arm/rshift.asm
@@ -0,0 +1,86 @@
+dnl  ARM mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.5
+C Cortex-A15	 ?
+
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up]
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsr cnt
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #4]!
+	b	L(mid)
+
+L(evn):	ldr	r6, [up, #4]!
+	subs	n, n, #2
+	beq	L(end)
+
+L(top):	ldr	r8, [up, #4]!
+	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(mid):	ldr	r6, [up, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+	subs	n, n, #2
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(1):	str	r7, [rp]
+	mov	r0, r4, lsl tnc
+	pop	{r4, r6, r7, r8}
+	return	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/sec_tabselect.asm b/third_party/gmp/mpn/arm/sec_tabselect.asm
new file mode 100644
index 0000000..76a412b
--- /dev/null
+++ b/third_party/gmp/mpn/arm/sec_tabselect.asm
@@ -0,0 +1,131 @@
+dnl  ARM mpn_sec_tabselect
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.33
+C Cortex-A15	 2.2
+
+C TODO
+C  * Consider using special code for small nents, either swapping the inner and
+C    outer loops, or providing a few variants with completely unrolled
+C    inner loops.
+
+define(`rp',    `r0')
+define(`tp',    `r1')
+define(`n',     `r2')
+define(`nents', `r3')
+C      which  on stack
+
+define(`i',     `r11')
+define(`j',     `r12')
+define(`c',     `r14')
+define(`mask',  `r7')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r11, r14}
+
+	subs	j, n, #3
+	bmi	L(outer_end)
+L(outer_top):
+	ldr	c, [sp, #36]
+	mov	i, nents
+	push	{tp}
+
+	mov	r8, #0
+	mov	r9, #0
+	mov	r10, #0
+
+L(top):	subs	c, c, #1
+	ldm	tp, {r4,r5,r6}
+	sbc	mask, mask, mask
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	and	r6, r6, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	orr	r10, r10, r6
+	bge	L(top)
+
+	stmia	rp!, {r8,r9,r10}
+	pop	{tp}
+	add	tp, tp, #12
+	subs	j, j, #3
+	bpl	L(outer_top)
+L(outer_end):
+
+	cmp	j, #-1
+	bne	L(n2)
+
+	ldr	c, [sp, #36]
+	mov	i, nents
+	mov	r8, #0
+	mov	r9, #0
+L(tp2):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldm	tp, {r4,r5}
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	bge	L(tp2)
+	stmia	rp, {r8,r9}
+	pop	{r4-r11, r14}
+	return	lr
+
+L(n2):	cmp	j, #-2
+	bne	L(n1)
+
+	ldr	c, [sp, #36]
+	mov	i, nents
+	mov	r8, #0
+L(tp1):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldr	r4, [tp]
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	orr	r8, r8, r4
+	bge	L(tp1)
+	str	r8, [rp]
+L(n1):	pop	{r4-r11, r14}
+	return	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/udiv.asm b/third_party/gmp/mpn/arm/udiv.asm
new file mode 100644
index 0000000..7c04789
--- /dev/null
+++ b/third_party/gmp/mpn/arm/udiv.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_udiv_qrnnd -- divide a two-limb dividend by a one-limb divisor.
+dnl  Return quotient and store remainder through a supplied pointer.
+
+dnl  Copyright 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rem_ptr',`r0')
+define(`n1',`r1')
+define(`n0',`r2')
+define(`d',`r3')
+
+C divstep -- develop one quotient bit.  Dividend in $1$2, divisor in $3.
+C Quotient bit is shifted into $2.
+define(`divstep',
+       `adcs	$2, $2, $2
+	adc	$1, $1, $1
+	cmp	$1, $3
+	subcs	$1, $1, $3')
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	mov	r12, #8			C loop counter for both loops below
+	cmp	d, #0x80000000		C check divisor msb and clear carry
+	bcs	L(_large_divisor)
+
+L(oop):	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	sub	r12, r12, #1
+	teq	r12, #0
+	bne	L(oop)
+
+	str	n1, [rem_ptr]		C store remainder
+	adc	r0, n0, n0		C quotient: add last carry from divstep
+	return	lr
+
+L(_large_divisor):
+	stmfd	sp!, { r8, lr }
+
+	and	r8, n0, #1		C save lsb of dividend
+	mov	lr, n1, lsl #31
+	orrs	n0, lr, n0, lsr #1	C n0 = lo(n1n0 >> 1)
+	mov	n1, n1, lsr #1		C n1 = hi(n1n0 >> 1)
+
+	and	lr, d, #1		C save lsb of divisor
+	movs	d, d, lsr #1		C d = floor(orig_d / 2)
+	adc	d, d, #0		C d = ceil(orig_d / 2)
+
+L(oop2):
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	sub	r12, r12, #1
+	teq	r12, #0
+	bne	L(oop2)
+
+	adc	n0, n0, n0		C shift and add last carry from divstep
+	add	n1, r8, n1, lsl #1	C shift in omitted dividend lsb
+	tst	lr, lr			C test saved divisor lsb
+	beq	L(_even_divisor)
+
+	rsb	d, lr, d, lsl #1	C restore orig d value
+	adds	n1, n1, n0		C fix remainder for omitted divisor lsb
+	addcs	n0, n0, #1		C adjust quotient if rem. fix carried
+	subcs	n1, n1, d		C adjust remainder accordingly
+	cmp	n1, d			C remainder >= divisor?
+	subcs	n1, n1, d		C adjust remainder
+	addcs	n0, n0, #1		C adjust quotient
+
+L(_even_divisor):
+	str	n1, [rem_ptr]		C store remainder
+	mov	r0, n0			C quotient
+	ldmfd	sp!, { r8, pc }
+EPILOGUE(mpn_udiv_qrnnd)
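
divstep is one iteration of restoring division. A C sketch that covers both paths at once by tracking the 33rd bit explicitly (precondition n1 < d, as for udiv_qrnnd generally; illustrative name):

    #include <stdint.h>

    static uint32_t ref_udiv_qrnnd(uint32_t *rem, uint32_t n1, uint32_t n0,
                                   uint32_t d)
    {
        uint32_t q = 0;
        for (int i = 0; i < 32; i++) {
            uint32_t top = n1 >> 31;        /* bit about to leave n1 */
            n1 = (n1 << 1) | (n0 >> 31);    /* shift dividend pair left */
            n0 <<= 1;
            if (top | (n1 >= d)) {          /* 33-bit compare against d */
                n1 -= d;                    /* restoring subtract */
                q |= 1u << (31 - i);        /* one quotient bit, MSB first */
            }
        }
        *rem = n1;
        return q;
    }

In the asm the quotient bit lands in the carry flag, and the next divstep's adcs shifts it into n0 itself, so after 32 steps (plus the final adc) n0 has become the quotient; the L(_large_divisor) path halves dividend and divisor instead of tracking a 33rd bit.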
diff --git a/third_party/gmp/mpn/arm/v5/gcd_11.asm b/third_party/gmp/mpn/arm/v5/gcd_11.asm
new file mode 100644
index 0000000..3c2b48f
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v5/gcd_11.asm
@@ -0,0 +1,70 @@
+dnl  ARM v5 mpn_gcd_11.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/bit (approx)
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A5	 6.45	obsolete
+C Cortex-A7	 6.41	obsolete
+C Cortex-A8	 5.0	obsolete
+C Cortex-A9	 5.9	obsolete
+C Cortex-A15	 4.40	obsolete
+C Cortex-A17	 5.68	obsolete
+C Cortex-A53	 4.37	obsolete
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+define(`u0',    `r0')
+define(`v0',    `r1')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	subs	r3, u0, v0	C			0
+	beq	L(end)		C
+
+	ALIGN(16)
+L(top):	sub	r2, v0, u0	C			0,5
+	and	r12, r2, r3	C			1
+	clz	r12, r12	C			2
+	rsb	r12, r12, #31	C			3
+	rsbcc	r3, r3, #0	C v = abs(u-v), even	1
+	movcs	u0, v0		C u = min(u,v)		1
+	lsr	v0, r3, r12	C			4
+	subs	r3, u0, v0	C			5
+	bne	L(top)		C
+
+L(end):	bx	lr
+EPILOGUE()
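
The loop is a branch-light binary GCD step for two odd operands (as the callers arrange): replace (u,v) by (min(u,v), |u-v| with its trailing zeros stripped) until the difference is zero. The shift count comes out as 31 - clz(r2 & r3), since r2 and r3 hold v-u and u-v, and t & -t isolates the lowest set bit of t. In C (illustrative name; __builtin_ctz assumes GCC or Clang):

    #include <stdint.h>

    /* gcd of two odd 32-bit values. */
    static uint32_t ref_gcd_11(uint32_t u, uint32_t v)
    {
        while (u != v) {
            uint32_t t = (u > v) ? u - v : v - u;  /* even, since u,v odd */
            if (u > v)
                u = v;                             /* u = min(u,v) */
            v = t >> __builtin_ctz(t);             /* strip factors of 2 */
        }
        return u;
    }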
diff --git a/third_party/gmp/mpn/arm/v5/mod_1_1.asm b/third_party/gmp/mpn/arm/v5/mod_1_1.asm
new file mode 100644
index 0000000..3cf0cd7
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v5/mod_1_1.asm
@@ -0,0 +1,129 @@
+dnl  ARM mpn_mod_1_1p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 7
+C Cortex-A15	 6
+
+define(`ap', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1_1p)
+	push	{r4-r10}
+	add	r0, r0, r1, asl #2
+	ldr	r5, [r0, #-4]!
+	ldr	r12, [r0, #-4]!
+	subs	r1, r1, #2
+	ble	L(4)
+	ldr	r8, [r3, #12]
+	mov	r4, r12
+	mov	r10, r5
+	umull	r7, r5, r10, r8
+	sub	r1, r1, #1
+	b	L(mid)
+
+L(top):	adds	r12, r6, r7
+	adcs	r10, r4, r5
+	sub	r1, r1, #1
+	mov	r6, #0
+	movcs	r6, r8
+	umull	r7, r5, r10, r8
+	adds	r4, r12, r6
+	subcs	r4, r4, r2
+L(mid):	ldr	r6, [r0, #-4]!
+	teq	r1, #0
+	bne	L(top)
+
+	adds	r12, r6, r7
+	adcs	r5, r4, r5
+	subcs	r5, r5, r2
+L(4):	ldr	r1, [r3, #4]
+	cmp	r1, #0
+	beq	L(7)
+	ldr	r4, [r3, #8]
+	umull	r0, r6, r5, r4
+	adds	r12, r0, r12
+	addcs	r6, r6, #1
+	rsb	r0, r1, #32
+	mov	r0, r12, lsr r0
+	orr	r5, r0, r6, asl r1
+	mov	r12, r12, asl r1
+	b	L(8)
+L(7):	cmp	r5, r2
+	subcs	r5, r5, r2
+L(8):	ldr	r0, [r3, #0]
+	umull	r4, r3, r5, r0
+	add	r5, r5, #1
+	adds	r0, r4, r12
+	adc	r5, r3, r5
+	mul	r5, r2, r5
+	sub	r12, r12, r5
+	cmp	r12, r0
+	addhi	r12, r12, r2
+	cmp	r2, r12
+	subls	r12, r12, r2
+	mov	r0, r12, lsr r1
+	pop	{r4-r10}
+	bx	r14
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+	stmfd	sp!, {r4, r5, r6, r14}
+	mov	r5, r0
+	clz	r4, r1
+	mov	r0, r1, asl r4
+	rsb	r6, r0, #0
+	bl	mpn_invert_limb
+	str	r0, [r5, #0]
+	str	r4, [r5, #4]
+	cmp	r4, #0
+	beq	L(2)
+	rsb	r1, r4, #32
+	mov	r3, #1
+	mov	r3, r3, asl r4
+	orr	r3, r3, r0, lsr r1
+	mul	r3, r6, r3
+	mov	r4, r3, lsr r4
+	str	r4, [r5, #8]
+L(2):	mul	r0, r6, r0
+	str	r0, [r5, #12]
+	ldmfd	sp!, {r4, r5, r6, pc}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v5/mod_1_2.asm b/third_party/gmp/mpn/arm/v5/mod_1_2.asm
new file mode 100644
index 0000000..aa26ecb
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v5/mod_1_2.asm
@@ -0,0 +1,156 @@
+dnl  ARM mpn_mod_1s_2p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.25
+C Cortex-A15	 3
+
+define(`ap', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_2p)
+	push	{r4-r10}
+	tst	n, #1
+	add	r7, r3, #8
+	ldmia	r7, {r7, r8, r12}	C load B1, B2, B3
+	add	ap, ap, n, lsl #2	C put ap at operand end
+	beq	L(evn)
+
+L(odd):	subs	n, n, #1
+	beq	L(1)
+	ldmdb	ap!, {r4,r6,r9}
+	mov	r10, #0
+	umlal	r4, r10, r6, r7
+	umlal	r4, r10, r9, r8
+	b	L(com)
+
+L(evn):	ldmdb	ap!, {r4,r10}
+L(com):	subs	n, n, #2
+	ble	L(end)
+	ldmdb	ap!, {r5,r6}
+	b	L(mid)
+
+L(top):	mov	r9, #0
+	umlal	r5, r9, r6, r7		C B1
+	umlal	r5, r9, r4, r8		C B2
+	ldmdb	ap!, {r4,r6}
+	umlal	r5, r9, r10, r12	C B3
+	ble	L(xit)
+	mov	r10, #0
+	umlal	r4, r10, r6, r7		C B1
+	umlal	r4, r10, r5, r8		C B2
+	ldmdb	ap!, {r5,r6}
+	umlal	r4, r10, r9, r12	C B3
+L(mid):	subs	n, n, #4
+	bge	L(top)
+
+	mov	r9, #0
+	umlal	r5, r9, r6, r7		C B1
+	umlal	r5, r9, r4, r8		C B2
+	umlal	r5, r9, r10, r12	C B3
+	mov	r4, r5
+
+L(end):	movge	r9, r10			C executed iff coming via xit
+	ldr	r6, [r3, #4]		C cps[1] = cnt
+	mov	r5, #0
+	umlal	r4, r5, r9, r7
+	mov	r7, r5, lsl r6
+L(x):	rsb	r1, r6, #32
+	orr	r8, r7, r4, lsr r1
+	mov	r9, r4, lsl r6
+	ldr	r5, [r3, #0]
+	add	r0, r8, #1
+	umull	r12, r1, r8, r5
+	adds	r4, r12, r9
+	adc	r1, r1, r0
+	mul	r5, r2, r1
+	sub	r9, r9, r5
+	cmp	r9, r4
+	addhi	r9, r9, r2
+	cmp	r2, r9
+	subls	r9, r9, r2
+	mov	r0, r9, lsr r6
+	pop	{r4-r10}
+	bx	r14
+
+L(xit):	mov	r10, #0
+	umlal	r4, r10, r6, r7		C B1
+	umlal	r4, r10, r5, r8		C B2
+	umlal	r4, r10, r9, r12	C B3
+	b	L(end)
+
+L(1):	ldr	r6, [r3, #4]		C cps[1] = cnt
+	ldr	r4, [ap, #-4]		C ap[0]
+	mov	r7, #0
+	b	L(x)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_2p_cps)
+	push	{r4-r8, r14}
+	clz	r4, r1
+	mov	r5, r1, lsl r4		C b <<= cnt
+	mov	r6, r0			C r6 = cps
+	mov	r0, r5
+	bl	mpn_invert_limb
+	rsb	r3, r4, #32
+	mov	r3, r0, lsr r3
+	mov	r2, #1
+	orr	r3, r3, r2, lsl r4
+	rsb	r1, r5, #0
+	mul	r2, r1, r3
+	umull	r3, r12, r2, r0
+	add	r12, r2, r12
+	mvn	r12, r12
+	mul	r1, r5, r12
+	cmp	r1, r3
+	addhi	r1, r1, r5
+	umull	r12, r7, r1, r0
+	add	r7, r1, r7
+	mvn	r7, r7
+	mul	r3, r5, r7
+	cmp	r3, r12
+	addhi	r3, r3, r5
+	mov	r5, r2, lsr r4
+	mov	r7, r1, lsr r4
+	mov	r8, r3, lsr r4
+	stmia	r6, {r0,r4,r5,r7,r8}	C fill cps
+	pop	{r4-r8, pc}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/addmul_1.asm b/third_party/gmp/mpn/arm/v6/addmul_1.asm
new file mode 100644
index 0000000..a38af58
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/addmul_1.asm
@@ -0,0 +1,112 @@
+dnl  ARM mpn_addmul_1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C ARM11		 6.4
+C Cortex-A7	 5.25
+C Cortex-A8	 7
+C Cortex-A9	 3.25
+C Cortex-A15	 4
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3
+	mov	r12, #0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4
+	ldr	r7, [rp], #4
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4
+	ldr	r7, [rp], #12
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	ldr	r7, [rp, #4]
+	ldr	r4, [up], #4
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)
+
+	ldr	r6, [rp, #-8]
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/addmul_2.asm b/third_party/gmp/mpn/arm/v6/addmul_2.asm
new file mode 100644
index 0000000..69d0b8f
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/addmul_2.asm
@@ -0,0 +1,125 @@
+dnl  ARM mpn_addmul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C ARM11		 4.68
+C Cortex-A5	 3.63
+C Cortex-A7	 3.65
+C Cortex-A8	 4.0
+C Cortex-A9	 2.25
+C Cortex-A15	 2.5
+C Cortex-A17	 2.13
+C Cortex-A53	 3.5
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	push	{ r4-r9 }
+
+	ldrd	v0, v1, [vp, #0]
+	mov	cya, #0
+	mov	cyb, #0
+
+	tst	n, #1
+	beq	L(evn)
+
+L(odd):	ldr	u1, [up, #0]
+	ldr	r4, [rp, #0]
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #8
+	sub	rp, rp, #8
+	b	L(lo3)
+L(fi1):	sub	n, n, #1
+	b	L(top)
+
+L(evn):	ldr	u0, [up, #0]
+	ldr	r5, [rp, #0]
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #4
+	sub	rp, rp, #4
+	b	L(lo0)
+L(fi2):	sub	up, up, #12
+	sub	rp, rp, #12
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+L(lo0):	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+L(lo3):	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+L(lo2):	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	mov	r0, cyb
+
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/addmul_3.asm b/third_party/gmp/mpn/arm/v6/addmul_3.asm
new file mode 100644
index 0000000..d1490cd
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/addmul_3.asm
@@ -0,0 +1,191 @@
+dnl  ARM mpn_addmul_3.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C ARM11		 4.33
+C Cortex-A5	 3.28
+C Cortex-A7	 3.25
+C Cortex-A8	 3.17
+C Cortex-A9	 2.125
+C Cortex-A15	 2
+C Cortex-A17	 2.11
+C Cortex-A53	 4.18
+
+C TODO
+C  * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
+C    avoiding the current multiply.
+C  * Start the first multiply or multiplies early.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r4')  define(`v1',`r5')  define(`v2',`r6')
+define(`u0',`r3')  define(`u1',`r14')
+define(`w0',`r7')  define(`w1',`r8')  define(`w2',`r9')
+define(`cy0',`r10')  define(`cy1',`r11') define(`cy2',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	push	{ r4-r11, r14 }
+
+	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
+	ldm	vp, { v0,v1,v2 }
+	mov	cy0, #0
+	mov	cy1, #0
+	mov	cy2, #0
+
+C Tricky n mod 6
+	mul	w0, w0, n		C n * 3^{-1} mod 2^32
+	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
+	sub	n, n, #3
+ifdef(`PIC',`
+	add	pc, pc, w0, ror $28
+	nop
+	b	L(b0)
+	b	L(b2)
+	b	L(b4)
+	.word	0xe7f000f0	C udf
+	b	L(b3)
+	b	L(b5)
+	b	L(b1)
+',`
+	ldr	pc, [pc, w0, ror $28]
+	nop
+	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
+')
+
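+C  A C sketch of the dispatch above (names ours; relies on n being a
+C  small limb count, well below 2^30):
+C
+C	uint32_t w = n * 0xaaaaaaab;	/* n * 3^{-1} mod 2^32 */
+C	w &= 0xc0000001;		/* bit 0 is n mod 2; bits 31-30
+C					   encode n mod 3, since n*inv3
+C					   lands near 0, B/3 or 2B/3
+C					   according as n mod 3 = 0, 2, 1 */
+C	unsigned idx = ((w & 1) << 2) | (w >> 30);  /* the ror #28 index */
+C	/* idx -> L(b0) L(b2) L(b4) udf L(b3) L(b5) L(b1),
+C	   i.e. n mod 6 = 0, 2, 4, -, 3, 5, 1; index 3 cannot occur */
+C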
+L(b5):	add	up, up, #-8
+	ldr	w1, [rp, #0]
+	ldr	w2, [rp, #4]
+	ldr	u1, [up, #8]
+	b	L(lo5)
+
+L(b4):	add	rp, rp, #-4
+	add	up, up, #-12
+	ldr	w2, [rp, #4]
+	ldr	w0, [rp, #8]
+	ldr	u0, [up, #12]
+	b	L(lo4)
+
+L(b3):	add	rp, rp, #-8
+	add	up, up, #-16
+	ldr	w0, [rp, #8]
+	ldr	w1, [rp, #12]
+	ldr	u1, [up, #16]
+	b	L(lo3)
+
+L(b1):	add	rp, rp, #8
+	ldr	w2, [rp, #-8]
+	ldr	w0, [rp, #-4]
+	ldr	u1, [up, #0]
+	b	L(lo1)
+
+L(b0):	add	rp, rp, #4
+	add	up, up, #-4
+	ldr	w0, [rp, #-4]
+	ldr	w1, [rp, #0]
+	ldr	u0, [up, #4]
+	b	L(lo0)
+
+L(b2):	add	rp, rp, #12
+	add	up, up, #4
+	ldr	w1, [rp, #-12]
+	ldr	w2, [rp, #-8]
+	ldr	u0, [up, #-4]
+
+	ALIGN(16)
+L(top):	ldr	w0, [rp, #-4]
+	umaal	w1, cy0, u0, v0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1
+	str	w1, [rp, #-12]
+	umaal	w0, cy2, u0, v2
+L(lo1):	ldr	w1, [rp, #0]
+	umaal	w2, cy0, u1, v0
+	ldr	u0, [up, #4]
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	w1, cy2, u1, v2
+L(lo0):	ldr	w2, [rp, #4]
+	umaal	w0, cy0, u0, v0
+	ldr	u1, [up, #8]
+	umaal	w1, cy1, u0, v1
+	str	w0, [rp, #-4]
+	umaal	w2, cy2, u0, v2
+L(lo5):	ldr	w0, [rp, #8]
+	umaal	w1, cy0, u1, v0
+	ldr	u0, [up, #12]
+	umaal	w2, cy1, u1, v1
+	str	w1, [rp, #0]
+	umaal	w0, cy2, u1, v2
+L(lo4):	ldr	w1, [rp, #12]
+	umaal	w2, cy0, u0, v0
+	ldr	u1, [up, #16]
+	umaal	w0, cy1, u0, v1
+	str	w2, [rp, #4]
+	umaal	w1, cy2, u0, v2
+L(lo3):	ldr	w2, [rp, #16]
+	umaal	w0, cy0, u1, v0
+	ldr	u0, [up, #20]
+	umaal	w1, cy1, u1, v1
+	str	w0, [rp, #8]
+	umaal	w2, cy2, u1, v2
+L(lo2):	subs	n, n, #6
+	add	up, up, #24
+	add	rp, rp, #24
+	bge	L(top)
+
+L(end):	umaal	w1, cy0, u0, v0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1
+	str	w1, [rp, #-12]
+	mov	w0, #0
+	umaal	w0, cy2, u0, v2
+	umaal	w2, cy0, u1, v0
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	cy1, cy2, u1, v2
+	adds	w0, w0, cy0
+	str	w0, [rp, #-4]
+	adcs	w1, cy1, #0
+	str	w1, [rp, #0]
+	adc	r0, cy2, #0
+
+	pop	{ r4-r11, pc }
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/dive_1.asm b/third_party/gmp/mpn/arm/v6/dive_1.asm
new file mode 100644
index 0000000..92de814
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/dive_1.asm
@@ -0,0 +1,149 @@
+dnl  ARM v6 mpn_divexact_1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb       cycles/limb
+C               norm    unorm    modexact_1c_odd
+C StrongARM	 -	 -
+C XScale	 -	 -
+C Cortex-A7	 ?	 ?
+C Cortex-A8	 ?	 ?
+C Cortex-A9	 9	10		 9
+C Cortex-A15	 7	 7		 7
+
+C Architecture requirements:
+C v5	-
+C v5t	clz
+C v5te	-
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`d',  `r3')
+
+define(`cy',  `r7')
+define(`cnt', `r6')
+define(`tnc', `r10')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+	push	{r4,r5,r6,r7,r8,r9}
+
+	tst	d, #1
+
+	rsb	r4, d, #0
+	and	r4, r4, d
+	clz	r4, r4
+	rsb	cnt, r4, #31		C count_trailing_zeros
+	mov	d, d, lsr cnt
+
+C binvert limb
+	LEA(	r4, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, lsl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, lsl #1	C r4 = inverse
+
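+C  In C, the two blocks above are (a sketch; clz counts leading zeros,
+C  and binvert_limb_table is the real 128-entry table of 8-bit inverses
+C  of the odd bytes):
+C
+C	cnt = 31 - clz (d & -d);		/* count_trailing_zeros */
+C	d >>= cnt;				/* d now odd */
+C	x = binvert_limb_table[(d >> 1) & 0x7f];  /* 1/d mod 2^8  */
+C	x = 2 * x - x * x * d;			  /* 1/d mod 2^16 */
+C	x = 2 * x - x * x * d;			  /* 1/d mod 2^32 */
+C
+C  Each Newton step doubles the number of correct low bits of x.
+C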
+	ldr	r5, [up], #4		C up[0]
+	mov	cy, #0
+	rsb	r8, r4, #0		C r8 = -inverse
+	beq	L(unnorm)
+
+L(norm):
+	subs	n, n, #1
+	mul	r5, r5, r4
+	beq	L(end)
+
+	ALIGN(16)
+L(top):	ldr	r9, [up], #4
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	str	r5, [rp]
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(unnorm):
+	push	{r10,r11}
+	rsb	tnc, cnt, #32
+	mov	r11, r5, lsr cnt
+	subs	n, n, #1
+	beq	L(edx)
+
+	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	mul	r5, r9, r4
+	subs	n, n, #1
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5
+	subs	n, n, #1
+	bne	L(tpu)
+
+L(edu):	str	r5, [rp], #4
+	mov	r12, #0
+	umaal	r12, cy, r5, d
+	mul	r5, r11, r4
+	mla	r5, cy, r8, r5
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(edx):	mul	r5, r11, r4
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/gmp-mparam.h b/third_party/gmp/mpn/arm/v6/gmp-mparam.h
new file mode 100644
index 0000000..35a7c55
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/gmp-mparam.h
@@ -0,0 +1,187 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 700 MHz ARM11 (Raspberry Pi) */
+/* FFT tuning limit = 8,088,775 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     19
+#define USE_PREINV_DIVREM_1                  1  /* preinv always */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 71.61% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           38
+
+#define DIV_1_VS_MUL_1_PERCENT             251
+
+#define MUL_TOOM22_THRESHOLD                38
+#define MUL_TOOM33_THRESHOLD               134
+#define MUL_TOOM44_THRESHOLD               512
+#define MUL_TOOM6H_THRESHOLD                 0  /* always */
+#define MUL_TOOM8H_THRESHOLD               620
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     209
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     625
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     209
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     211
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     300
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 55
+#define SQR_TOOM3_THRESHOLD                200
+#define SQR_TOOM4_THRESHOLD                470
+#define SQR_TOOM6_THRESHOLD                614
+#define SQR_TOOM8_THRESHOLD                882
+
+#define MULMID_TOOM42_THRESHOLD             62
+
+#define MULMOD_BNM1_THRESHOLD               23
+#define SQRMOD_BNM1_THRESHOLD               26
+
+#define MUL_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     28, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    207,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    159,10}, \
+    {    351,11}, {    191,10}, {    399,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    639,11}, {    351,12}, \
+    {    191,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    575,12}, {    319,11}, {    671,12}, {    383,11}, \
+    {    799,12}, {    447,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    703,13}, {    383,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1151,13}, {    639,12}, \
+    {   1343,13}, {    767,12}, {   1599,13}, {    895,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 98
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             530  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    530, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     71, 9}, {     39, 8}, \
+    {     83, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    351,11}, \
+    {    191,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    639,11}, {    351,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    607,12}, {    319,11}, {    703,12}, \
+    {    383,11}, {    799,12}, {    447,11}, {    895,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    703,13}, \
+    {    383,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1151,13}, {    639,12}, {   1343,13}, {    767,12}, \
+    {   1599,13}, {    895,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 104
+#define SQR_FFT_THRESHOLD                 4416
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  51
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  55
+#define SQRLO_SQR_THRESHOLD               8648
+
+#define DC_DIV_QR_THRESHOLD                 36
+#define DC_DIVAPPR_Q_THRESHOLD             146
+#define DC_BDIV_QR_THRESHOLD                46
+#define DC_BDIV_Q_THRESHOLD                160
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               145
+#define INV_APPR_THRESHOLD                 147
+
+#define BINV_NEWTON_THRESHOLD              372
+#define REDC_1_TO_REDC_2_THRESHOLD           6
+#define REDC_2_TO_REDC_N_THRESHOLD         140
+
+#define MU_DIV_QR_THRESHOLD               2801
+#define MU_DIVAPPR_Q_THRESHOLD            2801
+#define MUPI_DIV_QR_THRESHOLD               79
+#define MU_BDIV_QR_THRESHOLD              2541
+#define MU_BDIV_Q_THRESHOLD               2764
+
+#define POWM_SEC_TABLE  3,20,139,734
+
+#define GET_STR_DC_THRESHOLD                27
+#define GET_STR_PRECOMPUTE_THRESHOLD        45
+#define SET_STR_DC_THRESHOLD               342
+#define SET_STR_PRECOMPUTE_THRESHOLD      1290
+
+#define FAC_DSC_THRESHOLD                  390
+#define FAC_ODD_THRESHOLD                  438
+
+#define MATRIX22_STRASSEN_THRESHOLD         25
+#define HGCD2_DIV1_METHOD                    5  /* 1.32% faster than 3 */
+#define HGCD_THRESHOLD                      82
+#define HGCD_APPR_THRESHOLD                 81
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   345
+#define GCDEXT_DC_THRESHOLD                268
+#define JACOBI_BASE_METHOD                   1  /* 3.30% faster than 2 */
+
+/* Tuneup completed successfully, took 45018 seconds */
diff --git a/third_party/gmp/mpn/arm/v6/mode1o.asm b/third_party/gmp/mpn/arm/v6/mode1o.asm
new file mode 100644
index 0000000..a2f77a6
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/mode1o.asm
@@ -0,0 +1,95 @@
+dnl  ARM v6 mpn_modexact_1c_odd
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 9
+C Cortex-A15	 7
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	smulbb
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`up', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cy', `r3')
+
+	.protected	binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+	stmfd	sp!, {r4, r5, r6, r7}
+
+	LEA(	r4, binvert_limb_table)
+
+	ldr	r6, [up], #4		C up[0]
+
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]
+	smulbb	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, asl #1
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, asl #1	C r4 = inverse
+
+	subs	n, n, #1
+	sub	r6, r6, cy
+	mul	r6, r6, r4
+	beq	L(end)
+
+	rsb	r5, r4, #0		C r5 = -inverse
+
+L(top):	ldr	r7, [up], #4
+	mov	r12, #0
+	umaal	r12, cy, r6, d
+	mul	r6, r7, r4
+	mla	r6, cy, r5, r6
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	mov	r12, #0
+	umaal	r12, cy, r6, d
+	mov	r0, cy
+
+	ldmfd	sp!, {r4, r5, r6, r7}
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/mul_1.asm b/third_party/gmp/mpn/arm/v6/mul_1.asm
new file mode 100644
index 0000000..3c6ef99
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/mul_1.asm
@@ -0,0 +1,115 @@
+dnl  ARM mpn_mul_1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C ARM11		 6.4
+C Cortex-A7	 5.25
+C Cortex-A8	 7
+C Cortex-A9	 3.25
+C Cortex-A15	 4
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3
+	mov	r12, #0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4
+	mov	r6, #0
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4
+	add	rp, rp, #4
+	mov	r7, #0
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4
+	mov	r6, #0
+	add	rp, rp, #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4
+	add	rp, rp, #12
+	mov	r7, #0
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	mov	r6, #0
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	mov	r7, #0
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	mov	r6, #0
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	mov	r7, #0
+	ldr	r4, [up], #4
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)
+
+	mov	r6, #0
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/mul_2.asm b/third_party/gmp/mpn/arm/v6/mul_2.asm
new file mode 100644
index 0000000..edd27f3
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/mul_2.asm
@@ -0,0 +1,135 @@
+dnl  ARM mpn_mul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
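+C  The feed-in and loop below lean on the ARMv6 umaal instruction; its
+C  effect in C is (a sketch of the semantics, not GMP code):
+C
+C	/* umaal lo, hi, u, v */
+C	uint64_t t = (uint64_t) u * v + lo + hi;
+C	lo = (uint32_t) t;
+C	hi = (uint32_t) (t >> 32);
+C
+C  The sum cannot overflow, since (2^32-1)^2 + 2*(2^32-1) = 2^64-1, so a
+C  single umaal per limb does the multiply and absorbs both the carry
+C  limb and the rp[] addend.
+C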
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C ARM11		 5.25
+C Cortex-A5	 3.63
+C Cortex-A7	 3.15
+C Cortex-A8	 5.0
+C Cortex-A9	 2.25
+C Cortex-A15	 2.5
+C Cortex-A17	 2.13
+C Cortex-A53	 3.5
+
+C TODO
+C  * This is a trivial edit of the addmul_2 code.  Check for simplifications,
+C    and possible speedups to 2.0 c/l.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	push	{ r4, r5, r6, r7, r8, r9 }
+
+	ldm	vp, { v0, v1 }
+	mov	cya, #0
+	mov	cyb, #0
+
+	tst	n, #1
+	beq	L(evn)
+L(odd):	mov	r5, #0
+	ldr	u0, [up, #0]
+	mov	r4, #0
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #12
+	sub	rp, rp, #16
+	b	L(lo3)
+L(fi1):	sub	n, n, #1
+	sub	up, up, #4
+	sub	rp, rp, #8
+	b	L(lo1)
+L(evn):	mov	r4, #0
+	ldr	u1, [up, #0]
+	mov	r5, #0
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #8
+	sub	rp, rp, #12
+	b	L(lo0)
+L(fi2):	subs	n, n, #2
+	sub	rp, rp, #4
+	bls	L(end)
+
+	ALIGN(16)
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(lo1):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+L(lo0):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(lo3):	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	mov	r0, cyb
+
+	pop	{ r4, r5, r6, r7, r8, r9 }
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/popham.asm b/third_party/gmp/mpn/arm/v6/popham.asm
new file mode 100644
index 0000000..c254368
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/popham.asm
@@ -0,0 +1,139 @@
+dnl  ARM mpn_popcount and mpn_hamdist.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		     popcount	      hamdist
+C		    cycles/limb	    cycles/limb
+C StrongARM		 -
+C XScale		 -
+C Cortex-A7		 ?
+C Cortex-A8		 ?
+C Cortex-A9		 8.94		 9.47
+C Cortex-A15		 5.67		 6.44
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	usada8
+C v6t2	-
+C v7a	-
+
+ifdef(`OPERATION_popcount',`
+  define(`func',`mpn_popcount')
+  define(`ap',		`r0')
+  define(`n',		`r1')
+  define(`a0',		`r2')
+  define(`a1',		`r3')
+  define(`s',		`r5')
+  define(`b_01010101',	`r6')
+  define(`b_00110011',	`r7')
+  define(`b_00001111',	`r8')
+  define(`zero',	`r9')
+  define(`POPC',	`$1')
+  define(`HAMD',	`dnl')
+')
+ifdef(`OPERATION_hamdist',`
+  define(`func',`mpn_hamdist')
+  define(`ap',		`r0')
+  define(`bp',		`r1')
+  define(`n',		`r2')
+  define(`a0',		`r6')
+  define(`a1',		`r7')
+  define(`b0',		`r4')
+  define(`b1',		`r5')
+  define(`s',		`r11')
+  define(`b_01010101',	`r8')
+  define(`b_00110011',	`r9')
+  define(`b_00001111',	`r10')
+  define(`zero',	`r3')
+  define(`POPC',	`dnl')
+  define(`HAMD',	`$1')
+')
+
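+C  The loop body is the usual SWAR reduction; per word, in C (a sketch,
+C  with x = a[i] for popcount or x = a[i] ^ b[i] for hamdist; the real
+C  loop adds two words together before the third step):
+C
+C	x -= (x >> 1) & 0x55555555;			 /* 16 2-bit sums */
+C	x = (x & 0x33333333) + ((x & ~0x33333333) >> 2); /* 8 4-bit sums */
+C	s = (x & 0x0f0f0f0f) + ((x & ~0x0f0f0f0f) >> 4); /* 4 8-bit sums */
+C
+C  usada8 against the zero register then sums the four bytes of s into
+C  the running total.
+C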
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+ASM_START()
+PROLOGUE(func)
+POPC(`	push	{ r4-r9 }	')
+HAMD(`	push	{ r4-r11 }	')
+
+	ldr	b_01010101, =0x55555555
+	mov	r12, #0
+	ldr	b_00110011, =0x33333333
+	mov	zero, #0
+	ldr	b_00001111, =0x0f0f0f0f
+
+	tst	n, #1
+	beq	L(evn)
+
+L(odd):	ldr	a1, [ap], #4		C 1 x 32 1-bit accumulators, 0-1
+HAMD(`	ldr	b1, [bp], #4	')	C 1 x 32 1-bit accumulators, 0-1
+HAMD(`	eor	a1, a1, b1	')
+	and	r4, b_01010101, a1, lsr #1
+	sub	a1, a1, r4
+	and	r4, a1, b_00110011
+	bic	r5, a1, b_00110011
+	add	r5, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	subs	n, n, #1
+	b	L(mid)
+
+L(evn):	mov	s, #0
+
+L(top):	ldrd	a0, a1, [ap], #8	C 2 x 32 1-bit accumulators, 0-1
+HAMD(`	ldrd	b0, b1, [bp], #8')
+HAMD(`	eor	a0, a0, b0	')
+HAMD(`	eor	a1, a1, b1	')
+	subs	n, n, #2
+	usada8	r12, s, zero, r12
+	and	r4, b_01010101, a0, lsr #1
+	sub	a0, a0, r4
+	and	r4, b_01010101, a1, lsr #1
+	sub	a1, a1, r4
+	and	r4, a0, b_00110011
+	bic	r5, a0, b_00110011
+	add	a0, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	and	r4, a1, b_00110011
+	bic	r5, a1, b_00110011
+	add	a1, r4, r5, lsr #2	C 8 4-bit accumulators, 0-4
+	add	r5, a0, a1		C 8 4-bit accumulators, 0-8
+L(mid):	and	r4, r5, b_00001111
+	bic	r5, r5, b_00001111
+	add	s, r4, r5, lsr #4	C 4 8-bit accumulators
+	bne	L(top)
+
+	usada8	r0, s, zero, r12
+POPC(`	pop	{ r4-r9 }	')
+HAMD(`	pop	{ r4-r11 }	')
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/sqr_basecase.asm b/third_party/gmp/mpn/arm/v6/sqr_basecase.asm
new file mode 100644
index 0000000..0fc4f13
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/sqr_basecase.asm
@@ -0,0 +1,544 @@
+dnl  ARM v6 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Code structure:
+C
+C
+C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
+C           |               |               |               |
+C           |               |               |               |
+C           |               |               |               |
+C          \|/             \|/             \|/             \|/
+C              ____________                   ____________
+C             /            \                 /            \
+C            \|/            \               \|/            \
+C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
+C            \            /|\                \            /|\
+C             \____________/                  \____________/
+C                       \                        /
+C                        \                      /
+C                         \                    /
+C                         cor3             cor2
+C                            \              /
+C                             \            /
+C                            sqr_diag_addlsh1
+
+C TODO
+C  * Align more labels.
+C  * Further tweak counter and updates in outer loops.  (This could save
+C    perhaps 5n cycles).
+C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
+C    initialise loop counter i with a right shift.
+C  * Try to use fewer registers.  Perhaps coalesce r9 branch target and n_saved.
+C    (This could save 2-3 cycles for n > 4.)
+C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
+C    propagation.
+C  * Stop loops earlier suppressing writes of upper-most rp[] values.
+C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly,
+C    particularly on Cortex-A8.
+
+
+define(`rp',      r0)
+define(`up',      r1)
+define(`n',       r2)
+
+define(`v0',      r3)
+define(`v1',      r6)
+define(`i',       r8)
+define(`n_saved', r14)
+define(`cya',     r11)
+define(`cyb',     r12)
+define(`u0',      r7)
+define(`u1',      r9)
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	and	r12, n, #3
+	cmp	n, #4
+	addgt	r12, r12, #4
+	add	pc, pc, r12, lsl #2
+	nop
+	b	L(4)
+	b	L(1)
+	b	L(2)
+	b	L(3)
+	b	L(0m4)
+	b	L(1m4)
+	b	L(2m4)
+	b	L(3m4)
+
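+C  In C, the computed branch above is (sketch):
+C
+C	unsigned idx = (n & 3) + (n > 4 ? 4 : 0);
+C	/* idx -> L(4) L(1) L(2) L(3) L(0m4) L(1m4) L(2m4) L(3m4) */
+C
+C  so n <= 4 is dispatched to the straight-line code at the end of the
+C  file, and larger n to one of four feed-ins for the unrolled loops,
+C  selected by n mod 4.
+C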
+
+L(1m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_2m4)-.-8
+	ldm	up, {v0,v1,u0}
+	sub	up, up, #4
+	mov	cyb, #0
+	mov	r5, #0
+	umull	r4, cya, v1, v0
+	str	r4, [rp], #-12
+	mov	r4, #0
+	b	L(ko0)
+
+L(3m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_0m4)-.-8
+	ldm	up, {v0,v1,u0}
+	add	up, up, #4
+	mov	cyb, #0
+	mov	r5, #0
+	umull	r4, cya, v1, v0
+	str	r4, [rp], #-4
+	mov	r4, #0
+	b	L(ko2)
+
+L(2m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_3m4)-.-8
+	ldm	up, {v0,v1,u1}
+	mov	cyb, #0
+	mov	r4, #0
+	umull	r5, cya, v1, v0
+	str	r5, [rp], #-8
+	mov	r5, #0
+	b	L(ko1)
+
+L(0m4):	push	{r4-r11, r14}
+	mov	n_saved, n
+	sub	i, n, #4
+	sub	n, n, #2
+	add	r10, pc, #L(am2_1m4)-.-8
+	ldm	up, {v0,v1,u1}
+	mov	cyb, #0
+	mov	r4, #0
+	add	up, up, #8
+	umull	r5, cya, v1, v0
+	str	r5, [rp, #0]
+	mov	r5, #0
+
+L(top):	ldr	u0, [up, #4]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #4]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(ko2):	ldr	u1, [up, #8]
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #8]
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+L(ko1):	ldr	u0, [up, #12]
+	umaal	r4, cya, u1, v0
+	str	r4, [rp, #12]
+	mov	r4, #0
+	umaal	r5, cyb, u1, v1
+L(ko0):	ldr	u1, [up, #16]!
+	umaal	r5, cya, u0, v0
+	str	r5, [rp, #16]!
+	mov	r5, #0
+	umaal	r4, cyb, u0, v1
+	subs	i, i, #4
+	bhi	L(top)
+
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #4]
+	umaal	r5, cya, u0, v0
+	umaal	cya, cyb, u0, v1
+	str	r5, [rp, #8]
+	str	cya, [rp, #12]
+	str	cyb, [rp, #16]
+
+	add	up, up, #4
+	sub	n, n, #1
+	add	rp, rp, #8
+	bx	r10
+
+L(evnloop):
+	subs	i, n, #6
+	sub	n, n, #2
+	blt	L(cor2)
+	ldm	up, {v0,v1,u1}
+	add	up, up, #8
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r4, [rp, #-4]
+	umaal	r4, cya, v1, v0
+	str	r4, [rp, #-4]
+	ldr	r4, [rp, #0]
+
+	ALIGN(16)
+L(ua2):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua2)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_0m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #8
+
+	sub	i, n, #4
+	sub	n, n, #2
+	ldm	up, {v0,v1,u1}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r4, [rp, #4]
+	umaal	r4, cya, v1, v0
+	str	r4, [rp, #4]
+	ldr	r4, [rp, #8]
+	b	L(lo0)
+
+	ALIGN(16)
+L(ua0):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+L(lo0):	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua0)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_2m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #16
+	b	L(evnloop)
+
+
+L(oddloop):
+	sub	i, n, #5
+	sub	n, n, #2
+	ldm	up, {v0,v1,u0}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r5, [rp, #0]
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #0]
+	ldr	r5, [rp, #4]
+	add	up, up, #4
+	b	L(lo1)
+
+	ALIGN(16)
+L(ua1):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+L(lo1):	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua1)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_3m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #4
+
+	subs	i, n, #3
+	beq	L(cor3)
+	sub	n, n, #2
+	ldm	up, {v0,v1,u0}
+	mov	cya, #0
+	mov	cyb, #0
+	ldr	r5, [rp, #8]
+	sub	up, up, #4
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #12]
+	b	L(lo3)
+
+	ALIGN(16)
+L(ua3):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #0]
+	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+L(lo3):	ldr	r4, [rp, #16]!
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	i, i, #4
+	bhs	L(ua3)
+
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]
+	str	cyb, [rp, #8]
+L(am2_1m4):
+	sub	rp, rp, n, lsl #2
+	sub	up, up, n, lsl #2
+	add	rp, rp, #12
+	b	L(oddloop)
+
+
+L(cor3):ldm	up, {v0,v1,u0}
+	ldr	r5, [rp, #8]
+	mov	cya, #0
+	mov	cyb, #0
+	umaal	r5, cya, v1, v0
+	str	r5, [rp, #8]
+	ldr	r5, [rp, #12]
+	ldr	r4, [rp, #16]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #12]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1
+	str	r4, [rp, #16]
+	str	cya, [rp, #20]
+	str	cyb, [rp, #24]
+	add	up, up, #16
+	mov	cya, cyb
+	adds	rp, rp, #36		C clear cy
+	mov	cyb, #0
+	umaal	cya, cyb, u1, u0
+	b	L(sqr_diag_addlsh1)
+
+L(cor2):
+	ldm	up!, {v0,v1,u0}
+	mov	r4, cya
+	mov	r5, cyb
+	mov	cya, #0
+	umaal	r4, cya, v1, v0
+	mov	cyb, #0
+	umaal	r5, cya, u0, v0
+	strd	r4, r5, [rp, #-4]
+	umaal	cya, cyb, u0, v1
+	add	rp, rp, #16
+C	b	L(sqr_diag_addlsh1)
+
+
+define(`w0',  r6)
+define(`w1',  r7)
+define(`w2',  r8)
+define(`rbx', r9)
+
+L(sqr_diag_addlsh1):
+	str	cya, [rp, #-12]
+	str	cyb, [rp, #-8]
+	sub	n, n_saved, #1
+	sub	up, up, n_saved, lsl #2
+	sub	rp, rp, n_saved, lsl #3
+	ldr	r3, [up], #4
+	umull	w1, r5, r3, r3
+	mov	w2, #0
+	mov	r10, #0
+C	cmn	r0, #0			C clear cy (already clear)
+	b	L(lm)
+
+L(tsd):	adds	w0, w0, rbx
+	adcs	w1, w1, r4
+	str	w0, [rp, #0]
+L(lm):	ldr	w0, [rp, #4]
+	str	w1, [rp, #4]
+	ldr	w1, [rp, #8]!
+	add	rbx, r5, w2
+	adcs	w0, w0, w0
+	ldr	r3, [up], #4
+	adcs	w1, w1, w1
+	adc	w2, r10, r10
+	umull	r4, r5, r3, r3
+	subs	n, n, #1
+	bne	L(tsd)
+
+	adds	w0, w0, rbx
+	adcs	w1, w1, r4
+	adc	w2, r5, w2
+	stm	rp, {w0,w1,w2}
+
+	pop	{r4-r11, pc}
+
+
+C Straight line code for n <= 4
+
+L(1):	ldr	r3, [up, #0]
+	umull	r1, r2, r3, r3
+	stm	rp, {r1,r2}
+	bx	r14
+
+L(2):	push	{r4-r5}
+	ldm	up, {r5,r12}
+	umull	r1, r2, r5, r5
+	umull	r3, r4, r12, r12
+	umull	r5, r12, r5, r12
+	adds	r5, r5, r5
+	adcs	r12, r12, r12
+	adc	r4, r4, #0
+	adds	r2, r2, r5
+	adcs	r3, r3, r12
+	adc	r4, r4, #0
+	stm	rp, {r1,r2,r3,r4}
+	pop	{r4-r5}
+	bx	r14
+
+L(3):	push	{r4-r11}
+	ldm	up, {r7,r8,r9}
+	umull	r1, r2, r7, r7
+	umull	r3, r4, r8, r8
+	umull	r5, r6, r9, r9
+	umull	r10, r11, r7, r8
+	mov	r12, #0
+	umlal	r11, r12, r7, r9
+	mov	r7, #0
+	umlal	r12, r7, r8, r9
+	adds	r10, r10, r10
+	adcs	r11, r11, r11
+	adcs	r12, r12, r12
+	adcs	r7, r7, r7
+	adc	r6, r6, #0
+	adds	r2, r2, r10
+	adcs	r3, r3, r11
+	adcs	r4, r4, r12
+	adcs	r5, r5, r7
+	adc	r6, r6, #0
+	stm	rp, {r1,r2,r3,r4,r5,r6}
+	pop	{r4-r11}
+	bx	r14
+
+L(4):	push	{r4-r11, r14}
+	ldm	up, {r9,r10,r11,r12}
+	umull	r1, r2, r9, r9
+	umull	r3, r4, r10, r10
+	umull	r5, r6, r11, r11
+	umull	r7, r8, r12, r12
+	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
+	umull	r1, r2, r9, r10
+	mov	r3, #0
+	umlal	r2, r3, r9, r11
+	mov	r4, #0
+	umlal	r3, r4, r9, r12
+	mov	r5, #0
+	umlal	r3, r5, r10, r11
+	umaal	r4, r5, r10, r12
+	mov	r6, #0
+	umlal	r5, r6, r11, r12
+	adds	r1, r1, r1
+	adcs	r2, r2, r2
+	adcs	r3, r3, r3
+	adcs	r4, r4, r4
+	adcs	r5, r5, r5
+	adcs	r6, r6, r6
+	add	rp, rp, #4
+	adc	r7, r8, #0
+	ldm	rp, {r8,r9,r10,r11,r12,r14}
+	adds	r1, r1, r8
+	adcs	r2, r2, r9
+	adcs	r3, r3, r10
+	adcs	r4, r4, r11
+	adcs	r5, r5, r12
+	adcs	r6, r6, r14
+	adc	r7, r7, #0
+	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
+	pop	{r4-r11, pc}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6/submul_1.asm b/third_party/gmp/mpn/arm/v6/submul_1.asm
new file mode 100644
index 0000000..8a21733
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6/submul_1.asm
@@ -0,0 +1,125 @@
+dnl  ARM mpn_submul_1.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.75
+C Cortex-A15	 4.0
+
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+
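+C  A one-limb sanity check of that identity in C (sketch, B = 2^32,
+C  arithmetic mod 2^64):
+C
+C	uint64_t direct = (uint64_t) r - (uint64_t) u * v;
+C	uint64_t trick  = r + (uint64_t) (uint32_t) ~u * v + v;
+C	assert (trick == direct + ((uint64_t) v << 32));
+C
+C  So the loop mvn-complements each u limb before the umaal, and the
+C  final sub r0, v0, r12 converts the carry-out back into the borrow of
+C  R - U*v.
+C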
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3
+	mov	r12, v0
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #12
+	mvn	r4, r4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up, #-8]
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #16
+	mvn	r5, r5
+	ldr	r7, [rp], #4
+	ldr	r4, [up, #-12]
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4
+	mvn	r4, r4
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)
+	ldr	r5, [up]
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #8
+	mvn	r5, r5
+	ldr	r7, [rp], #12
+	ldr	r4, [up, #-4]
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up]
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0
+	add	up, up, #16
+	mvn	r5, r5
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up, #-12]
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
+	ldr	r6, [rp, #0]
+	ldr	r5, [up, #-8]
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	mvn	r5, r5
+	ldr	r7, [rp, #4]
+	ldr	r4, [up, #-4]
+	str	r6, [rp], #16
+L(lo2):	umaal	r7, r12, r5, v0
+	mvn	r4, r4
+	subs	n, n, #4
+	bhi	L(top)
+
+	ldr	r6, [rp, #-8]
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	sub	r0, v0, r12
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6t2/divrem_1.asm b/third_party/gmp/mpn/arm/v6t2/divrem_1.asm
new file mode 100644
index 0000000..be24615
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6t2/divrem_1.asm
@@ -0,0 +1,212 @@
+dnl  ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		norm	unorm	frac
+C StrongARM	 -	 -	 -
+C XScale	 -	 -	 -
+C Cortex-A7	 ?	 ?	 ?
+C Cortex-A8	 ?	 ?	 ?
+C Cortex-A9	13	14	13
+C Cortex-A15	11.4	11.8	11.1
+
+C TODO
+C  * Optimise inner-loops better, they could likely run a cycle or two faster.
+C  * Decrease register usage, streamline non-loop code.
+
+define(`qp_arg',  `r0')
+define(`fn',      `r1')
+define(`up_arg',  `r2')
+define(`n_arg',   `r3')
+define(`d_arg',   `0')
+define(`dinv_arg',`4')
+define(`cnt_arg', `8')
+
+define(`n',       `r9')
+define(`qp',      `r5')
+define(`up',      `r6')
+define(`cnt',     `r7')
+define(`tnc',     `r10')
+define(`dinv',    `r0')
+define(`d',       `r4')
+
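+C  The quotient loops below use the standard invariant-divisor step (the
+C  udiv_qrnnd_preinv pattern from gmp-impl.h).  One step in C, with d
+C  normalised, r < d the running remainder and u the next limb (sketch;
+C  arithmetic mod 2^64, matching the two-register umlal):
+C
+C	uint64_t p = ((uint64_t) (r + 1) << 32) + u + (uint64_t) r * dinv;
+C	uint32_t qh = (uint32_t) (p >> 32), ql = (uint32_t) p;
+C	uint32_t rem = u - qh * d;		/* mls */
+C	if (rem > ql) { qh--; rem += d; }	/* addhi/subhi fixup */
+C	if (rem >= d) { qh++; rem -= d; }	/* final conditional */
+C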
+ASM_START()
+PROLOGUE(mpn_preinv_divrem_1)
+	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+	ldr	d,    [sp, #9*4+d_arg]
+	ldr	cnt,  [sp, #9*4+cnt_arg]
+	str	r1, [sp, #9*4+d_arg]	C reuse d stack slot for fn
+	sub	n, r3, #1
+	add	r3, r1, n
+	cmp	d, #0
+	add	qp, qp_arg, r3, lsl #2	C put qp at Q[] end
+	add	up, up_arg, n, lsl #2	C put up at U[] end
+	ldr	dinv, [sp, #9*4+dinv_arg]
+	blt	L(nent)
+	b	L(uent)
+EPILOGUE()
+
+PROLOGUE(mpn_divrem_1)
+	stmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+	sub	n, r3, #1
+	ldr	d, [sp, #9*4+d_arg]	C d
+	str	r1, [sp, #9*4+d_arg]	C reuse d stack slot for fn
+	add	r3, r1, n
+	cmp	d, #0
+	add	qp, qp_arg, r3, lsl #2	C put qp at Q[] end
+	add	up, up_arg, n, lsl #2	C put up at U[] end
+	blt	L(normalised)
+
+L(unnorm):
+	clz	cnt, d
+	mov	r0, d, lsl cnt		C pass d << cnt
+	bl	mpn_invert_limb
+L(uent):
+	mov	d, d, lsl cnt		C d <<= cnt
+	cmp	n, #0
+	mov	r1, #0			C r
+	blt	L(frac)
+
+	ldr	r11, [up, #0]
+
+	rsb	tnc, cnt, #32
+	mov	r1, r11, lsr tnc
+	mov	r11, r11, lsl cnt
+	beq	L(uend)
+
+	ldr	r3, [up, #-4]!
+	orr	r2, r11, r3, lsr tnc
+	b	L(mid)
+
+L(utop):
+	mls	r1, d, r8, r11
+	mov	r11, r3, lsl cnt
+	ldr	r3, [up, #-4]!
+	cmp	r1, r2
+	addhi	r1, r1, d
+	subhi	r8, r8, #1
+	orr	r2, r11, r3, lsr tnc
+	cmp	r1, d
+	bcs	L(ufx)
+L(uok):	str	r8, [qp], #-4
+L(mid):	add	r8, r1, #1
+	mov	r11, r2
+	umlal	r2, r8, r1, dinv
+	subs	n, n, #1
+	bne	L(utop)
+
+	mls	r1, d, r8, r11
+	mov	r11, r3, lsl cnt
+	cmp	r1, r2
+	addhi	r1, r1, d
+	subhi	r8, r8, #1
+	cmp	r1, d
+	rsbcs	r1, d, r1
+	addcs	r8, r8, #1
+	str	r8, [qp], #-4
+
+L(uend):add	r8, r1, #1
+	mov	r2, r11
+	umlal	r2, r8, r1, dinv
+	mls	r1, d, r8, r11
+	cmp	r1, r2
+	addhi	r1, r1, d
+	subhi	r8, r8, #1
+	cmp	r1, d
+	rsbcs	r1, d, r1
+	addcs	r8, r8, #1
+	str	r8, [qp], #-4
+L(frac):
+	ldr	r2, [sp, #9*4+d_arg]	C fn
+	cmp	r2, #0
+	beq	L(fend)
+
+L(ftop):mov	r6, #0
+	add	r3, r1, #1
+	umlal	r6, r3, r1, dinv
+	mov	r8, #0
+	mls	r1, d, r3, r8
+	cmp	r1, r6
+	addhi	r1, r1, d
+	subhi	r3, r3, #1
+	subs	r2, r2, #1
+	str	r3, [qp], #-4
+	bne	L(ftop)
+
+L(fend):mov	r11, r1, lsr cnt
+L(rtn):	mov	r0, r11
+	ldmfd	sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+L(normalised):
+	mov	r0, d
+	bl	mpn_invert_limb
+L(nent):
+	cmp	n, #0
+	mov	r11, #0			C r
+	blt	L(nend)
+
+	ldr	r11, [up, #0]
+	cmp	r11, d
+	movlo	r2, #0			C hi q limb
+	movhs	r2, #1			C hi q limb
+	subhs	r11, r11, d
+
+	str	r2, [qp], #-4
+	cmp	n, #0
+	beq	L(nend)
+
+L(ntop):ldr	r1, [up, #-4]!
+	add	r12, r11, #1
+	umlal	r1, r12, r11, dinv
+	ldr	r3, [up, #0]
+	mls	r11, d, r12, r3
+	cmp	r11, r1
+	addhi	r11, r11, d
+	subhi	r12, r12, #1
+	cmp	d, r11
+	bls	L(nfx)
+L(nok):	str	r12, [qp], #-4
+	subs	n, n, #1
+	bne	L(ntop)
+
+L(nend):mov	r1, r11			C r
+	mov	cnt, #0			C shift cnt
+	b	L(frac)
+
+L(nfx):	add	r12, r12, #1
+	rsb	r11, d, r11
+	b	L(nok)
+L(ufx):	rsb	r1, d, r1
+	add	r8, r8, #1
+	b	L(uok)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6t2/gcd_11.asm b/third_party/gmp/mpn/arm/v6t2/gcd_11.asm
new file mode 100644
index 0000000..8a38351
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6t2/gcd_11.asm
@@ -0,0 +1,65 @@
+dnl  ARM v6t2 mpn_gcd_11.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2019 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/bit (approx)
+C StrongARM	 -
+C XScale	 -
+C Cortex-A5	 5.2
+C Cortex-A7	 5.04
+C Cortex-A8	 3.59
+C Cortex-A9	 9.5
+C Cortex-A15	 3.2
+C Cortex-A17	 5.25
+C Cortex-A53	 3.57
+
+define(`u0',    `r0')
+define(`v0',    `r1')
+
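+C The loop below is the binary GCD for two odd operands: replace the larger
+C of (u,v) by |u-v| with its factors of 2 stripped; rbit+clz serves as a
+C count-trailing-zeros.  An illustrative C sketch of the same algorithm:
+C
+C   mp_limb_t gcd_11 (mp_limb_t u, mp_limb_t v)    /* u, v both odd */
+C   {
+C     while (u != v)
+C       {
+C         mp_limb_t t = (u > v) ? u - v : v - u;   /* |u-v|, even */
+C         if (u > v) u = v;                        /* u = min(u,v) */
+C         v = t >> __builtin_ctzl (t);             /* strip the 2s */
+C       }
+C     return u;
+C   }
+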
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	subs	r3, u0, v0	C			0
+	beq	L(end)		C
+
+	ALIGN(16)
+L(top):	rbit	r12, r3		C			1,5
+	clz	r12, r12	C			2
+	rsbcc	r3, r3, #0	C v = abs(u-v), even	1
+	movcs	u0, v0		C u = min(u,v)		1
+	lsr	v0, r3, r12	C			3
+	subs	r3, u0, v0	C			4
+	bne	L(top)		C
+
+L(end):	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v6t2/gcd_22.asm b/third_party/gmp/mpn/arm/v6t2/gcd_22.asm
new file mode 100644
index 0000000..3b23808
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v6t2/gcd_22.asm
@@ -0,0 +1,113 @@
+dnl  ARM v6t2 mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C StrongARM	 -
+C XScale	 -
+C Cortex-A5	10.1
+C Cortex-A7	 9.1
+C Cortex-A8	 6.3
+C Cortex-A9	 ?
+C Cortex-A12	 7.7
+C Cortex-A15	 5.7
+C Cortex-A17	 ?
+C Cortex-A53	 7.0
+
+
+define(`gp',    `r0')
+
+define(`u1',    `r1')
+define(`u0',    `r2')
+define(`v1',    `r3')
+define(`v0',    `r4')
+
+define(`t0',    `r5')
+define(`t1',    `r6')
+define(`cnt',   `r7')
+
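+C The main loop strips factors of 2 from the 2-limb difference with a
+C cross-limb right shift; cnt is the count of trailing zeros of the low
+C difference limb (computed with rbit+clz).  The per-iteration shift in C
+C (illustrative sketch; in the main loop 0 < cnt < 32):
+C
+C   u0 = (t0 >> cnt) | (t1 << (32 - cnt));
+C   u1 =  t1 >> cnt;
+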
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+	push	{ r4-r7 }
+
+	ldr	v0, [sp,#16]		C
+
+L(top):	subs	t0, u0, v0		C 0 7
+	beq	L(lowz)
+	sbcs	t1, u1, v1		C 1 8
+
+	rbit	cnt, t0			C 1
+
+	negcc	t0, t0
+	mvncc	t1, t1
+L(bck):	movcc	v0, u0
+	movcc	v1, u1
+
+	clz	cnt, cnt		C 2
+	rsb	r12, cnt, #32		C 3
+
+	lsr	u0, t0, cnt		C 3
+	lsl	r12, t1, r12		C 4
+	lsr	u1, t1, cnt		C 3
+	orr	u0, u0, r12		C 5
+
+	orrs	r12, u1, v1
+	bne	L(top)
+
+
+	str	r12, [gp,#4]		C high result limb = 0
+
+	mov	r6, gp
+	mov	r0, u0			C pass 1st argument
+	mov	r1, v0			C pass 2nd argument
+	mov	r7, r14			C preserve link register
+	bl	mpn_gcd_11
+	str	r0, [r6,#0]
+	mov	r14, r7
+	pop	{ r4-r7 }
+	bx	r14
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subs	t0, u1, v1
+	beq	L(end)
+	mov	t1, #0
+	rbit	cnt, t0			C 1
+	negcc	t0, t0
+	b	L(bck)
+
+L(end):	str	v0, [gp,#0]
+	str	v1, [gp,#4]
+	pop	{ r4-r7 }
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/addmul_1.asm b/third_party/gmp/mpn/arm/v7a/cora15/addmul_1.asm
new file mode 100644
index 0000000..c2277b3
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/addmul_1.asm
@@ -0,0 +1,145 @@
+dnl  ARM mpn_addmul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 6			3.25
+C Cortex-A15	 2			this
+
+C This code uses umlal for adding in the rp[] data, keeping the recurrence path
+C separate from any multiply instructions.  It performs well on A15, at umlal's
+C bandwidth.
+C
+C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
+C for all loads and stores.  Alternatively, it could do 2-way or 4-way, but
+C then alignment aware code will be necessary (adding O(1) bookkeeping
+C overhead).
+C
+C We don't use r12 due to ldrd and strd limitations.
+
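+C For reference, the operation computed is, in C (minimal sketch assuming
+C 32-bit limbs, not part of the build):
+C
+C   mp_limb_t mpn_addmul_1 (mp_limb_t *rp, const mp_limb_t *up,
+C                           mp_size_t n, mp_limb_t v0)
+C   {
+C     mp_limb_t cy = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) up[i] * v0 + rp[i] + cy;
+C         rp[i] = (mp_limb_t) t;          /* low half back to rp[] */
+C         cy = (mp_limb_t) (t >> 32);     /* high half carries on */
+C       }
+C     return cy;
+C   }
+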
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8')  define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	push	{ r4-r11 }
+
+	ands	r6, n, #3
+	sub	n, n, #3
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	mov	r6, #0
+	cmn	r13, #0			C carry clear
+	ldr	u1, [up], #-4
+	ldr	w1, [rp], #-4
+	mov	r7, #0
+	b	L(mid)
+
+L(b00):	ldrd	u0, u1, [up]
+	ldrd	w0, w1, [rp]
+	mov	r6, #0
+	umlal	w0, r6, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r7, #0
+	str	w0, [rp]
+	b	L(mid)
+
+L(b10):	ldrd	u0, u1, [up], #8
+	ldrd	w0, w1, [rp]
+	mov	r4, #0
+	umlal	w0, r4, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r5, #0
+	str	w0, [rp], #8
+	umlal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+	b	L(top)
+
+L(b01):	mov	r4, #0
+	ldr	u1, [up], #4
+	ldr	w1, [rp], #4
+	mov	r5, #0
+	umlal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+
+	ALIGN(16)
+L(top):	ldrd	u0, u1, [up, #0]
+	adcs	r4, r4, w1
+	ldrd	w0, w1, [rp, #0]
+	mov	r6, #0
+	umlal	w0, r6, u0, v0		C 1 2
+	adcs	r5, r5, w0
+	mov	r7, #0
+	strd	r4, r5, [rp, #-4]
+L(mid):	umlal	w1, r7, u1, v0		C 2 3
+	ldrd	u0, u1, [up, #8]
+	adcs	r6, r6, w1
+	ldrd	w0, w1, [rp, #8]
+	mov	r4, #0
+	umlal	w0, r4, u0, v0		C 3 4
+	adcs	r7, r7, w0
+	mov	r5, #0
+	strd	r6, r7, [rp, #4]
+	umlal	w1, r5, u1, v0		C 0 1
+	sub	n, n, #4
+	add	up, up, #16
+	add	rp, rp, #16
+	tst	n, n
+	bpl	L(top)
+
+L(end):	adcs	r4, r4, w1
+	str	r4, [rp, #-4]
+	adc	r0, r5, #0
+	pop	{ r4-r11 }
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/aors_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/aors_n.asm
new file mode 100644
index 0000000..dc3f839
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/aors_n.asm
@@ -0,0 +1,162 @@
+dnl  ARM mpn_add_n/mpn_sub_n optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.55			2.5
+C Cortex-A15	 1.27			this
+
+C This was a major improvement compared to the code we had before, but it might
+C not be the best 8-way code possible.  We've tried some permutations of auto-
+C increments and separate pointer updates, but they all ran at the same speed
+C on A15.
+
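+C For reference, mpn_add_n computes the following (minimal C sketch assuming
+C 32-bit limbs; mpn_sub_n is the same with borrow, and the _nc entry points
+C seed cy from a fifth argument):
+C
+C   mp_limb_t mpn_add_n (mp_limb_t *rp, const mp_limb_t *up,
+C                        const mp_limb_t *vp, mp_size_t n)
+C   {
+C     mp_limb_t cy = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) up[i] + vp[i] + cy;
+C         rp[i] = (mp_limb_t) t;
+C         cy = (mp_limb_t) (t >> 32);     /* 0 or 1 */
+C       }
+C     return cy;
+C   }
+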
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_add_n', `
+  define(`ADDSUBC',	adcs)
+  define(`IFADD',	`$1')
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`RETVAL2',	`adc	r0, n, #1')
+  define(`func',	mpn_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(`ADDSUBC',	sbcs)
+  define(`IFADD',	`')
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`RETVAL2',	`RETVAL')
+  define(`func',	mpn_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	ldr	r12, [sp]
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(func)
+	mov	r12, #0
+L(ent):	push	{ r4-r9 }
+
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	SETCY(	r12)
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	SETCY(	r12)
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	SETCY(	r12)
+	ADDSUBC	r9, r5, r7
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	SETCY(	r12)
+	sub	rp, rp, #8
+	b	L(lo)
+
+	ALIGN(16)
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #16]
+	ldrd	r6, r7, [vp, #16]
+	strd	r8, r9, [rp, #16]
+	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	ldrd	r6, r7, [vp, #24]
+	strd	r8, r9, [rp, #24]
+	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #32]!
+	ldrd	r6, r7, [vp, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	tst	n, n
+	bne	L(top)
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	RETVAL
+	pop	{ r4-r9 }
+	bx	r14
+L(dne):	strd	r8, r9, [rp, #24]
+	RETVAL2
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm b/third_party/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm
new file mode 100644
index 0000000..245b371
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/bdiv_q_1.asm
@@ -0,0 +1,36 @@
+dnl  ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`arm/v7a/cora8/bdiv_q_1.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
new file mode 100644
index 0000000..b9e5cd3
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/cnd_aors_n.asm
@@ -0,0 +1,158 @@
+dnl  ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.75			 3
+C Cortex-A15	 1.78			this
+
+C This code does not run as well as one could have hoped, since 1.5 c/l seems
+C realistic for this insn mix.
+
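+C The condition is turned into a full-word mask up front (the cmp/sbc pair
+C in the prologue yields 0xffffffff when cnd is zero), so the v operand can
+C be cleared with bic and the loop stays branch-free.  The idea in C
+C (illustrative sketch, 32-bit limbs):
+C
+C   mp_limb_t mask = cnd ? ~(mp_limb_t) 0 : 0;   /* all ones iff cnd != 0 */
+C   mp_limb_t cy = 0;
+C   for (mp_size_t i = 0; i < n; i++)
+C     {
+C       uint64_t t = (uint64_t) up[i] + (vp[i] & mask) + cy;
+C       rp[i] = (mp_limb_t) t;
+C       cy = (mp_limb_t) (t >> 32);
+C     }
+C   return cy;
+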
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`cnd',`r0')
+define(`rp', `r1')
+define(`up', `r2')
+define(`vp', `r3')
+define(`n',  `r12')
+
+ifdef(`OPERATION_cnd_add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`IFADD',	`$1')
+  define(`INITCY',      `cmn	r0, #0')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`RETVAL2',	`adc	r0, n, #1')
+  define(`func',	mpn_cnd_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_cnd_sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`IFADD',	`')
+  define(`INITCY',      `cmp	r0, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`RETVAL2',	`RETVAL')
+  define(`func',	mpn_cnd_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	ldr	n, [sp]
+	push	{ r4-r9 }
+
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd		C conditionally set to 0xffffffff
+
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	bic	r7, r7, cnd
+	ADDSUB	r9, r5, r7
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	INITCY
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	bic	r7, r7, cnd
+	ADDSUB	r9, r5, r7
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	INITCY
+	sub	rp, rp, #8
+	b	L(lo)
+
+	ALIGN(16)
+L(top):	ldrd	r6, r7, [vp, #8]
+	ldrd	r4, r5, [up, #8]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	strd	r8, r9, [rp, #8]
+L(mid):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r6, r7, [vp, #16]!
+	ldrd	r4, r5, [up, #16]!
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	sub	n, n, #1
+	strd	r8, r9, [rp, #16]!
+L(lo):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	tst	n, n
+	bne	L(top)
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	RETVAL
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/com.asm b/third_party/gmp/mpn/arm/v7a/cora15/com.asm
new file mode 100644
index 0000000..a258afe
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/com.asm
@@ -0,0 +1,180 @@
+dnl  ARM mpn_com optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	2.5
+C Cortex-A15	1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but keep 8-way unrolling for speed.
+
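+C For reference, the operation is just a one's complement of each limb
+C (C sketch):
+C
+C   for (mp_size_t i = 0; i < n; i++)
+C     rp[i] = ~up[i];
+C
+C Everything below is feed-in and unrolling around that loop.
+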
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)		C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	push	{ r4-r5,r8-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+	tst	r12, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r12, n, #3
+	mov	n, n, lsr #2
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+L(bx0):	tst	r12, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r12, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	mvn	r9, r5
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]!
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]
+	strd	r8, r9, [rp, #16]
+	mvn	r8, r4
+	mvn	r9, r5
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	strd	r8, r9, [rp, #24]
+	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r5,r8-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	strd	r8, r9, [rp, #24]
+	pop	{ r4-r5,r8-r9 }
+	bx	r14
+')
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/gmp-mparam.h b/third_party/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
new file mode 100644
index 0000000..409cbbb
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/gmp-mparam.h
@@ -0,0 +1,212 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2000 MHz Cortex-A15 with Neon (in spite of file position) */
+/* FFT tuning limit = 50,736,668 */
+/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 49.14% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           17
+
+#define DIV_1_VS_MUL_1_PERCENT             267
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD               114
+#define MUL_TOOM44_THRESHOLD               178
+#define MUL_TOOM6H_THRESHOLD               238
+#define MUL_TOOM8H_THRESHOLD               597
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     113
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     115
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                126
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                446
+#define SQR_TOOM8_THRESHOLD                650
+
+#define MULMID_TOOM42_THRESHOLD             52
+
+#define MULMOD_BNM1_THRESHOLD               23
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             575  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    575, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     19, 6}, \
+    {     39, 7}, {     25, 6}, {     51, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     51, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     55,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     71, 8}, {    143, 9}, \
+    {     87,10}, {     47, 9}, {    111,11}, {     31,10}, \
+    {     63, 9}, {    143,10}, {     79, 9}, {    159,10}, \
+    {     95,11}, {     63,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    335, 9}, \
+    {    671,10}, {    367, 9}, {    735,11}, {    191,10}, \
+    {    383, 9}, {    799,10}, {    415,11}, {    223,12}, \
+    {    127,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    575,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1151,12}, \
+    {    639,11}, {   1343,12}, {    703,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1087,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1599,13}, {    895,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1279,12}, {   2559,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3135,13}, {   1663,15}, {    511,14}, \
+    {   1023,13}, {   2303,14}, {   1279,13}, {   2559,12}, \
+    {   5119,13}, {   2687,14}, {   1535,13}, {   3071,12}, \
+    {   6143,13}, {   3199,12}, {   6399,14}, {   1791,15}, \
+    {   1023,14}, {   2047,13}, {   4095,14}, {   2303,13}, \
+    {   4607,12}, {   9215,13}, {   4863,12}, {   9727,14}, \
+    {   2559,13}, {   5119,15}, {   1535,14}, {   3071,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 155
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     25, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     25, 6}, {     51, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     51, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     39, 9}, \
+    {     23, 8}, {     51,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287, 8}, {    575, 9}, {    303,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,12}, {    127,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,11}, \
+    {    447,13}, {    127,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    735,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    959,12}, {    511,11}, \
+    {   1023,12}, {    575,11}, {   1151,12}, {    639,11}, \
+    {   1279,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1663,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    639,12}, \
+    {   1343,13}, {    767,12}, {   1599,13}, {    895,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2303,13}, {   1279,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,15}, {    511,14}, {   1023,13}, \
+    {   2047,12}, {   4095,13}, {   2303,14}, {   1279,13}, \
+    {   2559,12}, {   5119,14}, {   1535,13}, {   3071,12}, \
+    {   6143,13}, {   3199,12}, {   6399,14}, {   1791,15}, \
+    {   1023,14}, {   2047,13}, {   4095,14}, {   2303,13}, \
+    {   4607,12}, {   9215,13}, {   4863,12}, {   9727,14}, \
+    {   2559,15}, {   1535,14}, {   3071,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 154
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD            10950
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  35
+#define SQRLO_SQR_THRESHOLD              10323
+
+#define DC_DIV_QR_THRESHOLD                 57
+#define DC_DIVAPPR_Q_THRESHOLD             254
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                286
+
+#define INV_MULMOD_BNM1_THRESHOLD           55
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 252
+
+#define BINV_NEWTON_THRESHOLD              372
+#define REDC_1_TO_REDC_2_THRESHOLD          61
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1858
+#define MU_DIVAPPR_Q_THRESHOLD            1787
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               1836
+
+#define POWM_SEC_TABLE  1,14,200,480,1532
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        33
+#define SET_STR_DC_THRESHOLD               104
+#define SET_STR_PRECOMPUTE_THRESHOLD      1120
+
+#define FAC_DSC_THRESHOLD                  164
+#define FAC_ODD_THRESHOLD                   27
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 3.70% faster than 3 */
+#define HGCD_THRESHOLD                     137
+#define HGCD_APPR_THRESHOLD                157
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   610
+#define GCDEXT_DC_THRESHOLD                443
+#define JACOBI_BASE_METHOD                   4  /* 12.66% faster than 1 */
+
+/* Tuneup completed successfully, took 69757 seconds */
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/logops_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/logops_n.asm
new file mode 100644
index 0000000..0602614
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/logops_n.asm
@@ -0,0 +1,253 @@
+dnl  ARM mpn_and_n, mpn_andn_n, mpn_nand_n, etc., optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb             cycles/limb
+C          and andn ior xor         nand iorn nior xnor
+C StrongARM	 ?			 ?
+C XScale	 ?			 ?
+C Cortex-A7	 ?			 ?
+C Cortex-A8	 ?			 ?
+C Cortex-A9	3.5			3.56
+C Cortex-A15	1.27			1.64
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but keep 8-way unrolling for speed.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)		C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+define(`POSTOP')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',    `mpn_and_n')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',    `mpn_andn_n')
+  define(`LOGOP',   `bic	$1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',    `mpn_nand_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',    `mpn_ior_n')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',    `mpn_iorn_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `bic	$1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',    `mpn_nior_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',    `mpn_xor_n')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',    `mpn_xnor_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+
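+C Every entry point is the same unrolled loop with LOGOP (and optionally
+C POSTOP) substituted.  E.g. mpn_nand_n amounts to this C sketch:
+C
+C   for (mp_size_t i = 0; i < n; i++)
+C     rp[i] = ~(up[i] & vp[i]);     /* LOGOP = and, POSTOP = mvn */
+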
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{ r4-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r6, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #4
+	tst	r6, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r6, n, #3
+	mov	n, n, lsr #2
+	tst	r6, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #4
+L(bx0):	tst	r6, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	POSTOP(	r9)
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(mid):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #16]!
+	ldrd	r6, r7, [vp, #16]!
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(mid):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #16]
+	ldrd	r6, r7, [vp, #16]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #16]
+	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	ldrd	r6, r7, [vp, #24]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #24]
+	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #32]!
+	ldrd	r6, r7, [vp, #32]!
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #32]!
+L(lo):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #24]
+	pop	{ r4-r9 }
+	bx	r14
+')
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/mul_1.asm b/third_party/gmp/mpn/arm/v7a/cora15/mul_1.asm
new file mode 100644
index 0000000..766ba5c
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/mul_1.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_mul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25			3.25
+C Cortex-A15	 2.25			this
+
+
+C This runs well on A15 but very poorly on A9.  By scheduling loads and adds
+C it is possible to get good A9 performance as well, but at the cost of using
+C many more (callee-saves) registers.
+
+C This is ARMv5 code, optimised for the ARMv7-A CPU Cortex-A15.  Its location in the
+C GMP file structure might be misleading.
+
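+C For reference, the operation computed (minimal C sketch assuming 32-bit
+C limbs; mpn_mul_1c seeds cy from a fifth argument instead of 0):
+C
+C   mp_limb_t mpn_mul_1 (mp_limb_t *rp, const mp_limb_t *up,
+C                        mp_size_t n, mp_limb_t v0)
+C   {
+C     mp_limb_t cy = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) up[i] * v0 + cy;
+C         rp[i] = (mp_limb_t) t;
+C         cy = (mp_limb_t) (t >> 32);
+C       }
+C     return cy;
+C   }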
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	ldr	r12, [sp]
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	mov	r12, #0
+L(ent):	push	{r4-r7}
+
+	ldr	r6, [up], #4
+	tst	n, #1
+	beq	L(bx0)
+
+L(bx1):	umull	r4, r7, r6, v0
+	adds	r4, r4, r12
+	tst	n, #2
+	beq	L(lo1)
+	b	L(lo3)
+
+L(bx0):	umull	r4, r5, r6, v0
+	adds	r4, r4, r12
+	tst	n, #2
+	beq	L(lo0)
+	b	L(lo2)
+
+L(top):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r5, r6, v0
+	adds	r4, r4, r7
+L(lo0):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r7, r6, v0
+	adcs	r4, r4, r5
+L(lo3):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r5, r6, v0
+	adcs	r4, r4, r7
+L(lo2):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r7, r6, v0
+	adcs	r4, r4, r5
+L(lo1):	adc	r7, r7, #0
+	subs	n, n, #4
+	bgt	L(top)
+
+	str	r4, [rp]
+	mov	r0, r7
+	pop	{r4-r7}
+	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..d8cfe3f
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH,		1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..b48204d
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH,		2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
new file mode 100644
index 0000000..51f93c1
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
@@ -0,0 +1,144 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25
+C Cortex-A15	 2.25
+
+C TODO
+C  * Consider using 4-way feed-in code.
+C  * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C    insufficiently for A7 and A8.
+
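+C For reference, mpn_addlsh<LSH>_n computes {rp,n} = {up,n} + ({vp,n} << LSH)
+C and returns the carry out; the sub and rsb variants are analogous.  A
+C per-limb C sketch for the add case (illustrative; the Neon code instead
+C shifts 64 bits at a time with vsli/vshr):
+C
+C   mp_limb_t cy = 0, sh = 0;
+C   for (mp_size_t i = 0; i < n; i++)
+C     {
+C       mp_limb_t s = (vp[i] << LSH) | sh;   /* shifted v limb */
+C       sh = vp[i] >> (32 - LSH);            /* bits for the next limb */
+C       uint64_t t = (uint64_t) up[i] + s + cy;
+C       rp[i] = (mp_limb_t) t;
+C       cy = (mp_limb_t) (t >> 32);
+C     }
+C   return cy + sh;                          /* 0 .. 2^LSH */
+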
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`DO_add', `
+  define(`ADCSBCS',	`adcs	$1, $2, $3')
+  define(`CLRCY',	`cmn	r13, #1')
+  define(`RETVAL',	`adc	r0, $1, #0')
+  define(`func',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADCSBCS',	`sbcs	$1, $2, $3')
+  define(`CLRCY',	`cmp	r13, #0')
+  define(`RETVAL',	`sbc	$2, $2, $2
+			cmn	$2, #1
+			adc	r0, $1, #0')
+  define(`func',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADCSBCS',	`sbcs	$1, $3, $2')
+  define(`CLRCY',	`cmp	r13, #0')
+  define(`RETVAL',	`sbc	r0, $1, #0')
+  define(`func',	mpn_rsblsh`'LSH`'_n)')
+
+
+ASM_START()
+PROLOGUE(func)
+	push	 {r4-r10}
+	vmov.i8	 d0, #0			C could feed carry through here
+	CLRCY
+	tst	n, #1
+	beq	L(bb0)
+
+L(bb1):	vld1.32	 {d3[0]}, [vp]!
+	vsli.u32 d0, d3, #LSH
+	ldr	 r12, [up], #4
+	vmov.32	 r5, d0[0]
+	vshr.u32 d0, d3, #32-LSH
+	ADCSBCS( r12, r12, r5)
+	str	 r12, [rp], #4
+	bics	 n, n, #1
+	beq	 L(rtn)
+
+L(bb0):	tst	n, #2
+	beq	L(b00)
+
+L(b10):	vld1.32	 {d3}, [vp]!
+	vsli.u64 d0, d3, #LSH
+	ldmia	 up!, {r10,r12}
+	vmov	 r4, r5, d0
+	vshr.u64 d0, d3, #64-LSH
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r10,r12}
+	bics	 n, n, #2
+	beq	 L(rtn)
+
+L(b00):	vld1.32	 {d2}, [vp]!
+	vsli.u64 d0, d2, #LSH
+	vshr.u64 d1, d2, #64-LSH
+	vld1.32	 {d3}, [vp]!
+	vsli.u64 d1, d3, #LSH
+	vmov	 r6, r7, d0
+	vshr.u64 d0, d3, #64-LSH
+	sub	 n, n, #4
+	tst	 n, n
+	beq	 L(end)
+
+	ALIGN(16)
+L(top):	ldmia	 up!, {r8,r9,r10,r12}
+	vld1.32	 {d2}, [vp]!
+	vsli.u64 d0, d2, #LSH
+	vmov	 r4, r5, d1
+	vshr.u64 d1, d2, #64-LSH
+	ADCSBCS( r8, r8, r6)
+	ADCSBCS( r9, r9, r7)
+	vld1.32	 {d3}, [vp]!
+	vsli.u64 d1, d3, #LSH
+	vmov	 r6, r7, d0
+	vshr.u64 d0, d3, #64-LSH
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r8,r9,r10,r12}
+	sub	 n, n, #4
+	tst	 n, n
+	bne	 L(top)
+
+L(end):	ldmia	 up!, {r8,r9,r10,r12}
+	vmov	 r4, r5, d1
+	ADCSBCS( r8, r8, r6)
+	ADCSBCS( r9, r9, r7)
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r8,r9,r10,r12}
+L(rtn):	vmov.32	 r0, d0[0]
+	RETVAL(	 r0, r1)
+	pop	 {r4-r10}
+	bx	 r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/com.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/com.asm
new file mode 100644
index 0000000..9e7a629
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/com.asm
@@ -0,0 +1,97 @@
+dnl  ARM Neon mpn_com optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.1
+C Cortex-A15	 0.65
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	cmp		n, #7
+	ble		L(bc)
+
+C Perform a few initial operations until rp is 128-bit aligned
+	tst		rp, #4
+	beq		L(al1)
+	vld1.32		{d0[0]}, [up]!
+	sub		n, n, #1
+	vmvn		d0, d0
+	vst1.32		{d0[0]}, [rp]!
+L(al1):	tst		rp, #8
+	beq		L(al2)
+	vld1.32		{d0}, [up]!
+	sub		n, n, #2
+	vmvn		d0, d0
+	vst1.32		{d0}, [rp:64]!
+L(al2):	vld1.32		{q2}, [up]!
+	subs		n, n, #12
+	blt		L(end)
+
+	ALIGN(16)
+L(top):	vld1.32		{q0}, [up]!
+	vmvn		q2, q2
+	subs		n, n, #8
+	vst1.32		{q2}, [rp:128]!
+	vld1.32		{q2}, [up]!
+	vmvn		q0, q0
+	vst1.32		{q0}, [rp:128]!
+	bge	L(top)
+
+L(end):	vmvn		q2, q2
+	vst1.32		{q2}, [rp:128]!
+
+C Handle the last 0-7 limbs.  Note that rp is aligned after the loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst		n, #4
+	beq		L(tl1)
+	vld1.32		{q0}, [up]!
+	vmvn		q0, q0
+	vst1.32		{q0}, [rp]!
+L(tl1):	tst		n, #2
+	beq		L(tl2)
+	vld1.32		{d0}, [up]!
+	vmvn		d0, d0
+	vst1.32		{d0}, [rp]!
+L(tl2):	tst		n, #1
+	beq		L(tl3)
+	vld1.32		{d0[0]}, [up]
+	vmvn		d0, d0
+	vst1.32		{d0[0]}, [rp]
+L(tl3):	bx		lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/copyd.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
new file mode 100644
index 0000000..98fe535
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/copyd.asm
@@ -0,0 +1,110 @@
+dnl  ARM Neon mpn_copyd optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.75		slower than core register code
+C Cortex-A15	 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
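+C mpn_copyd is a plain decreasing-index limb copy (C sketch):
+C
+C   for (mp_size_t i = n; i-- > 0; )
+C     rp[i] = up[i];
+C
+C done below 16 bytes at a time, stepping both pointers with a -16
+C register post-index.
+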
+ASM_START()
+PROLOGUE(mpn_copyd)
+	add	rp, rp, n, lsl #2
+	add	up, up, n, lsl #2
+
+	cmp	n, #7
+	ble	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tst	rp, #4
+	beq	L(al1)
+	sub	up, up, #4
+	vld1.32	{d22[0]}, [up]
+	sub	n, n, #1
+	sub	rp, rp, #4
+	vst1.32	{d22[0]}, [rp]
+L(al1):	tst	rp, #8
+	beq	L(al2)
+	sub	up, up, #8
+	vld1.32	{d22}, [up]
+	sub	n, n, #2
+	sub	rp, rp, #8
+	vst1.32	{d22}, [rp:64]
+L(al2):	sub	up, up, #16
+	vld1.32	{d26-d27}, [up]
+	subs	n, n, #12
+	sub	rp, rp, #16			C offset rp for loop
+	blt	L(end)
+
+	sub	up, up, #16			C offset up for loop
+	mov	r12, #-16
+
+	ALIGN(16)
+L(top):	vld1.32	{d22-d23}, [up], r12
+	vst1.32	{d26-d27}, [rp:128], r12
+	vld1.32	{d26-d27}, [up], r12
+	vst1.32	{d22-d23}, [rp:128], r12
+	subs	n, n, #8
+	bge	L(top)
+
+	add	up, up, #16			C undo up offset
+						C rp offset undoing folded
+L(end):	vst1.32	{d26-d27}, [rp:128]
+
+C Copy the last 0-7 limbs.  Note that rp is aligned after the loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst	n, #4
+	beq	L(tl1)
+	sub	up, up, #16
+	vld1.32	{d22-d23}, [up]
+	sub	rp, rp, #16
+	vst1.32	{d22-d23}, [rp]
+L(tl1):	tst	n, #2
+	beq	L(tl2)
+	sub	up, up, #8
+	vld1.32	{d22}, [up]
+	sub	rp, rp, #8
+	vst1.32	{d22}, [rp]
+L(tl2):	tst	n, #1
+	beq	L(tl3)
+	sub	up, up, #4
+	vld1.32	{d22[0]}, [up]
+	sub	rp, rp, #4
+	vst1.32	{d22[0]}, [rp]
+L(tl3):	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/copyi.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
new file mode 100644
index 0000000..2e05afe
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/copyi.asm
@@ -0,0 +1,90 @@
+dnl  ARM Neon mpn_copyi optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.75		slower than core register code
+C Cortex-A15	 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	cmp	n, #7
+	ble	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tst	rp, #4
+	beq	L(al1)
+	vld1.32	{d22[0]}, [up]!
+	sub	n, n, #1
+	vst1.32	{d22[0]}, [rp]!
+L(al1):	tst	rp, #8
+	beq	L(al2)
+	vld1.32	{d22}, [up]!
+	sub	n, n, #2
+	vst1.32	{d22}, [rp:64]!
+L(al2):	vld1.32	{d26-d27}, [up]!
+	subs	n, n, #12
+	blt	L(end)
+
+	ALIGN(16)
+L(top):	vld1.32	{d22-d23}, [up]!
+	vst1.32	{d26-d27}, [rp:128]!
+	vld1.32	{d26-d27}, [up]!
+	vst1.32	{d22-d23}, [rp:128]!
+	subs	n, n, #8
+	bge	L(top)
+
+L(end):	vst1.32	{d26-d27}, [rp:128]!
+
+C Copy the last 0-7 limbs.  Note that rp is aligned after the loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst	n, #4
+	beq	L(tl1)
+	vld1.32	{d22-d23}, [up]!
+	vst1.32	{d22-d23}, [rp]!
+L(tl1):	tst	n, #2
+	beq	L(tl2)
+	vld1.32	{d22}, [up]!
+	vst1.32	{d22}, [rp]!
+L(tl2):	tst	n, #1
+	beq	L(tl3)
+	vld1.32	{d22[0]}, [up]
+	vst1.32	{d22[0]}, [rp]
+L(tl3):	bx	lr
+EPILOGUE()
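
For reference, a minimal C sketch (not GMP code, names illustrative) of the
strategy the two NEON copy routines above implement, assuming 32-bit,
limb-aligned operands; copyd above is the same idea run downwards from the
top end:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t limb_t;

/* Incrementing copy sketch: peel 1 and/or 2 limbs until rp is 16-byte
   aligned, move 8 limbs per main-loop iteration (two 128-bit loads and
   stores in the asm), then dispatch the 0-7 limb tail on the bits of n.  */
void copyi_sketch (limb_t *rp, const limb_t *up, size_t n)
{
  if (n > 7)
    {
      if ((uintptr_t) rp & 4)                 /* make rp 8-byte aligned */
        { *rp++ = *up++; n -= 1; }
      if ((uintptr_t) rp & 8)                 /* make rp 16-byte aligned */
        { rp[0] = up[0]; rp[1] = up[1]; rp += 2; up += 2; n -= 2; }
      for (; n >= 8; n -= 8, rp += 8, up += 8)
        for (int i = 0; i < 8; i++)           /* vld1/vst1 pairs in the asm */
          rp[i] = up[i];
    }
  if (n & 4) { for (int i = 0; i < 4; i++) rp[i] = up[i]; rp += 4; up += 4; }
  if (n & 2) { rp[0] = up[0]; rp[1] = up[1]; rp += 2; up += 2; }
  if (n & 1) rp[0] = up[0];
}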
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/third_party/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
new file mode 100644
index 0000000..2c11d6d
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
@@ -0,0 +1,177 @@
+dnl  ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	4-5
+C Cortex-A15	 2.5
+
+C TODO
+C  * Try to make this smaller, its size (384 bytes) is excessive.
+C  * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C  * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C    insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUBS',	`adds	$1, $2, $3')
+  define(`ADCSBCS',	`adcs	$1, $2, $3')
+  define(`IFADD',	`$1')
+  define(`IFSUB',	`')
+  define(`func',	mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUBS',	`subs	$1, $2, $3')
+  define(`ADCSBCS',	`sbcs	$1, $2, $3')
+  define(`IFADD',	`')
+  define(`IFSUB',	`$1')
+  define(`func',	mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	 {r4-r10}
+
+	ands	r4, n, #3
+	beq	L(b00)
+	cmp	r4, #2
+	blo	L(b01)
+	beq	L(b10)
+
+L(b11):	ldmia	 up!, {r9,r10,r12}
+	ldmia	 vp!, {r5,r6,r7}
+	ADDSUBS( r9, r9, r5)
+	vmov	 d4, r9, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vshr.u64 d3, d4, #1
+	vmov	 d1, r10, r12
+	vsli.u64 d3, d1, #31
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3[0], [rp]!
+	bics	 n, n, #3
+	beq	 L(wd2)
+L(gt3):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	b	 L(mi0)
+
+L(b10):	ldmia	 up!, {r10,r12}
+	ldmia	 vp!, {r6,r7}
+	ADDSUBS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vmov	 d4, r10, r12
+	bics	 n, n, #2
+	vshr.u64 d2, d4, #1
+	beq	 L(wd2)
+L(gt2):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	b	 L(mi0)
+
+L(b01):	ldr	 r12, [up], #4
+	ldr	 r7, [vp], #4
+	ADDSUBS( r12, r12, r7)
+	vmov	 d4, r12, r12
+	bics	 n, n, #1
+	bne	 L(gt1)
+	mov	 r5, r12, lsr #1
+IFADD(`	adc	 r1, n, #0')
+IFSUB(`	adc	 r1, n, #1')
+	bfi	 r5, r1, #31, #1
+	str	 r5, [rp]
+	and	 r0, r12, #1
+	pop	 {r4-r10}
+	bx	 r14
+L(gt1):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	vshr.u64 d2, d4, #1
+	ADCSBCS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d0, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vsli.u64 d2, d0, #31
+	vshr.u64 d3, d0, #1
+	vst1.32	 d2[0], [rp]!
+	b	 L(mi1)
+
+L(b00):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	ADDSUBS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d4, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vshr.u64 d3, d4, #1
+	b	 L(mi1)
+
+	ALIGN(16)
+L(top):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	vsli.u64 d3, d1, #63
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3, [rp]!
+L(mi0):	ADCSBCS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d0, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vsli.u64 d2, d0, #63
+	vshr.u64 d3, d0, #1
+	vst1.32	 d2, [rp]!
+L(mi1):	vmov	 d1, r10, r12
+	sub	 n, n, #4
+	tst	 n, n
+	bne	 L(top)
+
+L(end):	vsli.u64 d3, d1, #63
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3, [rp]!
+L(wd2):	vmov	 r4, r5, d2
+IFADD(`	adc	 r1, n, #0')
+IFSUB(`	adc	 r1, n, #1')
+	bfi	 r5, r1, #31, #1
+	stm	 rp, {r4,r5}
+
+L(rtn):	vmov.32	 r0, d4[0]
+	and	 r0, r0, #1
+	pop	 {r4-r10}
+	bx	 r14
+EPILOGUE()
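
For reference, a minimal C sketch (not GMP code) of what mpn_rsh1add_n
computes, assuming 32-bit limbs and n >= 1; the rsh1sub_n variant is
analogous, with a borrow-propagating subtraction in place of the addition:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t limb_t;

/* rp[] = (up[] + vp[]) >> 1 over n limbs: the carry out of the top limb
   is shifted back in as the new most significant bit, and the bit
   shifted out at the bottom is returned.  */
limb_t rsh1add_n_sketch (limb_t *rp, const limb_t *up,
                         const limb_t *vp, size_t n)
{
  uint64_t s = (uint64_t) up[0] + vp[0];
  limb_t ret = (limb_t) s & 1;                 /* bit shifted out */
  limb_t prev = (limb_t) s;
  limb_t cy = (limb_t) (s >> 32);
  for (size_t i = 1; i < n; i++)
    {
      s = (uint64_t) up[i] + vp[i] + cy;
      cy = (limb_t) (s >> 32);
      limb_t cur = (limb_t) s;
      rp[i - 1] = (prev >> 1) | (cur << 31);   /* shift across limbs */
      prev = cur;
    }
  rp[n - 1] = (prev >> 1) | (cy << 31);        /* carry becomes msb */
  return ret;
}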
diff --git a/third_party/gmp/mpn/arm/v7a/cora15/submul_1.asm b/third_party/gmp/mpn/arm/v7a/cora15/submul_1.asm
new file mode 100644
index 0000000..ed7bfe8
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora15/submul_1.asm
@@ -0,0 +1,159 @@
+dnl  ARM mpn_submul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.75			3.75
+C Cortex-A15	 2.32			this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrence path separate from any multiply instructions.  It performs well
+C on A15, but not quite at the multiply bandwidth of the corresponding
+C addmul_1 code.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n * v
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8')  define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sub	sp, sp, #32
+	strd	r10, r11, [sp, #24]
+	strd	r8, r9, [sp, #16]
+	strd	r6, r7, [sp, #8]
+	strd	r4, r5, [sp, #0]
+C	push	{ r4-r11 }
+
+	ands	r6, n, #3
+	sub	n, n, #3
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	mov	r6, #0
+	ldr	u1, [up], #-4
+	ldr	w1, [rp], #-16
+	mvn	u1, u1
+	adds	r7, v0, #0
+	b	L(mid)
+
+L(b00):	ldrd	u0, u1, [up]
+	ldrd	w0, w1, [rp], #-12
+	mvn	u0, u0
+	mvn	u1, u1
+	mov	r6, v0
+	umaal	w0, r6, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r7, #0
+	str	w0, [rp, #12]
+	b	L(mid)
+
+L(b10):	ldrd	u0, u1, [up], #8
+	ldrd	w0, w1, [rp]
+	mvn	u0, u0
+	mvn	u1, u1
+	mov	r4, v0
+	umaal	w0, r4, u0, v0
+	mov	r5, #0
+	str	w0, [rp], #-4
+	umlal	w1, r5, u1, v0
+	adds	n, n, #0
+	bmi	L(end)
+	b	L(top)
+
+L(b01):	ldr	u1, [up], #4
+	ldr	w1, [rp], #-8
+	mvn	u1, u1
+	mov	r5, v0
+	mov	r4, #0
+	umaal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+
+C	ALIGN(16)
+L(top):	ldrd	u0, u1, [up, #0]
+	adcs	r4, r4, w1
+	mvn	u0, u0
+	ldrd	w0, w1, [rp, #12]
+	mvn	u1, u1
+	mov	r6, #0
+	umlal	w0, r6, u0, v0		C 1 2
+	adcs	r5, r5, w0
+	mov	r7, #0
+	strd	r4, r5, [rp, #8]
+L(mid):	umaal	w1, r7, u1, v0		C 2 3
+	ldrd	u0, u1, [up, #8]
+	add	up, up, #16
+	adcs	r6, r6, w1
+	mvn	u0, u0
+	ldrd	w0, w1, [rp, #20]
+	mvn	u1, u1
+	mov	r4, #0
+	umlal	w0, r4, u0, v0		C 3 4
+	adcs	r7, r7, w0
+	mov	r5, #0
+	strd	r6, r7, [rp, #16]!
+	sub	n, n, #4
+	umlal	w1, r5, u1, v0		C 0 1
+	tst	n, n
+	bpl	L(top)
+
+L(end):	adcs	r4, r4, w1
+	str	r4, [rp, #8]
+	adc	r0, r5, #0
+	sub	r0, v0, r0
+	pop	{ r4-r11 }
+	bx	r14
+EPILOGUE()
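
For reference, a minimal C sketch (not GMP code) of the complement trick
described in the header comment above, assuming 32-bit limbs.  With
U' = B^n - 1 - U (the limbwise one's complement), R - U*v = R + U'*v + v -
B^n * v, so the loop only ever adds, and the borrow to return is v minus
the carry out of the top:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t limb_t;

/* {rp,n} -= {up,n} * v; returns the borrow limb.  */
limb_t submul_1_sketch (limb_t *rp, const limb_t *up, size_t n, limb_t v)
{
  limb_t cy = v;                    /* the "+ v" correction term */
  for (size_t i = 0; i < n; i++)
    {
      /* ~u*v + r + cy is at most 2^64 - 1, so it fits in 64 bits.  */
      uint64_t t = (uint64_t) (limb_t) ~up[i] * v + rp[i] + cy;
      rp[i] = (limb_t) t;           /* low limbs of R - U*v */
      cy = (limb_t) (t >> 32);
    }
  return v - cy;                    /* borrow = v - carry out */
}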
diff --git a/third_party/gmp/mpn/arm/v7a/cora17/addmul_1.asm b/third_party/gmp/mpn/arm/v7a/cora17/addmul_1.asm
new file mode 100644
index 0000000..c11ed47
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora17/addmul_1.asm
@@ -0,0 +1,34 @@
+dnl  ARM mpn_addmul_1
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`arm/v6/addmul_1.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora17/gmp-mparam.h b/third_party/gmp/mpn/arm/v7a/cora17/gmp-mparam.h
new file mode 100644
index 0000000..143d4bc
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora17/gmp-mparam.h
@@ -0,0 +1,233 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1800 MHz Cortex-A17 with Neon (despite this file's non-neon directory) */
+/* FFT tuning limit = 51243975 */
+/* Generated by tuneup.c, 2019-10-29, gcc 6.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 54.08% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           45
+
+#define DIV_1_VS_MUL_1_PERCENT             248
+
+#define MUL_TOOM22_THRESHOLD                38
+#define MUL_TOOM33_THRESHOLD               132
+#define MUL_TOOM44_THRESHOLD               200
+#define MUL_TOOM6H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     179
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     145
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     191
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 62
+#define SQR_TOOM3_THRESHOLD                189
+#define SQR_TOOM4_THRESHOLD                354
+#define SQR_TOOM6_THRESHOLD                426
+#define SQR_TOOM8_THRESHOLD                608
+
+#define MULMID_TOOM42_THRESHOLD             62
+
+#define MULMOD_BNM1_THRESHOLD               21
+#define SQRMOD_BNM1_THRESHOLD               29
+
+#define MUL_FFT_MODF_THRESHOLD             595  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    595, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     43, 9}, {     23, 8}, {     55, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     83, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {    103,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,10}, {    111,11}, {     63,10}, \
+    {    143, 8}, {    575,10}, {    159,11}, {     95,10}, \
+    {    191, 9}, {    383, 8}, {    767, 9}, {    399, 8}, \
+    {    799,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511, 8}, {   1023, 9}, {    543, 8}, {   1087, 9}, \
+    {    575,10}, {    303,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351, 9}, \
+    {    703,10}, {    367, 9}, {    735,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415, 9}, {    831,10}, {    431, 9}, {    863,11}, \
+    {    223,10}, {    447,12}, {    127,10}, {    511, 9}, \
+    {   1023,10}, {    543, 9}, {   1087,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671, 9}, {   1343,11}, \
+    {    351,10}, {    735,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    863,11}, {    447,10}, \
+    {    895,13}, {    127,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    991,10}, \
+    {   1983,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1983,13}, {    511,12}, {   1087,11}, {   2239,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2559,13}, \
+    {   5247,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 194
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             500  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    500, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,10}, {    111,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383, 8}, {    767, 9}, {    399,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351, 9}, {    703,10}, {    367, 9}, \
+    {    735,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415, 9}, {    831,10}, \
+    {    431, 9}, {    863,10}, {    447,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    735,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,11}, {    447,10}, {    895,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,10}, {   1919,11}, {    991,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,11}, {   2943,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1983,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2495,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1535,12}, {   3071,13}, {   1663,12}, {   3455,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2559,13}, {   5119,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 199
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  27
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  26
+#define SQRLO_SQR_THRESHOLD               8907
+
+#define DC_DIV_QR_THRESHOLD                 38
+#define DC_DIVAPPR_Q_THRESHOLD             103
+#define DC_BDIV_QR_THRESHOLD                44
+#define DC_BDIV_Q_THRESHOLD                 98
+
+#define INV_MULMOD_BNM1_THRESHOLD           78
+#define INV_NEWTON_THRESHOLD               165
+#define INV_APPR_THRESHOLD                 115
+
+#define BINV_NEWTON_THRESHOLD              296
+#define REDC_1_TO_REDC_2_THRESHOLD           2
+#define REDC_2_TO_REDC_N_THRESHOLD         147
+
+#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIVAPPR_Q_THRESHOLD            2089
+#define MUPI_DIV_QR_THRESHOLD               70
+#define MU_BDIV_QR_THRESHOLD              1718
+#define MU_BDIV_Q_THRESHOLD               2089
+
+#define POWM_SEC_TABLE  7,19,107,480,1486
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        29
+#define SET_STR_DC_THRESHOLD               126
+#define SET_STR_PRECOMPUTE_THRESHOLD       541
+
+#define FAC_DSC_THRESHOLD                  132
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         30
+#define HGCD2_DIV1_METHOD                    1  /* 6.55% faster than 3 */
+#define HGCD_THRESHOLD                      54
+#define HGCD_APPR_THRESHOLD                 52
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   303
+#define GCDEXT_DC_THRESHOLD                225
+#define JACOBI_BASE_METHOD                   4  /* 9.73% faster than 1 */
+
+/* Tuneup completed successfully, took 111418 seconds */
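
For context, threshold constants like the ones above gate GMP's algorithm
selection by operand size.  A schematic, self-contained sketch (not the real
GMP dispatch, which has more algorithms and different signatures), using
three of the values tuned above:

#include <stdio.h>

/* Values from the Cortex-A17 table above.  */
#define MUL_TOOM22_THRESHOLD   38
#define MUL_TOOM33_THRESHOLD  132
#define MUL_FFT_THRESHOLD    6784

static const char *mul_algo (unsigned n)    /* n = operand size in limbs */
{
  if (n < MUL_TOOM22_THRESHOLD) return "basecase (schoolbook)";
  if (n < MUL_TOOM33_THRESHOLD) return "toom22 (Karatsuba)";
  if (n < MUL_FFT_THRESHOLD)    return "toom33 .. toom8h";
  return "fft";
}

int main (void)
{
  for (unsigned n = 16; n <= 8192; n *= 4)
    printf ("%4u limbs -> %s\n", n, mul_algo (n));
  return 0;
}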
diff --git a/third_party/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm b/third_party/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm
new file mode 100644
index 0000000..39e5a15
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora17/mod_34lsub1.asm
@@ -0,0 +1,121 @@
+dnl  ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2012, 2013, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A5	 2.67
+C Cortex-A7	 2.37
+C Cortex-A8	 2.34
+C Cortex-A9	 ?
+C Cortex-A15	 1.39
+C Cortex-A17	 1.60
+C Cortex-A53	 2.51
+
+define(`ap',	r0)
+define(`n',	r1)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * Write cleverer summation code.
+C  * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l.
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	push	{ r4, r5, r6, r7 }
+
+	subs	n, n, #3
+	mov	r7, #0
+	blt	L(le2)			C n <= 2
+
+	ldmia	ap!, { r2, r3, r12 }
+	subs	n, n, #3
+	blt	L(sum)			C n <= 5
+	mov	r7, #0
+	b	L(mid)
+
+L(top):	adds	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	adc	r7, r7, #0
+L(mid):	ldmia	ap!, { r4, r5, r6 }
+	subs	n, n, #3
+	bpl	L(top)
+
+	adds	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	adc	r7, r7, #0		C r7 <= 1
+
+L(sum):	cmn	n, #2
+	movlo	r4, #0
+	ldrhs	r4, [ap], #4
+	movls	r5, #0
+	ldrhi	r5, [ap], #4
+
+	adds	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, #0
+	adc	r7, r7, #0		C r7 <= 2
+
+L(sum2):
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	add	r0, r0, r7
+
+	mov	r7, r3, lsl #8
+	bic	r2, r7, #0xff000000
+	add	r0, r0, r2
+	add	r0, r0, r3, lsr #16
+
+	mov	r2, r12, lsl #16
+	bic	r1, r2, #0xff000000
+	add	r0, r0, r1
+	add	r0, r0, r12, lsr #8
+
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+
+L(le2):	cmn	n, #1
+	bne	L(1)
+	ldmia	ap!, { r2, r3 }
+	mov	r12, #0
+	b	L(sum2)
+L(1):	ldr	r2, [ap]
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+EPILOGUE()
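
For reference, a minimal C sketch (not GMP code) of the method above,
assuming 32-bit limbs: limbs are summed into a 96-bit accumulator (each
carry out of bit 96 counts as +1, since 2^96 == 1 mod 2^24-1), and each
32-bit lane is then folded into 24-bit pieces using 2^32 == 2^8 and
2^64 == 2^16 (mod 2^24-1).  Like the real routine, it returns a partial
remainder, merely congruent to {ap,n} mod 2^24-1:

#include <stddef.h>
#include <stdint.h>

typedef uint32_t limb_t;

limb_t mod_34lsub1_sketch (const limb_t *ap, size_t n)
{
  limb_t a0 = 0, a1 = 0, a2 = 0, wraps = 0;
  size_t i = 0;
  uint64_t t;
  limb_t c;

  for (; i + 3 <= n; i += 3)
    {
      t = (uint64_t) a0 + ap[i];          a0 = (limb_t) t;  c = (limb_t) (t >> 32);
      t = (uint64_t) a1 + ap[i + 1] + c;  a1 = (limb_t) t;  c = (limb_t) (t >> 32);
      t = (uint64_t) a2 + ap[i + 2] + c;  a2 = (limb_t) t;
      wraps += (limb_t) (t >> 32);                  /* 2^96 == 1 */
    }
  /* 0, 1 or 2 straggler limbs, added as one more (partial) round.  */
  {
    limb_t x0 = i < n ? ap[i++] : 0;
    limb_t x1 = i < n ? ap[i++] : 0;
    t = (uint64_t) a0 + x0;      a0 = (limb_t) t;  c = (limb_t) (t >> 32);
    t = (uint64_t) a1 + x1 + c;  a1 = (limb_t) t;  c = (limb_t) (t >> 32);
    t = (uint64_t) a2 + c;       a2 = (limb_t) t;
    wraps += (limb_t) (t >> 32);
  }
  return wraps
       + (a0 & 0xFFFFFF) + (a0 >> 24)               /* lane weight 2^0 */
       + (((a1 & 0xFFFF) << 8) + (a1 >> 16))        /* 2^32 == 2^8 */
       + (((a2 & 0xFF) << 16) + (a2 >> 8));         /* 2^64 == 2^16 */
}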
diff --git a/third_party/gmp/mpn/arm/v7a/cora17/mul_1.asm b/third_party/gmp/mpn/arm/v7a/cora17/mul_1.asm
new file mode 100644
index 0000000..d9b6042
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora17/mul_1.asm
@@ -0,0 +1,34 @@
+dnl  ARM mpn_mul_1
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mul_1)
+include_mpn(`arm/v6/mul_1.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora17/submul_1.asm b/third_party/gmp/mpn/arm/v7a/cora17/submul_1.asm
new file mode 100644
index 0000000..f3e8139
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora17/submul_1.asm
@@ -0,0 +1,34 @@
+dnl  ARM mpn_submul_1
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_submul_1)
+include_mpn(`arm/v6/submul_1.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora5/gmp-mparam.h b/third_party/gmp/mpn/arm/v7a/cora5/gmp-mparam.h
new file mode 100644
index 0000000..e3564e0
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora5/gmp-mparam.h
@@ -0,0 +1,205 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1500 MHz Cortex-A5 (odroid c1) */
+/* FFT tuning limit = 18,235,562 */
+/* Generated by tuneup.c, 2019-10-22, gcc 4.9 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     23
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 132.79% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           52
+
+#define DIV_1_VS_MUL_1_PERCENT             213
+
+#define MUL_TOOM22_THRESHOLD                48
+#define MUL_TOOM33_THRESHOLD               143
+#define MUL_TOOM44_THRESHOLD               262
+#define MUL_TOOM6H_THRESHOLD               414
+#define MUL_TOOM8H_THRESHOLD               527
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     153
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     168
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     152
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     180
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     226
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 66
+#define SQR_TOOM3_THRESHOLD                149
+#define SQR_TOOM4_THRESHOLD                348
+#define SQR_TOOM6_THRESHOLD                517
+#define SQR_TOOM8_THRESHOLD                608
+
+#define MULMID_TOOM42_THRESHOLD             70
+
+#define MULMOD_BNM1_THRESHOLD               26
+#define SQRMOD_BNM1_THRESHOLD               28
+
+#define MUL_FFT_MODF_THRESHOLD             660  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    660, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     37, 7}, {     19, 6}, {     40, 7}, {     21, 6}, \
+    {     43, 7}, {     37, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     51, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     71, 9}, {     39, 8}, \
+    {     83, 9}, {     47, 8}, {     99, 9}, {     55,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    511, 9}, {   1023,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    575,10}, {   1151,11}, {    607,12}, \
+    {    319,11}, {    703,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    895,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1183,12}, {    639,11}, \
+    {   1279,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1151,13}, {    639,12}, {   1407,13}, {    767,12}, \
+    {   1599,13}, {    895,12}, {   1791,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2367,13}, \
+    {   1279,12}, {   2559,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3071,13}, {   1663,12}, {   3327,13}, \
+    {   1791,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4351,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 140
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             590  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    590, 5}, {     33, 6}, {     17, 5}, {     35, 6}, \
+    {     36, 7}, {     19, 6}, {     40, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    575, 9}, {   1151,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    575,10}, \
+    {   1151,11}, {    607,12}, {    319,11}, {    735,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    927,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    639,11}, {   1279,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,11}, \
+    {   1663,12}, {    895,11}, {   1791,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1023,11}, {   2047,12}, \
+    {   1151,13}, {    639,12}, {   1407,13}, {    767,12}, \
+    {   1599,13}, {    895,12}, {   1791,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2367,13}, \
+    {   1279,12}, {   2559,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3071,13}, {   1663,12}, {   3327,13}, \
+    {   1791,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4351,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 144
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  39
+#define MULLO_MUL_N_THRESHOLD            14709
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                  33
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 36
+#define DC_DIVAPPR_Q_THRESHOLD             116
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                140
+
+#define INV_MULMOD_BNM1_THRESHOLD           95
+#define INV_NEWTON_THRESHOLD               181
+#define INV_APPR_THRESHOLD                 125
+
+#define BINV_NEWTON_THRESHOLD              327
+#define REDC_1_TO_REDC_2_THRESHOLD           0  /* always */
+#define REDC_2_TO_REDC_N_THRESHOLD         152
+
+#define MU_DIV_QR_THRESHOLD               2350
+#define MU_DIVAPPR_Q_THRESHOLD            2130
+#define MUPI_DIV_QR_THRESHOLD               98
+#define MU_BDIV_QR_THRESHOLD              1970
+#define MU_BDIV_Q_THRESHOLD               2172
+
+#define POWM_SEC_TABLE  6,37,108,624,2351
+
+#define GET_STR_DC_THRESHOLD                28
+#define GET_STR_PRECOMPUTE_THRESHOLD        44
+#define SET_STR_DC_THRESHOLD               309
+#define SET_STR_PRECOMPUTE_THRESHOLD       762
+
+#define FAC_DSC_THRESHOLD                  236
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         25
+#define HGCD2_DIV1_METHOD                    5  /* 2.92% faster than 3 */
+#define HGCD_THRESHOLD                      70
+#define HGCD_APPR_THRESHOLD                 59
+#define HGCD_REDUCE_THRESHOLD             4120
+#define GCD_DC_THRESHOLD                   229
+#define GCDEXT_DC_THRESHOLD                233
+#define JACOBI_BASE_METHOD                   1  /* 17.07% faster than 4 */
+
+/* Tuneup completed successfully, took 47845 seconds */
diff --git a/third_party/gmp/mpn/arm/v7a/cora7/gmp-mparam.h b/third_party/gmp/mpn/arm/v7a/cora7/gmp-mparam.h
new file mode 100644
index 0000000..78de045
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora7/gmp-mparam.h
@@ -0,0 +1,202 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 900 MHz Cortex-A7 (raspberry pi2) */
+/* FFT tuning limit = 21,559,921 */
+/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     18
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 64.16% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           48
+
+#define DIV_1_VS_MUL_1_PERCENT             216
+
+#define MUL_TOOM22_THRESHOLD                39
+#define MUL_TOOM33_THRESHOLD               129
+#define MUL_TOOM44_THRESHOLD               196
+#define MUL_TOOM6H_THRESHOLD               327
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     144
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     190
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 52
+#define SQR_TOOM3_THRESHOLD                162
+#define SQR_TOOM4_THRESHOLD                268
+#define SQR_TOOM6_THRESHOLD                399
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               21
+#define SQRMOD_BNM1_THRESHOLD               25
+
+#define MUL_FFT_MODF_THRESHOLD             636  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    636, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     83, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,10}, {    831,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    735,12}, {    383,11}, \
+    {    863,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1279,12}, {    703,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1215,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1663,13}, {    895,12}, {   1855,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1407,14}, {    767,13}, {   1663,12}, \
+    {   3327,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2431,14}, {   1279,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 133
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             535  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    535, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     15, 5}, {     31, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415, 9}, {    831,11}, \
+    {    223,12}, {    127,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,10}, {    831,13}, {    127,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    735,12}, {    383,11}, {    863,12}, \
+    {    447,11}, {    991,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1279,12}, \
+    {    703,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    831,11}, {   1663,12}, {    959,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1663,13}, {    895,12}, {   1855,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,14}, {    767,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2431,14}, {   1279,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 134
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  27
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             5
+#define SQRLO_DC_THRESHOLD                  31
+#define SQRLO_SQR_THRESHOLD               9449
+
+#define DC_DIV_QR_THRESHOLD                 28
+#define DC_DIVAPPR_Q_THRESHOLD              90
+#define DC_BDIV_QR_THRESHOLD                32
+#define DC_BDIV_Q_THRESHOLD                110
+
+#define INV_MULMOD_BNM1_THRESHOLD           78
+#define INV_NEWTON_THRESHOLD               134
+#define INV_APPR_THRESHOLD                  98
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_2_THRESHOLD           4
+#define REDC_2_TO_REDC_N_THRESHOLD         123
+
+#define MU_DIV_QR_THRESHOLD               1718
+#define MU_DIVAPPR_Q_THRESHOLD            1685
+#define MUPI_DIV_QR_THRESHOLD               62
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               1718
+
+#define POWM_SEC_TABLE  1,22,95,563,1955
+
+#define GET_STR_DC_THRESHOLD                28
+#define GET_STR_PRECOMPUTE_THRESHOLD        51
+#define SET_STR_DC_THRESHOLD               182
+#define SET_STR_PRECOMPUTE_THRESHOLD       638
+
+#define FAC_DSC_THRESHOLD                  153
+#define FAC_ODD_THRESHOLD                   56
+
+#define MATRIX22_STRASSEN_THRESHOLD         25
+#define HGCD2_DIV1_METHOD                    1  /* 5.04% faster than 3 */
+#define HGCD_THRESHOLD                      55
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   153
+#define GCDEXT_DC_THRESHOLD                180
+#define JACOBI_BASE_METHOD                   1  /* 30.60% faster than 4 */
+
+/* Tuneup completed successfully, took 75202 seconds */
diff --git a/third_party/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm b/third_party/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm
new file mode 100644
index 0000000..e74b260
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora8/bdiv_q_1.asm
@@ -0,0 +1,158 @@
+dnl  ARM v6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+dnl  This is v6 code, but it also runs well on the v7a Cortex-A8, A9, and A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb
+C               norm   unorm
+C 1176		 -	 -
+C Cortex-A5	 9	13
+C Cortex-A7	12	18
+C Cortex-A8	13	14
+C Cortex-A9	 9	10		not measured since latest edits
+C Cortex-A15	 7	 7
+C Cortex-A53	16	24
+
+C Architecture requirements:
+C v5	-
+C v5t	clz
+C v5te	-
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp',  `r0')
+define(`up',  `r1')
+define(`n',   `r2')
+define(`d',   `r3')
+define(`di_arg',  `sp[0]')		C	just mpn_pi1_bdiv_q_1
+define(`cnt_arg', `sp[4]')		C	just mpn_pi1_bdiv_q_1
+
+define(`cy',  `r7')
+define(`cnt', `r6')
+define(`tnc', `r4')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_q_1)
+	push	{r6-r11}
+
+	rsb	r10, d, #0
+	and	r10, r10, d
+	clz	r10, r10
+	rsbs	cnt, r10, #31		C count_trailing_zeros
+	mov	d, d, lsr cnt
+
+C binvert limb
+	LEA(	r10, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r10, [r10, r12, lsr #1]
+	mul	r12, r10, r10
+	mul	r12, d, r12
+	rsb	r12, r12, r10, lsl #1
+	mul	r10, r12, r12
+	mul	r10, d, r10
+	rsb	r10, r10, r12, lsl #1	C r10 = inverse
+	b	L(pi1)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	push	{r6-r11}
+
+	ldr	cnt, [sp, #28]
+	ldr	r10, [sp, #24]
+	cmp	cnt, #0
+
+L(pi1):	ldr	r11, [up], #4		C up[0]
+	mov	cy, #0
+	rsb	r8, r10, #0		C r8 = -inverse
+	bne	L(unorm)
+
+L(norm):
+	subs	n, n, #1
+	mul	r11, r11, r10
+	beq	L(edn)
+
+	ALIGN(16)
+L(tpn):	ldr	r9, [up], #4
+	mov	r12, #0
+	str	r11, [rp], #4
+	umaal	r12, cy, r11, d
+	mul	r11, r9, r10
+	mla	r11, cy, r8, r11
+	subs	n, n, #1
+	bne	L(tpn)
+
+L(edn):	str	r11, [rp]
+	pop	{r6-r11}
+	bx	r14
+
+L(unorm):
+	push	{r4-r5}
+	rsb	tnc, cnt, #32
+	mov	r5, r11, lsr cnt
+	subs	n, n, #1
+	beq	L(ed1)
+
+	ldr	r12, [up], #4
+	orr	r9, r5, r12, lsl tnc
+	mov	r5, r12, lsr cnt
+	mul	r11, r9, r10
+	subs	n, n, #1
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r5, r12, lsl tnc
+	mov	r5, r12, lsr cnt
+	mov	r12, #0
+	str	r11, [rp], #4
+	umaal	r12, cy, r11, d
+	mul	r11, r9, r10
+	mla	r11, cy, r8, r11
+	subs	n, n, #1
+	bne	L(tpu)
+
+L(edu):	str	r11, [rp], #4
+	mov	r12, #0
+	umaal	r12, cy, r11, d
+	mul	r11, r5, r10
+	mla	r11, cy, r8, r11
+	str	r11, [rp]
+	pop	{r4-r11}
+	bx	r14
+
+L(ed1):	mul	r11, r5, r10
+	str	r11, [rp]
+	pop	{r4-r11}
+	bx	r14
+EPILOGUE()
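
For reference, a minimal C sketch (not GMP code) of the "binvert limb" step
above, assuming 32-bit limbs.  The asm seeds 8 correct bits from
binvert_limb_table and then applies two Newton steps; the sketch below
instead seeds 5 bits with the well-known (3*d) XOR 2 trick, so it needs
three steps.  Each step x <- x*(2 - d*x) doubles the number of correct low
bits:

#include <stdint.h>

typedef uint32_t limb_t;

/* Inverse of odd d modulo 2^32, by Newton/Hensel lifting: if x*d == 1
   (mod 2^k) then x*(2 - d*x)*d == 1 (mod 2^2k).  */
limb_t binvert_limb_sketch (limb_t d)   /* d must be odd */
{
  limb_t x = (d * 3) ^ 2;               /* seed: correct to 5 low bits */
  x *= 2 - d * x;                       /* 10 bits */
  x *= 2 - d * x;                       /* 20 bits */
  x *= 2 - d * x;                       /* 40 (>= 32) bits */
  return x;                             /* x * d == 1 (mod 2^32) */
}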
diff --git a/third_party/gmp/mpn/arm/v7a/cora8/gmp-mparam.h b/third_party/gmp/mpn/arm/v7a/cora8/gmp-mparam.h
new file mode 100644
index 0000000..5864841
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora8/gmp-mparam.h
@@ -0,0 +1,207 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1000 MHz Cortex-A8 (beaglebone black) */
+/* FFT tuning limit = 9,464,348 */
+/* Generated by tuneup.c, 2019-10-23, gcc 6.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD     MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 50.65% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           31
+
+#define DIV_1_VS_MUL_1_PERCENT             192
+
+#define MUL_TOOM22_THRESHOLD                39
+#define MUL_TOOM33_THRESHOLD               129
+#define MUL_TOOM44_THRESHOLD               226
+#define MUL_TOOM6H_THRESHOLD               366
+#define MUL_TOOM8H_THRESHOLD               620
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     141
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     193
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                145
+#define SQR_TOOM4_THRESHOLD                375
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             38
+
+#define MULMOD_BNM1_THRESHOLD               22
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     87,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    199,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383, 8}, {    767, 9}, {    399,10}, {    207,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351, 9}, {    703,10}, {    367,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    863,11}, \
+    {    447,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1663,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1407,13}, {    767,12}, \
+    {   1663,13}, {    895,12}, {   1791,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 139
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             436  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    436, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     43, 9}, {     23, 8}, {     55,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {    103,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159, 8}, \
+    {    319, 9}, {    167,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    767, 9}, \
+    {    399,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,10}, {    303,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351, 9}, \
+    {    703,10}, {    367,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,11}, \
+    {    351,10}, {    735,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    863,11}, {    447,10}, \
+    {    895,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1663,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1215,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1663,13}, {    895,12}, {   1855,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 152
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD            21
+#define MULLO_DC_THRESHOLD                   0  /* never mpn_mullo_basecase */
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                  17
+#define SQRLO_SQR_THRESHOLD               7246
+
+#define DC_DIV_QR_THRESHOLD                 27
+#define DC_DIVAPPR_Q_THRESHOLD              74
+#define DC_BDIV_QR_THRESHOLD                21
+#define DC_BDIV_Q_THRESHOLD                 64
+
+#define INV_MULMOD_BNM1_THRESHOLD           78
+#define INV_NEWTON_THRESHOLD                31
+#define INV_APPR_THRESHOLD                  37
+
+#define BINV_NEWTON_THRESHOLD              167
+#define REDC_1_TO_REDC_2_THRESHOLD           4
+#define REDC_2_TO_REDC_N_THRESHOLD         198
+
+#define MU_DIV_QR_THRESHOLD               1858
+#define MU_DIVAPPR_Q_THRESHOLD            1685
+#define MUPI_DIV_QR_THRESHOLD               43
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1685
+
+#define POWM_SEC_TABLE  1,13,96,487,1378
+
+#define GET_STR_DC_THRESHOLD                18
+#define GET_STR_PRECOMPUTE_THRESHOLD        36
+#define SET_STR_DC_THRESHOLD               145
+#define SET_STR_PRECOMPUTE_THRESHOLD       505
+
+#define FAC_DSC_THRESHOLD                  137
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         24
+#define HGCD2_DIV1_METHOD                    5  /* 4.29% faster than 4 */
+#define HGCD_THRESHOLD                      39
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   116
+#define GCDEXT_DC_THRESHOLD                124
+#define JACOBI_BASE_METHOD                   4  /* 5.89% faster than 1 */
+
+/* Tuneup completed successfully, took 48230 seconds */
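
The constants above are operand-size crossover points measured by GMP's tuneup program; mpn-level routines compare the limb count n against them to pick the cheapest algorithm for this CPU. A minimal sketch of how such a threshold is consumed (illustrative only: basecase_mul, toom22_mul and mul_n are hypothetical stand-ins, not GMP's real dispatch code):

    #include <stddef.h>

    /* Hypothetical stand-ins for GMP-internal multiplication routines. */
    void basecase_mul (unsigned long *rp, const unsigned long *up,
                       const unsigned long *vp, size_t n);
    void toom22_mul   (unsigned long *rp, const unsigned long *up,
                       const unsigned long *vp, size_t n);

    #define MUL_TOOM22_THRESHOLD 39    /* crossover from the table above */

    /* Multiply two n-limb operands, picking the cheaper algorithm. */
    static void mul_n (unsigned long *rp, const unsigned long *up,
                       const unsigned long *vp, size_t n)
    {
      if (n < MUL_TOOM22_THRESHOLD)
        basecase_mul (rp, up, vp, n);  /* schoolbook, O(n^2) */
      else
        toom22_mul (rp, up, vp, n);    /* Karatsuba-style 2x2 split */
    }

The MUL_FFT_TABLE3/SQR_FFT_TABLE3 entries serve the same purpose at larger sizes: each {size, k} pair records the operand size from which FFT parameter k is preferred.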
diff --git a/third_party/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm b/third_party/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm
new file mode 100644
index 0000000..245b371
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora9/bdiv_q_1.asm
@@ -0,0 +1,36 @@
+dnl  ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`arm/v7a/cora8/bdiv_q_1.asm')
diff --git a/third_party/gmp/mpn/arm/v7a/cora9/gmp-mparam.h b/third_party/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
new file mode 100644
index 0000000..5c54012
--- /dev/null
+++ b/third_party/gmp/mpn/arm/v7a/cora9/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2015 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1000 MHz Cortex-A9 */
+/* FFT tuning limit = 25 M */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define DIV_1_VS_MUL_1_PERCENT             190
+
+#define MUL_TOOM22_THRESHOLD                45
+#define MUL_TOOM33_THRESHOLD               129
+#define MUL_TOOM44_THRESHOLD               387
+#define MUL_TOOM6H_THRESHOLD               537
+#define MUL_TOOM8H_THRESHOLD               774
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     141
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     237
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     141
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     258
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     211
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 64
+#define SQR_TOOM3_THRESHOLD                189
+#define SQR_TOOM4_THRESHOLD                517
+#define SQR_TOOM6_THRESHOLD                656
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             62
+
+#define MULMOD_BNM1_THRESHOLD               23
+#define SQRMOD_BNM1_THRESHOLD               28
+
+#define MUL_FFT_MODF_THRESHOLD             630  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    630, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     35, 6}, {     36, 7}, {     19, 6}, \
+    {     40, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     25, 6}, {     51, 7}, {     27, 6}, \
+    {     55, 7}, {     29, 8}, {     15, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 7}, {     57, 9}, {     15, 8}, {     31, 7}, \
+    {     65, 8}, {     35, 7}, {     71, 8}, {     43, 9}, \
+    {     23, 8}, {     55, 9}, {     31, 8}, {     71, 9}, \
+    {     39, 8}, {     83, 9}, {     47, 8}, {     99, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {    103,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    167,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    159,11}, {     95,10}, \
+    {    191, 9}, {    383,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    511, 9}, {   1023,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    735,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    927,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,11}, \
+    {   1663,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1023,11}, {   2047,12}, {   1151,13}, {    639,12}, \
+    {   1407,13}, {    767,12}, {   1663,13}, {    895,12}, \
+    {   1791,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2559,13}, \
+    {   1407,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1663,12}, {   3455,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2047,12}, {   4095,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2559,12}, \
+    {   5119,13}, {   2815,12}, {   5631,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 157
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    565, 5}, {     19, 4}, {     40, 5}, {     21, 4}, \
+    {     43, 5}, {     28, 6}, {     15, 5}, {     35, 6}, \
+    {     29, 7}, {     15, 6}, {     37, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     51, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     65, 8}, {     35, 7}, {     71, 8}, {     43, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {    103,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511, 8}, {   1023, 9}, \
+    {    527,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    511, 9}, {   1023,10}, \
+    {    543,11}, {    287,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    735,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    927,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,11}, \
+    {   1663,12}, {    959,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1151,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1599,13}, {    895,12}, {   1791,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2559,13}, {   1407,14}, \
+    {    767,13}, {   1535,12}, {   3071,13}, {   1663,12}, \
+    {   3455,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2047,12}, {   4095,13}, {   2175,12}, {   4479,13}, \
+    {   2303,14}, {   1279,13}, {   2559,12}, {   5119,13}, \
+    {   2815,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 155
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  37
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD            12
+#define SQRLO_DC_THRESHOLD                  22
+#define SQRLO_SQR_THRESHOLD              10950
+
+#define DC_DIV_QR_THRESHOLD                 32
+#define DC_DIVAPPR_Q_THRESHOLD              99
+#define DC_BDIV_QR_THRESHOLD                43
+#define DC_BDIV_Q_THRESHOLD                102
+
+#define INV_MULMOD_BNM1_THRESHOLD           88
+#define INV_NEWTON_THRESHOLD               141
+#define INV_APPR_THRESHOLD                 111
+
+#define BINV_NEWTON_THRESHOLD              312
+#define REDC_1_TO_REDC_2_THRESHOLD           6
+#define REDC_2_TO_REDC_N_THRESHOLD         140
+
+#define MU_DIV_QR_THRESHOLD               2492
+#define MU_DIVAPPR_Q_THRESHOLD            2130
+#define MUPI_DIV_QR_THRESHOLD               55
+#define MU_BDIV_QR_THRESHOLD              2130
+#define MU_BDIV_Q_THRESHOLD               2172
+
+#define POWM_SEC_TABLE  40,53,56,71,1985
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        33
+#define SET_STR_DC_THRESHOLD               172
+#define SET_STR_PRECOMPUTE_THRESHOLD       671
+
+#define FAC_DSC_THRESHOLD                  309
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         24
+#define HGCD_THRESHOLD                      61
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             4120
+#define GCD_DC_THRESHOLD                   408
+#define GCDEXT_DC_THRESHOLD                303
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/arm64/aors_n.asm b/third_party/gmp/mpn/arm64/aors_n.asm
new file mode 100644
index 0000000..b1f3bb9
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/aors_n.asm
@@ -0,0 +1,125 @@
+dnl  ARM64 mpn_add_n and mpn_sub_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	2.75-3.25
+C Cortex-A57	 1.5
+C X-Gene	 2.0
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`vp', `x2')
+define(`n',  `x3')
+
+ifdef(`OPERATION_add_n', `
+  define(`ADDSUBC',	adcs)
+  define(`CLRCY',	`cmn	xzr, xzr')
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`cset	x0, cs')
+  define(`func_n',	mpn_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(`ADDSUBC',	sbcs)
+  define(`CLRCY',	`cmp	xzr, xzr')
+  define(`SETCY',	`cmp	xzr, $1')
+  define(`RETVAL',	`cset	x0, cc')
+  define(`func_n',	mpn_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	SETCY(	x4)
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(func_n)
+	CLRCY
+L(ent):	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x7, [up]
+	ldr	x11, [vp]
+	ADDSUBC	x13, x7, x11
+	str	x13, [rp],#8
+	tbnz	n, #1, L(b11)
+
+L(b01):	cbz	x18, L(ret)
+	ldp	x4, x5, [up,#8]
+	ldp	x8, x9, [vp,#8]
+	sub	up, up, #8
+	sub	vp, vp, #8
+	b	L(mid)
+
+L(b11):	ldp	x6, x7, [up,#8]
+	ldp	x10, x11, [vp,#8]
+	add	up, up, #8
+	add	vp, vp, #8
+	cbz	x18, L(end)
+	b	L(top)
+
+L(bx0):	tbnz	n, #1, L(b10)
+
+L(b00):	ldp	x4, x5, [up]
+	ldp	x8, x9, [vp]
+	sub	up, up, #16
+	sub	vp, vp, #16
+	b	L(mid)
+
+L(b10):	ldp	x6, x7, [up]
+	ldp	x10, x11, [vp]
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#16]
+	ldp	x8, x9, [vp,#16]
+	ADDSUBC	x12, x6, x10
+	ADDSUBC	x13, x7, x11
+	stp	x12, x13, [rp],#16
+L(mid):	ldp	x6, x7, [up,#32]!
+	ldp	x10, x11, [vp,#32]!
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	stp	x12, x13, [rp],#16
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	ADDSUBC	x12, x6, x10
+	ADDSUBC	x13, x7, x11
+	stp	x12, x13, [rp]
+L(ret):	RETVAL
+	ret
+EPILOGUE()
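
mpn_add_n and mpn_sub_n add or subtract two n-limb operands, least-significant limb first, and return the carry (borrow) out of the top limb. The code above keeps the carry in the flags via adcs/sbcs and retires four limbs per iteration, with the L(bx0)/L(bx1) blocks feeding in the n mod 4 leftovers. For reference, a plain-C sketch of the computed function (ref_add_n is an illustrative name, not a GMP entry point):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + vp[] over n limbs; returns the carry out of the top. */
    static uint64_t ref_add_n (uint64_t *rp, const uint64_t *up,
                               const uint64_t *vp, size_t n)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t u = up[i], s = u + vp[i];
          uint64_t c1 = s < u;           /* carry from u + v */
          rp[i] = s + cy;
          cy = c1 | (rp[i] < s);         /* plus carry from adding cy */
        }
      return cy;
    }

The explicit c1 bookkeeping here is exactly what the flags-carried adcs chain makes free in the assembly.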
diff --git a/third_party/gmp/mpn/arm64/aorsmul_1.asm b/third_party/gmp/mpn/arm64/aorsmul_1.asm
new file mode 100644
index 0000000..fd3c36e
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/aorsmul_1.asm
@@ -0,0 +1,140 @@
+dnl  ARM64 mpn_addmul_1 and mpn_submul_1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013, 2015, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	9.3-9.8
+C Cortex-A57	 7.0
+C X-Gene	 5.0
+
+C NOTES
+C  * It is possible to keep the carry chain alive between the addition blocks
+C    and thus avoid csinc, but only for addmul_1.  Since that saves no time
+C    on the tested pipelines, we keep addmul_1 and submul_1 similar.
+C  * We could separate feed-in into 4 blocks, one for each residue (mod 4).
+C    That is likely to save a few cycles.
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`n',  `x2')
+define(`v0', `x3')
+
+ifdef(`OPERATION_addmul_1', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`COND',	`cc')
+  define(`func',	mpn_addmul_1)')
+ifdef(`OPERATION_submul_1', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`COND',	`cs')
+  define(`func',	mpn_submul_1)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+PROLOGUE(func)
+	adds	x15, xzr, xzr
+
+	tbz	n, #0, L(1)
+
+	ldr	x4, [up],#8
+	mul	x8, x4, v0
+	umulh	x12, x4, v0
+	ldr	x4, [rp]
+	ADDSUB	x8, x4, x8
+	csinc	x15, x12, x12, COND
+	str	x8, [rp],#8
+
+L(1):	tbz	n, #1, L(2)
+
+	ldp	x4, x5, [up],#16
+	mul	x8, x4, v0
+	umulh	x12, x4, v0
+	mul	x9, x5, v0
+	umulh	x13, x5, v0
+	adds	x8, x8, x15
+	adcs	x9, x9, x12
+	ldp	x4, x5, [rp]
+	adc	x15, x13, xzr
+	ADDSUB	x8, x4, x8
+	ADDSUBC	x9, x5, x9
+	csinc	x15, x15, x15, COND
+	stp	x8, x9, [rp],#16
+
+L(2):	lsr	n, n, #2
+	cbz	n, L(le3)
+	ldp	x4, x5, [up],#32
+	ldp	x6, x7, [up,#-16]
+	b	L(mid)
+L(le3):	mov	x0, x15
+	ret
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up],#32
+	ldp	x6, x7, [up,#-16]
+	ADDSUB	x8, x16, x8
+	ADDSUBC	x9, x17, x9
+	stp	x8, x9, [rp],#32
+	ADDSUBC	x10, x12, x10
+	ADDSUBC	x11, x13, x11
+	stp	x10, x11, [rp,#-16]
+	csinc	x15, x15, x15, COND
+L(mid):	sub	n, n, #1
+	mul	x8, x4, v0
+	umulh	x12, x4, v0
+	mul	x9, x5, v0
+	umulh	x13, x5, v0
+	adds	x8, x8, x15
+	mul	x10, x6, v0
+	umulh	x14, x6, v0
+	adcs	x9, x9, x12
+	mul	x11, x7, v0
+	umulh	x15, x7, v0
+	adcs	x10, x10, x13
+	ldp	x16, x17, [rp]
+	adcs	x11, x11, x14
+	ldp	x12, x13, [rp,#16]
+	adc	x15, x15, xzr
+	cbnz	n, L(top)
+
+	ADDSUB	x8, x16, x8
+	ADDSUBC	x9, x17, x9
+	ADDSUBC	x10, x12, x10
+	ADDSUBC	x11, x13, x11
+	stp	x8, x9, [rp]
+	stp	x10, x11, [rp,#16]
+	csinc	x0, x15, x15, COND
+	ret
+EPILOGUE()
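
mpn_addmul_1 and mpn_submul_1 compute rp[] +/-= up[] * v0 over n limbs and return the limb that carries out of the top. Each mul/umulh pair above forms one 128-bit partial product, the adds/adcs chain folds the running carry through the low halves, and csinc folds the add/sub carry back into the high half. A plain-C sketch of the addmul case (ref_addmul_1 is an illustrative name; unsigned __int128 assumes GCC or Clang):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] += up[] * v0 over n limbs; returns the carry-out limb. */
    static uint64_t ref_addmul_1 (uint64_t *rp, const uint64_t *up,
                                  size_t n, uint64_t v0)
    {
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t =
            (unsigned __int128) up[i] * v0 + rp[i] + cy;
          rp[i] = (uint64_t) t;          /* low half, like mul */
          cy = (uint64_t) (t >> 64);     /* high half, like umulh */
        }
      return cy;
    }

The worst case (2^64-1)^2 + 2*(2^64-1) still fits in 128 bits, so t never overflows.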
diff --git a/third_party/gmp/mpn/arm64/aorsorrlsh1_n.asm b/third_party/gmp/mpn/arm64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..c617a67
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,		1)
+define(RSH,		63)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/arm64/aorsorrlsh2_n.asm b/third_party/gmp/mpn/arm64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..852d117
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,		2)
+define(RSH,		62)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/arm64/aorsorrlshC_n.asm b/third_party/gmp/mpn/arm64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..168caad
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/aorsorrlshC_n.asm
@@ -0,0 +1,139 @@
+dnl  ARM64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	3.25-3.75
+C Cortex-A57	 2.18
+C X-Gene	 2.5
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`vp', `x2')
+define(`n',  `x3')
+
+ifdef(`DO_add', `
+  define(`ADDSUB',	`adds	$1, $2, $3')
+  define(`ADDSUBC',	`adcs	$1, $2, $3')
+  define(`CLRRCY',	`adds	$1, xzr, xzr')
+  define(`RETVAL',	`adc	x0, $1, xzr')
+  define(`func_n',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUB',	`subs	$1, $3, $2')
+  define(`ADDSUBC',	`sbcs	$1, $3, $2')
+  define(`CLRRCY',	`subs	$1, xzr, xzr')
+  define(`RETVAL',	`cinc	x0, $1, cc')
+  define(`func_n',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUB',	`subs	$1, $2, $3')
+  define(`ADDSUBC',	`sbcs	$1, $2, $3')
+  define(`CLRRCY',	`subs	$1, xzr, xzr')
+  define(`RETVAL',	`sbc	x0, $1, xzr')
+  define(`func_n',	mpn_rsblsh`'LSH`'_n)')
+
+ASM_START()
+PROLOGUE(func_n)
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x5, [up]
+	tbnz	n, #1, L(b11)
+
+L(b01):	ldr	x11, [vp]
+	cbz	x18, L(1)
+	ldp	x8, x9, [vp,#8]
+	lsl	x13, x11, #LSH
+	ADDSUB(	x15, x13, x5)
+	str	x15, [rp],#8
+	sub	up, up, #24
+	sub	vp, vp, #8
+	b	L(mid)
+
+L(1):	lsl	x13, x11, #LSH
+	ADDSUB(	x15, x13, x5)
+	str	x15, [rp]
+	lsr	x0, x11, RSH
+	RETVAL(	 x0, x1)
+	ret
+
+L(b11):	ldr	x9, [vp]
+	ldp	x10, x11, [vp,#8]!
+	lsl	x13, x9, #LSH
+	ADDSUB(	x17, x13, x5)
+	str	x17, [rp],#8
+	sub	up, up, #8
+	cbz	x18, L(end)
+	b	L(top)
+
+L(bx0):	tbnz	n, #1, L(b10)
+
+L(b00):	CLRRCY(	x11)
+	ldp	x8, x9, [vp],#-16
+	sub	up, up, #32
+	b	L(mid)
+
+L(b10):	CLRRCY(	x9)
+	ldp	x10, x11, [vp]
+	sub	up, up, #16
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#16]
+	extr	x12, x10, x9, #RSH
+	ldp	x8, x9, [vp,#16]
+	extr	x13, x11, x10, #RSH
+	ADDSUBC(x14, x12, x4)
+	ADDSUBC(x15, x13, x5)
+	stp	x14, x15, [rp],#16
+L(mid):	ldp	x4, x5, [up,#32]!
+	extr	x12, x8, x11, #RSH
+	ldp	x10, x11, [vp,#32]!
+	extr	x13, x9, x8, #RSH
+	ADDSUBC(x16, x12, x4)
+	ADDSUBC(x17, x13, x5)
+	stp	x16, x17, [rp],#16
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	ldp	x4, x5, [up,#16]
+	extr	x12, x10, x9, #RSH
+	extr	x13, x11, x10, #RSH
+	ADDSUBC(x14, x12, x4)
+	ADDSUBC(x15, x13, x5)
+	stp	x14, x15, [rp]
+	lsr	x0, x11, RSH
+	RETVAL(	 x0, x1)
+	ret
+EPILOGUE()
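
This common body computes rp[] = up[] +/- (vp[] << LSH), or the reversed subtraction, for the fixed shift LSH with RSH = 64 - LSH; the extr instructions splice each vp limb with its predecessor to form the shifted stream on the fly. A plain-C sketch of the add flavour (ref_addlsh_n is an illustrative name; lsh is assumed to lie in 1..63, as in the lsh1/lsh2 wrappers above):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + (vp[] << lsh); returns the bits shifted out of the
       top limb plus the final add carry, matching the add-case RETVAL. */
    static uint64_t ref_addlsh_n (uint64_t *rp, const uint64_t *up,
                                  const uint64_t *vp, size_t n,
                                  unsigned lsh)
    {
      unsigned rsh = 64 - lsh;
      uint64_t prev = 0, cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t sh = (vp[i] << lsh) | (prev >> rsh);  /* like extr */
          uint64_t s = up[i] + sh;
          uint64_t c1 = s < sh;
          rp[i] = s + cy;
          cy = c1 | (rp[i] < s);
          prev = vp[i];
        }
      return (prev >> rsh) + cy;
    }

The sub and rsb flavours differ only in operand order and in how the final borrow folds into the return value, which is what the three RETVAL definitions select.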
diff --git a/third_party/gmp/mpn/arm64/bdiv_dbm1c.asm b/third_party/gmp/mpn/arm64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..78984b4
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/bdiv_dbm1c.asm
@@ -0,0 +1,111 @@
+dnl  ARM64 mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2011, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 8
+C Cortex-A57	 7
+C X-Gene	 4.25
+
+define(`qp',	  `x0')
+define(`up',	  `x1')
+define(`n',	  `x2')
+define(`bd',	  `x3')
+define(`cy',	  `x4')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	ldr	x5, [up], #8
+	ands	x6, n, #3
+	b.eq	L(fi0)
+	cmp	x6, #2
+	b.cc	L(fi1)
+	b.eq	L(fi2)
+
+L(fi3):	mul	x12, x5, bd
+	umulh	x13, x5, bd
+	ldr	x5, [up], #8
+	b	L(lo3)
+
+L(fi0):	mul	x10, x5, bd
+	umulh	x11, x5, bd
+	ldr	x5, [up], #8
+	b	L(lo0)
+
+L(fi1):	subs	n, n, #1
+	mul	x12, x5, bd
+	umulh	x13, x5, bd
+	b.ls	L(wd1)
+	ldr	x5, [up], #8
+	b	L(lo1)
+
+L(fi2):	mul	x10, x5, bd
+	umulh	x11, x5, bd
+	ldr	x5, [up], #8
+	b	L(lo2)
+
+L(top):	ldr	x5, [up], #8
+	subs	x4, x4, x10
+	str	x4, [qp], #8
+	sbc	x4, x4, x11
+L(lo1):	mul	x10, x5, bd
+	umulh	x11, x5, bd
+	ldr	x5, [up], #8
+	subs	x4, x4, x12
+	str	x4, [qp], #8
+	sbc	x4, x4, x13
+L(lo0):	mul	x12, x5, bd
+	umulh	x13, x5, bd
+	ldr	x5, [up], #8
+	subs	x4, x4, x10
+	str	x4, [qp], #8
+	sbc	x4, x4, x11
+L(lo3):	mul	x10, x5, bd
+	umulh	x11, x5, bd
+	ldr	x5, [up], #8
+	subs	x4, x4, x12
+	str	x4, [qp], #8
+	sbc	x4, x4, x13
+L(lo2):	subs	n, n, #4
+	mul	x12, x5, bd
+	umulh	x13, x5, bd
+	b.hi	L(top)
+
+L(wd2):	subs	x4, x4, x10
+	str	x4, [qp], #8
+	sbc	x4, x4, x11
+L(wd1):	subs	x4, x4, x12
+	str	x4, [qp]
+	sbc	x0, x4, x13
+	ret
+EPILOGUE()
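
The unrolled loop above repeats one step per limb: multiply the limb by bd, subtract the low product half from the running value with subs, store, then subtract the high half together with the borrow via sbc (the "dbm1" in the name refers to divisors of B-1, with B = 2^64, which is how GMP's exact division by small constants such as 3 uses this primitive). A direct C transcription of that per-limb step (ref_bdiv_dbm1c is an illustrative name; unsigned __int128 assumes GCC or Clang):

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t ref_bdiv_dbm1c (uint64_t *qp, const uint64_t *up,
                                    size_t n, uint64_t bd, uint64_t c)
    {
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * bd;
          uint64_t plo = (uint64_t) p;           /* mul   */
          uint64_t phi = (uint64_t) (p >> 64);   /* umulh */
          uint64_t borrow = c < plo;             /* flag set by subs */
          c -= plo;
          qp[i] = c;                             /* str */
          c = c - phi - borrow;                  /* sbc */
        }
      return c;
    }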
diff --git a/third_party/gmp/mpn/arm64/bdiv_q_1.asm b/third_party/gmp/mpn/arm64/bdiv_q_1.asm
new file mode 100644
index 0000000..2e189b8
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/bdiv_q_1.asm
@@ -0,0 +1,128 @@
+dnl  ARM64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb
+C               norm   unorm
+C Cortex-A53	12	15
+C Cortex-A57	12	12
+C Cortex-A72
+C Cortex-A73
+C X-Gene	11	11
+
+C TODO
+C  * Scheduling of umulh later in the unorm loop brings A53 time to 12 c/l.
+C    Unfortunately, that requires software pipelining.
+
+define(`rp',  `x0')
+define(`up',  `x1')
+define(`n',   `x2')
+define(`d',   `x3')
+define(`di',  `x4')		C	just mpn_pi1_bdiv_q_1
+define(`cnt', `x5')		C	just mpn_pi1_bdiv_q_1
+
+define(`cy',  `x7')
+define(`tnc', `x8')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_q_1)
+
+	rbit	x6, d
+	clz	cnt, x6
+	lsr	d, d, cnt
+
+ifdef(`PIC',`
+	adrp	x7, :got:__gmp_binvert_limb_table
+	ubfx	x6, d, 1, 7
+	ldr	x7, [x7, #:got_lo12:__gmp_binvert_limb_table]
+',`
+	adrp	x7, __gmp_binvert_limb_table
+	ubfx	x6, d, 1, 7
+	add	x7, x7, :lo12:__gmp_binvert_limb_table
+')
+	ldrb	w6, [x7, x6]
+	ubfiz	x7, x6, 1, 8
+	umull	x6, w6, w6
+	msub	x6, x6, d, x7
+	lsl	x7, x6, 1
+	mul	x6, x6, x6
+	msub	x6, x6, d, x7
+	lsl	x7, x6, 1
+	mul	x6, x6, x6
+	msub	di, x6, d, x7
+
+	b	mpn_pi1_bdiv_q_1
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	sub	n, n, #1
+	subs	x6, x6, x6		C zero x6 and set C (clear borrow)
+	ldr	x9, [up],#8
+	cbz	cnt, L(norm)
+
+L(unorm):
+	lsr	x12, x9, cnt
+	cbz	n, L(eu1)
+	sub	tnc, xzr, cnt
+
+L(tpu):	ldr	x9, [up],#8
+	lsl	x7, x9, tnc
+	orr	x7, x7, x12
+	sbcs	x6, x7, x6
+	mul	x7, x6, di
+	str	x7, [rp],#8
+	lsr	x12, x9, cnt
+	umulh	x6, x7, d
+	sub	n, n, #1
+	cbnz	n, L(tpu)
+
+L(eu1):	sbcs	x6, x12, x6
+	mul	x6, x6, di
+	str	x6, [rp]
+	ret
+
+L(norm):
+	mul	x5, x9, di
+	str	x5, [rp],#8
+	cbz	n, L(en1)
+
+L(tpn):	ldr	x9, [up],#8
+	umulh	x5, x5, d
+	sbcs	x5, x9, x5
+	mul	x5, x5, di
+	str	x5, [rp],#8
+	sub	n, n, #1
+	cbnz	n, L(tpn)
+
+L(en1):	ret
+EPILOGUE()
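
mpn_bdiv_q_1 performs Hensel (2-adic) division by a one-limb divisor d: it shifts out d's trailing zeros (the rbit/clz/lsr at entry), then needs di = d^-1 mod 2^64, which the ldrb/umull/msub sequence builds from the 8-bit seed in __gmp_binvert_limb_table by Newton steps that double the number of correct low bits (8 -> 16 -> 32 -> 64). A plain-C sketch of the same iteration (binvert_limb_ref is an illustrative name; it seeds with d itself, which is correct mod 8 for odd d, instead of using the table, so it needs five steps):

    #include <stdint.h>

    /* For odd d, return di with d * di == 1 (mod 2^64). */
    static uint64_t binvert_limb_ref (uint64_t d)
    {
      uint64_t inv = d;             /* odd d: d*d == 1 (mod 8), 3 good bits */
      for (int i = 0; i < 5; i++)   /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        inv *= 2 - d * inv;         /* Newton step, as in the msub pairs */
      return inv;
    }

With di in hand each quotient limb is just a multiply, q = c * di, and umulh of q and d recovers the borrow into the next limb, which is what the L(tpn) and L(tpu) loops above do for the normalized and unnormalized cases.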
diff --git a/third_party/gmp/mpn/arm64/cnd_aors_n.asm b/third_party/gmp/mpn/arm64/cnd_aors_n.asm
new file mode 100644
index 0000000..39e6cd3
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/cnd_aors_n.asm
@@ -0,0 +1,129 @@
+dnl  ARM64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	3.87-4.37
+C Cortex-A57	 1.75
+C X-Gene	 2.0
+
+changecom(blah)
+
+define(`cnd',	`x0')
+define(`rp',	`x1')
+define(`up',	`x2')
+define(`vp',	`x3')
+define(`n',	`x4')
+
+ifdef(`OPERATION_cnd_add_n', `
+  define(`ADDSUBC',	adcs)
+  define(`CLRCY',	`cmn	xzr, xzr')
+  define(`RETVAL',	`cset	x0, cs')
+  define(`func',	mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+  define(`ADDSUBC',	sbcs)
+  define(`CLRCY',	`cmp	xzr, xzr')
+  define(`RETVAL',	`cset	x0, cc')
+  define(`func',	mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd
+
+	CLRCY
+
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x13, [vp]
+	ldr	x11, [up]
+	bic	x7, x13, cnd
+	ADDSUBC	x9, x11, x7
+	str	x9, [rp]
+	tbnz	n, #1, L(b11)
+
+L(b01):	cbz	x18, L(rt)
+	ldp	x12, x13, [vp,#8]
+	ldp	x10, x11, [up,#8]
+	sub	up, up, #8
+	sub	vp, vp, #8
+	sub	rp, rp, #24
+	b	L(mid)
+
+L(b11):	ldp	x12, x13, [vp,#8]!
+	ldp	x10, x11, [up,#8]!
+	sub	rp, rp, #8
+	cbz	x18, L(end)
+	b	L(top)
+
+L(bx0):	ldp	x12, x13, [vp]
+	ldp	x10, x11, [up]
+	tbnz	n, #1, L(b10)
+
+L(b00):	sub	up, up, #16
+	sub	vp, vp, #16
+	sub	rp, rp, #32
+	b	L(mid)
+
+L(b10):	sub	rp, rp, #16
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	bic	x6, x12, cnd
+	bic	x7, x13, cnd
+	ldp	x12, x13, [vp,#16]
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	ldp	x10, x11, [up,#16]
+	stp	x8, x9, [rp,#16]
+L(mid):	bic	x6, x12, cnd
+	bic	x7, x13, cnd
+	ldp	x12, x13, [vp,#32]!
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	ldp	x10, x11, [up,#32]!
+	stp	x8, x9, [rp,#32]!
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	bic	x6, x12, cnd
+	bic	x7, x13, cnd
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	stp	x8, x9, [rp,#16]
+L(rt):	RETVAL
+	ret
+EPILOGUE()
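
mpn_cnd_add_n and mpn_cnd_sub_n apply the addition or subtraction only when cnd is nonzero, without ever branching on cnd: the cmp/sbc pair at entry expands cnd into a mask that is all ones exactly when cnd is zero, and bic then blanks every vp limb, so the instruction stream and memory traffic are independent of the condition (the point of these routines in side-channel-sensitive code such as modular exponentiation). A plain-C sketch using the complementary mask convention (ref_cnd_add_n is an illustrative name):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + (cnd ? vp[] : 0), branch-free; returns the carry. */
    static uint64_t ref_cnd_add_n (uint64_t cnd, uint64_t *rp,
                                   const uint64_t *up, const uint64_t *vp,
                                   size_t n)
    {
      uint64_t mask = -(uint64_t) (cnd != 0);  /* 0 or all ones */
      uint64_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t v = vp[i] & mask;  /* the asm keeps the inverse mask
                                         and clears with bic instead */
          uint64_t s = up[i] + v;
          uint64_t c1 = s < v;
          rp[i] = s + cy;
          cy = c1 | (rp[i] < s);
        }
      return cy;
    }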
diff --git a/third_party/gmp/mpn/arm64/com.asm b/third_party/gmp/mpn/arm64/com.asm
new file mode 100644
index 0000000..63ad249
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/com.asm
@@ -0,0 +1,84 @@
+dnl  ARM64 mpn_com.
+
+dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 2.25
+C Cortex-A57	 1.25
+C X-Gene	 1.75
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`n',  `x2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	cmp	n, #3
+	b.le	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tbz	rp, #3, L(al2)
+	ld1	{v22.1d}, [up], #8
+	sub	n, n, #1
+	mvn	v22.8b, v22.8b
+	st1	{v22.1d}, [rp], #8
+
+L(al2):	ld1	{v26.2d}, [up], #16
+	subs	n, n, #6
+	b.lt	L(end)
+
+	ALIGN(16)
+L(top):	ld1	{v22.2d}, [up], #16
+	mvn	v26.16b, v26.16b
+	st1	{v26.2d}, [rp], #16
+	ld1	{v26.2d}, [up], #16
+	mvn	v22.16b, v22.16b
+	st1	{v22.2d}, [rp], #16
+	subs	n, n, #4
+	b.ge	L(top)
+
+L(end):	mvn	v26.16b, v26.16b
+	st1	{v26.2d}, [rp], #16
+
+C Copy last 0-3 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tbz	n, #1, L(tl1)
+	ld1	{v22.2d}, [up], #16
+	mvn	v22.16b, v22.16b
+	st1	{v22.2d}, [rp], #16
+L(tl1):	tbz	n, #0, L(tl2)
+	ld1	{v22.1d}, [up]
+	mvn	v22.8b, v22.8b
+	st1	{v22.1d}, [rp]
+L(tl2):	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm64/copyd.asm b/third_party/gmp/mpn/arm64/copyd.asm
new file mode 100644
index 0000000..c8001a3
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/copyd.asm
@@ -0,0 +1,93 @@
+dnl  ARM64 mpn_copyd.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 ?
+C Cortex-A57	 ?
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`n',  `x2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	add	rp, rp, n, lsl #3
+	add	up, up, n, lsl #3
+
+	cmp	n, #3
+	b.le	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tbz	rp, #3, L(al2)
+	sub	up, up, #8
+	ld1	{v22.1d}, [up]
+	sub	n, n, #1
+	sub	rp, rp, #8
+	st1	{v22.1d}, [rp]
+
+L(al2):	sub	up, up, #16
+	ld1	{v26.2d}, [up]
+	sub	n, n, #6
+	sub	rp, rp, #16			C offset rp for loop
+	tbnz	n, #63, L(end)
+
+	sub	up, up, #16			C offset up for loop
+	mov	x12, #-16
+
+	ALIGN(16)
+L(top):	ld1	{v22.2d}, [up], x12
+	st1	{v26.2d}, [rp], x12
+	ld1	{v26.2d}, [up], x12
+	st1	{v22.2d}, [rp], x12
+	sub	n, n, #4
+	tbz	n, #63, L(top)
+
+	add	up, up, #16			C undo up offset
+
+L(end):	st1	{v26.2d}, [rp]
+
+C Copy last 0-3 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tbz	n, #1, L(tl1)
+	sub	up, up, #16
+	ld1	{v22.2d}, [up]
+	sub	rp, rp, #16
+	st1	{v22.2d}, [rp]
+L(tl1):	tbz	n, #0, L(tl2)
+	sub	up, up, #8
+	ld1	{v22.1d}, [up]
+	sub	rp, rp, #8
+	st1	{v22.1d}, [rp]
+L(tl2):	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm64/copyi.asm b/third_party/gmp/mpn/arm64/copyi.asm
new file mode 100644
index 0000000..cfb90c7
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/copyi.asm
@@ -0,0 +1,78 @@
+dnl  ARM64 mpn_copyi.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 2
+C Cortex-A57	 1
+C X-Gene	 1.25
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`n',  `x2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	cmp	n, #3
+	b.le	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tbz	rp, #3, L(al2)
+	ld1	{v22.1d}, [up], #8
+	sub	n, n, #1
+	st1	{v22.1d}, [rp], #8
+
+L(al2):	ld1	{v26.2d}, [up], #16
+	sub	n, n, #6
+	tbnz	n, #63, L(end)
+
+	ALIGN(16)
+L(top):	ld1	{v22.2d}, [up], #16
+	st1	{v26.2d}, [rp], #16
+	ld1	{v26.2d}, [up], #16
+	st1	{v22.2d}, [rp], #16
+	sub	n, n, #4
+	tbz	n, #63, L(top)
+
+L(end):	st1	{v26.2d}, [rp], #16
+
+C Copy last 0-3 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tbz	n, #1, L(tl1)
+	ld1	{v22.2d}, [up], #16
+	st1	{v22.2d}, [rp], #16
+L(tl1):	tbz	n, #0, L(tl2)
+	ld1	{v22.1d}, [up]
+	st1	{v22.1d}, [rp]
+L(tl2):	ret
+EPILOGUE()
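
mpn_copyi and mpn_copyd (this file and the previous one) copy n limbs in ascending and descending order respectively; both exist because the copy direction determines which overlapping layouts are safe, exactly like the two directions hidden inside memmove. Reference semantics in plain C (the ref_ names are illustrative):

    #include <stdint.h>
    #include <stddef.h>

    /* Ascending copy: safe when rp <= up, or with no overlap. */
    static void ref_copyi (uint64_t *rp, const uint64_t *up, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = up[i];
    }

    /* Descending copy: safe when rp >= up, or with no overlap. */
    static void ref_copyd (uint64_t *rp, const uint64_t *up, size_t n)
    {
      for (size_t i = n; i-- > 0; )
        rp[i] = up[i];
    }

The NEON bodies above add an alignment prologue (one scalar limb until rp is 16-byte aligned) and then move two limbs per ld1/st1 pair, which is what the cycles/limb estimates in the file headers measure.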
diff --git a/third_party/gmp/mpn/arm64/cora53/cnd_aors_n.asm b/third_party/gmp/mpn/arm64/cora53/cnd_aors_n.asm
new file mode 100644
index 0000000..1b227da
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/cora53/cnd_aors_n.asm
@@ -0,0 +1,99 @@
+dnl  ARM64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	3.5-4
+C Cortex-A57	 2.25
+C X-Gene	 3.5
+
+changecom(blah)
+
+define(`cnd',	`x0')
+define(`rp',	`x1')
+define(`up',	`x2')
+define(`vp',	`x3')
+define(`n',	`x4')
+
+ifdef(`OPERATION_cnd_add_n', `
+  define(`ADDSUBC',	adcs)
+  define(`CLRCY',	`cmn	xzr, xzr')
+  define(`RETVAL',	`cset	x0, cs')
+  define(`func',	mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+  define(`ADDSUBC',	sbcs)
+  define(`CLRCY',	`cmp	xzr, xzr')
+  define(`RETVAL',	`cset	x0, cc')
+  define(`func',	mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd
+
+	CLRCY				C really only needed for n = 0 (mod 4)
+
+	tbz	n, #0, L(1)
+	ldr	x10, [up], #8
+	ldr	x12, [vp], #8
+	bic	x6, x12, cnd
+	ADDSUBC	x8, x10, x6
+	sub	n, n, #1
+	str	x8, [rp], #8
+	cbz	n, L(rt)
+
+L(1):	ldp	x10, x11, [up], #16
+	ldp	x12, x13, [vp], #16
+	sub	n, n, #2
+	cbz	n, L(end)
+
+L(top):	bic	x6, x12, cnd
+	bic	x7, x13, cnd
+	ldp	x12, x13, [vp], #16
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	ldp	x10, x11, [up], #16
+	sub	n, n, #2
+	stp	x8, x9, [rp], #16
+	cbnz	n, L(top)
+
+L(end):	bic	x6, x12, cnd
+	bic	x7, x13, cnd
+	ADDSUBC	x8, x10, x6
+	ADDSUBC	x9, x11, x7
+	stp	x8, x9, [rp]
+L(rt):	RETVAL
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm64/cora53/gmp-mparam.h b/third_party/gmp/mpn/arm64/cora53/gmp-mparam.h
new file mode 100644
index 0000000..f4e258d
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/cora53/gmp-mparam.h
@@ -0,0 +1,242 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file for a53.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1536 MHz Cortex-A53 */
+/* FFT tuning limit = 21,583,800 */
+/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_1P_METHOD                      2  /* 4.84% faster than 1 */
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1  /* 39.05% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             21
+#define DIV_QR_1_UNNORM_THRESHOLD           21
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           38
+
+#define DIV_1_VS_MUL_1_PERCENT             161
+
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                49
+#define MUL_TOOM44_THRESHOLD                73
+#define MUL_TOOM6H_THRESHOLD               173
+#define MUL_TOOM8H_THRESHOLD               236
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      77
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      65
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                 68
+#define SQR_TOOM4_THRESHOLD                183
+#define SQR_TOOM6_THRESHOLD                230
+#define SQR_TOOM8_THRESHOLD                357
+
+#define MULMID_TOOM42_THRESHOLD             23
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               11
+
+#define MUL_FFT_MODF_THRESHOLD             316  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    316, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     17, 8}, \
+    {      9, 7}, {     20, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     83,10}, {     47, 9}, {     99,10}, {     55,11}, \
+    {     31,10}, {     63, 8}, {    255,10}, {     71, 8}, \
+    {    287,10}, {     79, 9}, {    159, 8}, {    319,10}, \
+    {     87,11}, {     47,10}, {     95, 9}, {    191, 8}, \
+    {    383,10}, {    103, 9}, {    207, 8}, {    415,10}, \
+    {    111, 9}, {    223,12}, {     31,11}, {     63, 9}, \
+    {    255, 8}, {    511,10}, {    135, 9}, {    287, 8}, \
+    {    575,11}, {     79,10}, {    159, 9}, {    319, 8}, \
+    {    639,10}, {    175, 9}, {    351, 8}, {    703,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    767,10}, \
+    {    207, 9}, {    415, 8}, {    831,10}, {    223, 9}, \
+    {    447,12}, {     63,10}, {    255, 9}, {    511, 8}, \
+    {   1023, 9}, {    543,10}, {    287, 9}, {    575, 8}, \
+    {   1151,11}, {    159,10}, {    319, 9}, {    639,11}, \
+    {    175,10}, {    351, 9}, {    703, 8}, {   1407,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,13}, {     63,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    575, 9}, {   1151,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703, 9}, \
+    {   1407, 8}, {   2815,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    447,10}, {    895,11}, {    479,10}, {    959, 9}, \
+    {   1919,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,12}, {    287,11}, {    575,10}, \
+    {   1151,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,10}, {   1407, 9}, {   2815,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,10}, \
+    {   1663,12}, {    447,11}, {    895,10}, {   1791,12}, \
+    {    479,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,13}, {    319,12}, {    703,11}, {   1407,10}, \
+    {   2815,13}, {    383,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1087,13}, \
+    {    575,12}, {   1151,13}, {    703,12}, {   1407,11}, \
+    {   2815,14}, {    383,13}, {    831,12}, {   1663,13}, \
+    {    895,12}, {   1791,13}, {    959,12}, {   1919,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1215,14}, {    639,13}, {   1407,12}, {   2815,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 217
+#define MUL_FFT_THRESHOLD                 3200
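+
+/* Editor's note on the table format (an informal reading, not from the
+   generator): each MUL_FFT_TABLE3 entry {n,k} says "from about n limbs
+   upward, use FFT splitting parameter k", i.e. split the transform into
+   2^k pieces; the FFT code scans the table to pick k for a given size,
+   and MUL_FFT_THRESHOLD is the size from which FFT multiplication beats
+   Toom altogether.  */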
+
+#define SQR_FFT_MODF_THRESHOLD             276  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    276, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     17, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     15, 7}, {     31, 8}, {     19, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95, 8}, {    191,10}, \
+    {     55,11}, {     31,10}, {     63, 8}, {    255,10}, \
+    {     71, 9}, {    143, 8}, {    287,10}, {     79, 9}, \
+    {    159,11}, {     47,10}, {     95, 9}, {    191, 8}, \
+    {    383, 7}, {    767,10}, {    103,12}, {     31,11}, \
+    {     63, 9}, {    255, 8}, {    511, 7}, {   1023,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831,10}, \
+    {    223, 9}, {    447,12}, {     63,10}, {    255, 9}, \
+    {    511, 8}, {   1023,11}, {    143,10}, {    287, 9}, \
+    {    575, 8}, {   1151,11}, {    159,10}, {    319, 9}, \
+    {    639,11}, {    175,10}, {    351, 9}, {    703,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,13}, {     63,11}, {    255,10}, {    511, 9}, \
+    {   1023,11}, {    287,10}, {    575, 9}, {   1151,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703, 9}, {   1407,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    447,10}, {    895,11}, {    479,10}, {    959,12}, \
+    {    255,11}, {    511,10}, {   1023,12}, {    287,11}, \
+    {    575,10}, {   1151,12}, {    319,11}, {    639,12}, \
+    {    351,11}, {    703,10}, {   1407,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,10}, \
+    {   1663,12}, {    447,11}, {    895,12}, {    479,11}, \
+    {    959,10}, {   1919,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,13}, {    319,12}, {    703,11}, {   1407,10}, \
+    {   2815,13}, {    383,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1151,13}, {    703,12}, {   1407,11}, {   2815,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    895,12}, \
+    {   1791,13}, {    959,12}, {   1919,15}, {    255,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1151,14}, \
+    {    639,13}, {   1407,12}, {   2815,14}, {    767,13}, \
+    {   1663,14}, {    895,13}, {   1919,12}, {   3839,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,12}, {   4863,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 204
+#define SQR_FFT_THRESHOLD                 2688
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD             6253
+#define SQRLO_BASECASE_THRESHOLD             4
+#define SQRLO_DC_THRESHOLD                  67
+#define SQRLO_SQR_THRESHOLD               5240
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             155
+#define DC_BDIV_QR_THRESHOLD                39
+#define DC_BDIV_Q_THRESHOLD                 89
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               163
+#define INV_APPR_THRESHOLD                 161
+
+#define BINV_NEWTON_THRESHOLD              196
+#define REDC_1_TO_REDC_N_THRESHOLD          43
+
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD             998
+#define MUPI_DIV_QR_THRESHOLD               91
+#define MU_BDIV_QR_THRESHOLD               807
+#define MU_BDIV_Q_THRESHOLD                924
+
+#define POWM_SEC_TABLE  6,30,125,579,1730
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        30
+#define SET_STR_DC_THRESHOLD               802
+#define SET_STR_PRECOMPUTE_THRESHOLD      1815
+
+#define FAC_DSC_THRESHOLD                  258
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         10
+#define HGCD2_DIV1_METHOD                    1  /* 7.05% faster than 3 */
+#define HGCD_THRESHOLD                     107
+#define HGCD_APPR_THRESHOLD                112
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   324
+#define GCDEXT_DC_THRESHOLD                242
+#define JACOBI_BASE_METHOD                   4  /* 22.41% faster than 1 */
+
+/* Tuneup completed successfully, took 66624 seconds */
diff --git a/third_party/gmp/mpn/arm64/cora57/gmp-mparam.h b/third_party/gmp/mpn/arm64/cora57/gmp-mparam.h
new file mode 100644
index 0000000..0d38621
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/cora57/gmp-mparam.h
@@ -0,0 +1,187 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file for a57, a72-a75.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1800 MHz Cortex-A72 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-10-02, gcc 7.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_1P_METHOD                      1  /* 2.21% faster than 2 */
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        42
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     15
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1  /* 34.95% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD            5
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           33
+
+#define DIV_1_VS_MUL_1_PERCENT             168
+
+#define MUL_TOOM22_THRESHOLD                10
+#define MUL_TOOM33_THRESHOLD                41
+#define MUL_TOOM44_THRESHOLD                99
+#define MUL_TOOM6H_THRESHOLD               142
+#define MUL_TOOM8H_THRESHOLD               199
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      63
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      66
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      55
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                 65
+#define SQR_TOOM4_THRESHOLD                166
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                309
+
+#define MULMID_TOOM42_THRESHOLD             22
+
+#define MULMOD_BNM1_THRESHOLD                7
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define MUL_FFT_MODF_THRESHOLD             276  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    276, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     13, 8}, \
+    {      7, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     21, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     23, 8}, {     49, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     51,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47, 9}, {     99,10}, {     55,11}, {     31,10}, \
+    {     63, 8}, {    255,10}, {     71, 9}, {    143, 8}, \
+    {    287,10}, {     79, 9}, {    159, 8}, {    319,11}, \
+    {     47,10}, {     95, 9}, {    191,10}, {    103,12}, \
+    {     31,11}, {     63, 9}, {    255, 8}, {    511,10}, \
+    {    143, 8}, {    575,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    175, 9}, {    351, 8}, {    703,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415,10}, {    223, 9}, {    447, 8}, {    895,12}, \
+    {     63,10}, {    255, 9}, {    511, 8}, {   1023, 9}, \
+    {    543,11}, {    143,10}, {    287, 9}, {    575, 8}, \
+    {   1151,10}, {    319, 9}, {    639,11}, {    175,10}, \
+    {    351, 9}, {    703,12}, {     95,10}, {    383, 9}, \
+    {    767,11}, {    207, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 109
+#define MUL_FFT_THRESHOLD                 3200
+
+#define SQR_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    244, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {      8, 5}, {     17, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     47,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     79,10}, {     47, 9}, {     99,10}, \
+    {     55,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 8}, {    287, 7}, {    575, 9}, \
+    {    159, 8}, {    319,11}, {     47,10}, {     95, 9}, \
+    {    191, 8}, {    383,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287, 8}, \
+    {    575,11}, {     79,10}, {    159, 9}, {    319, 8}, \
+    {    639, 9}, {    351,10}, {    191, 9}, {    383,10}, \
+    {    207, 9}, {    415,10}, {    239,12}, {     63,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415, 9}, \
+    {    831,11}, {    223,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 97
+#define SQR_FFT_THRESHOLD                 2496
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  39
+#define MULLO_MUL_N_THRESHOLD             6253
+#define SQRLO_BASECASE_THRESHOLD             4
+#define SQRLO_DC_THRESHOLD                  56
+#define SQRLO_SQR_THRESHOLD               4940
+
+#define DC_DIV_QR_THRESHOLD                 41
+#define DC_DIVAPPR_Q_THRESHOLD             136
+#define DC_BDIV_QR_THRESHOLD                39
+#define DC_BDIV_Q_THRESHOLD                 89
+
+#define INV_MULMOD_BNM1_THRESHOLD           22
+#define INV_NEWTON_THRESHOLD               154
+#define INV_APPR_THRESHOLD                 141
+
+#define BINV_NEWTON_THRESHOLD              182
+#define REDC_1_TO_REDC_N_THRESHOLD          39
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD            1078
+#define MUPI_DIV_QR_THRESHOLD               75
+#define MU_BDIV_QR_THRESHOLD               872
+#define MU_BDIV_Q_THRESHOLD                942
+
+#define POWM_SEC_TABLE  1,19,117,539,1730
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               572
+#define SET_STR_PRECOMPUTE_THRESHOLD      1036
+
+#define FAC_DSC_THRESHOLD                  142
+#define FAC_ODD_THRESHOLD                   23
+
+#define MATRIX22_STRASSEN_THRESHOLD         11
+#define HGCD2_DIV1_METHOD                    1  /* 8.83% faster than 3 */
+#define HGCD_THRESHOLD                      80
+#define HGCD_APPR_THRESHOLD                 70
+#define HGCD_REDUCE_THRESHOLD             1962
+#define GCD_DC_THRESHOLD                   273
+#define GCDEXT_DC_THRESHOLD                198
+#define JACOBI_BASE_METHOD                   1  /* 7.49% faster than 4 */
diff --git a/third_party/gmp/mpn/arm64/cora72/gmp-mparam.h b/third_party/gmp/mpn/arm64/cora72/gmp-mparam.h
new file mode 100644
index 0000000..fc66fd3
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/cora72/gmp-mparam.h
@@ -0,0 +1,242 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file for a72.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1800 MHz Cortex-A72 */
+/* FFT tuning limit = 50,811,960 */
+/* Generated by tuneup.c, 2019-10-22, gcc 7.3 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            3
+#define MOD_1_1P_METHOD                      2  /* 12.09% faster than 1 */
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        26
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     15
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1  /* 13.42% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD            4
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           38
+
+#define DIV_1_VS_MUL_1_PERCENT             168
+
+#define MUL_TOOM22_THRESHOLD                 8
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD               153
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      57
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     108
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     104
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      56
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      82
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 16
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                206
+#define SQR_TOOM8_THRESHOLD                333
+
+#define MULMID_TOOM42_THRESHOLD             18
+
+#define MULMOD_BNM1_THRESHOLD                8
+#define SQRMOD_BNM1_THRESHOLD               10
+
+#define MUL_FFT_MODF_THRESHOLD             268  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    268, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {     15, 7}, {     13, 8}, {      7, 7}, {     16, 8}, \
+    {      9, 7}, {     19, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     15, 7}, {     31, 8}, \
+    {     19, 9}, {     11, 8}, {     27,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     51,11}, {     15,10}, {     31, 9}, {     71,10}, \
+    {     39, 9}, {     79, 8}, {    159, 7}, {    319, 9}, \
+    {     83,10}, {     47, 9}, {     95, 7}, {    383, 9}, \
+    {     99,10}, {     55,11}, {     31,10}, {     63, 8}, \
+    {    255, 7}, {    511, 9}, {    131,10}, {     71, 9}, \
+    {    143, 8}, {    287, 7}, {    575, 6}, {   1151,10}, \
+    {     79, 8}, {    319, 7}, {    639,10}, {     87, 8}, \
+    {    351,11}, {     47,10}, {     95, 8}, {    383, 7}, \
+    {    767,10}, {    103, 8}, {    415, 7}, {    831, 6}, \
+    {   1663, 9}, {    223, 8}, {    447,12}, {     31,11}, \
+    {     63, 9}, {    255, 8}, {    511, 7}, {   1023, 9}, \
+    {    287, 8}, {    575, 7}, {   1151, 6}, {   2303, 7}, \
+    {   1215,11}, {     79, 9}, {    319, 8}, {    639, 7}, \
+    {   1279, 9}, {    351, 8}, {    703, 7}, {   1407, 6}, \
+    {   2815, 9}, {    383, 8}, {    831, 7}, {   1663, 9}, \
+    {    447, 8}, {    895, 7}, {   1791, 6}, {   3583, 8}, \
+    {    959, 6}, {   3839, 5}, {   7679, 9}, {    511, 8}, \
+    {   1023, 7}, {   2175, 9}, {    575, 8}, {   1151, 7}, \
+    {   2303, 8}, {   1215,10}, {    351, 9}, {    703, 7}, \
+    {   3071, 8}, {   1663, 9}, {    895, 8}, {   1791, 7}, \
+    {   3583, 8}, {   1919, 6}, {   7679, 7}, {   3967, 9}, \
+    {   1023,10}, {    575, 9}, {   1151, 8}, {   2559,10}, \
+    {    703, 8}, {   2815, 9}, {   1471, 7}, {   5887,10}, \
+    {    767,11}, {    415, 9}, {   1791, 8}, {   3583,11}, \
+    {    479,10}, {    959, 8}, {   3967,11}, {    511, 9}, \
+    {   2175,10}, {   1151, 8}, {   4607, 9}, {   2815,10}, \
+    {   1471, 9}, {   2943,11}, {    767,10}, {   1535,11}, \
+    {    831,10}, {   1791,11}, {    959,10}, {   1919, 9}, \
+    {   3839, 8}, {   7679,10}, {   1983,12}, {    511,10}, \
+    {   2047,11}, {   1215,12}, {    639,11}, {   1407,10}, \
+    {   2815,11}, {   1471,12}, {    767,11}, {   1663,12}, \
+    {    895,11}, {   1791,12}, {    959,11}, {   1919,10}, \
+    {   3839,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1215,13}, {    639,12}, {   1279,13}, \
+    {    703,12}, {   1407,11}, {   2815,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1663,13}, {    895,12}, \
+    {   1791,11}, {   3583,13}, {    959,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1215,12}, {   2431,14}, {    639,13}, {   1407,12}, \
+    {   2815,13}, {   1471,12}, {   2943,14}, {    767,13}, \
+    {   1535,12}, {   3071,13}, {   1791,12}, {   3583,13}, \
+    {   1919,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,14}, {   1279,13}, {   2559,15}, {    767,14}, \
+    {   1791,13}, {   3839,15}, {   1023,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,15}, {   1535,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 218
+#define MUL_FFT_THRESHOLD                 2688
+
+#define SQR_FFT_MODF_THRESHOLD             236  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    236, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     15, 7}, {      8, 6}, {     17, 7}, {     13, 8}, \
+    {      7, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     47,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     79, 8}, {    159,10}, {     47, 9}, \
+    {     95, 8}, {    191, 7}, {    383,10}, {     55,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255, 7}, \
+    {    511,10}, {     71, 9}, {    143, 8}, {    287, 7}, \
+    {    575,10}, {     79, 8}, {    319, 7}, {    639,11}, \
+    {     47,10}, {     95, 8}, {    383, 7}, {    767, 8}, \
+    {    415,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    543, 9}, {    287, 8}, {    575, 7}, \
+    {   1151, 9}, {    319, 8}, {    639, 9}, {    351, 8}, \
+    {    703, 7}, {   1407, 6}, {   2815,10}, {    191, 9}, \
+    {    383, 8}, {    767, 9}, {    415, 8}, {    831, 7}, \
+    {   1663,10}, {    223, 9}, {    447, 8}, {    895, 7}, \
+    {   1791, 9}, {    479, 8}, {    959,12}, {     63,11}, \
+    {    127, 9}, {    543, 8}, {   1087,10}, {    287, 9}, \
+    {    575, 8}, {   1151,10}, {    319, 9}, {    639,10}, \
+    {    351, 9}, {    703, 8}, {   1407, 7}, {   2815, 8}, \
+    {   1471, 5}, {  11775, 9}, {    767, 8}, {   1535,10}, \
+    {    415, 9}, {    895, 8}, {   1919, 6}, {   7679, 7}, \
+    {   3967,11}, {    255,10}, {    543, 9}, {   1087, 8}, \
+    {   2175,10}, {    575, 9}, {   1151, 8}, {   2431,10}, \
+    {    639, 9}, {   1279,10}, {    703, 9}, {   1407, 8}, \
+    {   2943,11}, {    383,10}, {    767,11}, {    447,10}, \
+    {    895,11}, {    479,10}, {    959, 9}, {   1919, 8}, \
+    {   3839,10}, {   1023, 9}, {   2175,10}, {   1215, 9}, \
+    {   2431,11}, {    703, 9}, {   2815,10}, {   1471,11}, \
+    {    767,10}, {   1663,11}, {    895,10}, {   1791,11}, \
+    {    959, 9}, {   3839,12}, {    511,11}, {   1087,10}, \
+    {   2175,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1279,12}, {    703,11}, {   1471,12}, {    767,11}, \
+    {   1663,12}, {    895,11}, {   1919,10}, {   3839,13}, \
+    {    511,12}, {   1087,11}, {   2175,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,13}, {    767,12}, {   1535,13}, {    831,12}, \
+    {   1791,13}, {   1151,12}, {   2303,13}, {   1215,14}, \
+    {    639,12}, {   2559,13}, {   1407,14}, {    767,12}, \
+    {   3071,14}, {    895,13}, {   1919,12}, {   3839,14}, \
+    {   1023,13}, {   2175,14}, {   1151,12}, {   4607,14}, \
+    {   1279,13}, {   2559,14}, {   1407,13}, {   2943,15}, \
+    {    767,14}, {   1663,13}, {   3583,14}, {   1919,15}, \
+    {   1023,14}, {   2047,13}, {   4095,14}, {   2943,15}, \
+    {   1535,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 203
+#define SQR_FFT_THRESHOLD                 2176
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD             5240
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                  45
+#define SQRLO_SQR_THRESHOLD               4265
+
+#define DC_DIV_QR_THRESHOLD                 38
+#define DC_DIVAPPR_Q_THRESHOLD             108
+#define DC_BDIV_QR_THRESHOLD                36
+#define DC_BDIV_Q_THRESHOLD                 71
+
+#define INV_MULMOD_BNM1_THRESHOLD           14
+#define INV_NEWTON_THRESHOLD               132
+#define INV_APPR_THRESHOLD                 124
+
+#define BINV_NEWTON_THRESHOLD              199
+#define REDC_1_TO_REDC_N_THRESHOLD          34
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               61
+#define MU_BDIV_QR_THRESHOLD               734
+#define MU_BDIV_Q_THRESHOLD                942
+
+#define POWM_SEC_TABLE  6,30,110,579,1730
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               458
+#define SET_STR_PRECOMPUTE_THRESHOLD       875
+
+#define FAC_DSC_THRESHOLD                  153
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD2_DIV1_METHOD                    1  /* 8.41% faster than 3 */
+#define HGCD_THRESHOLD                      81
+#define HGCD_APPR_THRESHOLD                 80
+#define HGCD_REDUCE_THRESHOLD             1494
+#define GCD_DC_THRESHOLD                   268
+#define GCDEXT_DC_THRESHOLD                189
+#define JACOBI_BASE_METHOD                   1  /* 10.80% faster than 4 */
+
+/* Tuneup completed successfully, took 96906 seconds */
diff --git a/third_party/gmp/mpn/arm64/cora73/gmp-mparam.h b/third_party/gmp/mpn/arm64/cora73/gmp-mparam.h
new file mode 100644
index 0000000..7fc7f4e
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/cora73/gmp-mparam.h
@@ -0,0 +1,225 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file for a73.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1800 MHz Cortex-A73 */
+/* FFT tuning limit = 48,820,337 */
+/* Generated by tuneup.c, 2019-10-22, gcc 7.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            3
+#define MOD_1_1P_METHOD                      1  /* 2.28% faster than 2 */
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        44
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     16
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1  /* 35.13% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD            5
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           33
+
+#define DIV_1_VS_MUL_1_PERCENT             168
+
+#define MUL_TOOM22_THRESHOLD                10
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD                89
+#define MUL_TOOM6H_THRESHOLD               141
+#define MUL_TOOM8H_THRESHOLD               199
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      61
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      66
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      58
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                 62
+#define SQR_TOOM4_THRESHOLD                166
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                309
+
+#define MULMID_TOOM42_THRESHOLD             22
+
+#define MULMOD_BNM1_THRESHOLD                8
+#define SQRMOD_BNM1_THRESHOLD               11
+
+#define MUL_FFT_MODF_THRESHOLD             276  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    276, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     15, 7}, {      8, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     13, 8}, {      7, 7}, {     17, 8}, \
+    {      9, 7}, {     19, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     27,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     23, 8}, {     47, 9}, \
+    {     27,10}, {     15, 9}, {     43,10}, {     23, 9}, \
+    {     51,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     83,10}, {     47, 9}, {     99,10}, \
+    {     55,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    131,10}, {     71, 9}, {    143, 8}, \
+    {    287,10}, {     79, 9}, {    159, 8}, {    319,11}, \
+    {     47, 9}, {    191, 8}, {    383, 7}, {    767, 8}, \
+    {    415,12}, {     31,11}, {     63, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287, 8}, {    575,11}, \
+    {     79,10}, {    159, 9}, {    319,10}, {    175, 9}, \
+    {    351, 8}, {    703,11}, {     95,10}, {    191, 9}, \
+    {    383, 8}, {    767,10}, {    207, 9}, {    415,10}, \
+    {    223, 9}, {    447,12}, {     63,10}, {    255, 9}, \
+    {    511, 8}, {   1023, 9}, {    543,11}, {    143, 9}, \
+    {    575,10}, {    319, 9}, {    639,10}, {    351, 9}, \
+    {    703,12}, {     95,11}, {    191,10}, {    383,11}, \
+    {    207,10}, {    415,11}, {    223,10}, {    447, 9}, \
+    {    895,13}, {     63,11}, {    255,10}, {    511,11}, \
+    {    287,10}, {    575,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703, 9}, {   1407,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,10}, {    895,11}, {    479,10}, \
+    {    959,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    575,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    575,13}, {    319,12}, {    703,13}, {    383,12}, \
+    {    831,13}, {    447,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1023,13}, {    575,12}, {   1151,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    831,12}, \
+    {   1663,13}, {    959,15}, {    255,14}, {    511,13}, \
+    {   1151,14}, {    639,13}, {   1407,14}, {    767,13}, \
+    {   1663,14}, {    895,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2047,14}, {   1151,13}, {   2431,14}, \
+    {   1407,15}, {    767,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2431,15}, {   1279,14}, {   2815,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 185
+#define MUL_FFT_THRESHOLD                 3200
+
+#define SQR_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    244, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     17, 8}, \
+    {      9, 7}, {     20, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     25,10}, {      7, 9}, {     15, 8}, {     31, 9}, \
+    {     19, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     63,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     63, 8}, {    255,10}, {     71, 9}, \
+    {    143, 8}, {    287,10}, {     79, 9}, {    159, 8}, \
+    {    319,11}, {     47,10}, {     95, 9}, {    191, 8}, \
+    {    383,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    287, 8}, {    575,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95, 9}, {    383, 8}, {    767,10}, \
+    {    207, 9}, {    415,10}, {    223, 8}, {    895,10}, \
+    {    239,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,11}, {    223,10}, {    479,11}, {    255,10}, \
+    {    511,11}, {    287,10}, {    575,12}, {    159,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    767,12}, \
+    {    223,11}, {    447,10}, {    895,11}, {    479,13}, \
+    {    127,12}, {    255,11}, {    511,12}, {    287,10}, \
+    {   1151,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,11}, {    959,12}, {    511,11}, {   1023,12}, \
+    {    575,11}, {   1151,13}, {    319,12}, {    639,11}, \
+    {   1279,13}, {    383,12}, {    831,13}, {    447,12}, \
+    {    895,14}, {    255,13}, {    511,12}, {   1023,13}, \
+    {    703,14}, {    383,13}, {    831,12}, {   1663,13}, \
+    {    895,15}, {    255,14}, {    511,13}, {   1151,14}, \
+    {    639,13}, {   1407,14}, {    767,13}, {   1535,14}, \
+    {    895,15}, {    511,14}, {   1151,13}, {   2431,14}, \
+    {   1407,15}, {    767,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2431,15}, {   1279,14}, {   2815,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 165
+#define SQR_FFT_THRESHOLD                 2496
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  39
+#define MULLO_MUL_N_THRESHOLD             6253
+#define SQRLO_BASECASE_THRESHOLD             4
+#define SQRLO_DC_THRESHOLD                  56
+#define SQRLO_SQR_THRESHOLD               4940
+
+#define DC_DIV_QR_THRESHOLD                 36
+#define DC_DIVAPPR_Q_THRESHOLD             136
+#define DC_BDIV_QR_THRESHOLD                35
+#define DC_BDIV_Q_THRESHOLD                 88
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               149
+#define INV_APPR_THRESHOLD                 139
+
+#define BINV_NEWTON_THRESHOLD              166
+#define REDC_1_TO_REDC_N_THRESHOLD          38
+
+#define MU_DIV_QR_THRESHOLD               1120
+#define MU_DIVAPPR_Q_THRESHOLD            1078
+#define MUPI_DIV_QR_THRESHOLD               68
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD                942
+
+#define POWM_SEC_TABLE  4,22,102,473,1730
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        22
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1042
+
+#define FAC_DSC_THRESHOLD                  140
+#define FAC_ODD_THRESHOLD                   23
+
+#define MATRIX22_STRASSEN_THRESHOLD         11
+#define HGCD2_DIV1_METHOD                    1  /* 7.84% faster than 3 */
+#define HGCD_THRESHOLD                      80
+#define HGCD_APPR_THRESHOLD                 80
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   273
+#define GCDEXT_DC_THRESHOLD                201
+#define JACOBI_BASE_METHOD                   1  /* 1.03% faster than 4 */
+
+/* Tuneup completed successfully, took 64972 seconds */
diff --git a/third_party/gmp/mpn/arm64/gcd_11.asm b/third_party/gmp/mpn/arm64/gcd_11.asm
new file mode 100644
index 0000000..d8cc3e2
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/gcd_11.asm
@@ -0,0 +1,70 @@
+dnl  ARM v8a mpn_gcd_11.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+changecom(blah)
+
+C	     cycles/bit (approx)
+C Cortex-A35	 ?
+C Cortex-A53	 ?
+C Cortex-A55	 ?
+C Cortex-A57	 ?
+C Cortex-A72	 ?
+C Cortex-A73	 ?
+C Cortex-A75	 ?
+C Cortex-A76	 ?
+C Cortex-A77	 ?
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+define(`u0',    `x0')
+define(`v0',    `x1')
+
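+C  Editor's note (a gloss on the loop below, not from the GMP manual): this
+C  is the right-shift binary GCD on single limbs.  Each pass replaces (u,v)
+C  by (min(u,v), |u-v| with its trailing zeros stripped); rbit+clz is how
+C  AArch64 counts trailing zeros.  The usual precondition is that both
+C  inputs are odd, an invariant the shift of the difference preserves.
+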
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	subs	x3, u0, v0		C			0
+	b.eq	L(end)			C
+
+	ALIGN(16)
+L(top):	rbit	x12, x3			C			1,5
+	clz	x12, x12		C			2
+	csneg	x3, x3, x3, cs		C v = abs(u-v), even	1
+	csel	u0, v0, u0, cs		C u = min(u,v)		1
+	lsr	v0, x3, x12		C			3
+	subs	x3, u0, v0		C			4
+	b.ne	L(top)			C
+
+L(end):	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm64/gcd_22.asm b/third_party/gmp/mpn/arm64/gcd_22.asm
new file mode 100644
index 0000000..5367fea
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/gcd_22.asm
@@ -0,0 +1,112 @@
+dnl  ARM v8a mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+changecom(blah)
+
+C	     cycles/bit (approx)
+C Cortex-A35	 ?
+C Cortex-A53	 7.26
+C Cortex-A55	 ?
+C Cortex-A57	 ?
+C Cortex-A72	 5.72
+C Cortex-A73	 6.43
+C Cortex-A75	 ?
+C Cortex-A76	 ?
+C Cortex-A77	 ?
+
+
+define(`u1',    `x0')
+define(`u0',    `x1')
+define(`v1',    `x2')
+define(`v0',    `x3')
+
+define(`t0',    `x5')
+define(`t1',    `x6')
+define(`cnt',   `x7')
+define(`tnc',   `x8')
+
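+C  Editor's note (informal, from reading the code): this is the same
+C  right-shift binary GCD as gcd_11, but on 128-bit values {u1,u0} and
+C  {v1,v0}.  Once the high limbs of both operands have become zero, the
+C  code drops into a single-limb loop (L(top1)) identical to gcd_11.
+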
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+
+	ALIGN(16)
+L(top):	subs	t0, u0, v0		C 0 6
+	cbz	t0, L(lowz)
+	sbcs	t1, u1, v1		C 1 7
+
+	rbit	cnt, t0			C 1
+
+	cneg	t0, t0, cc		C 2
+	cinv	t1, t1, cc		C 2 u = |u - v|
+L(bck):	csel	v0, v0, u0, cs		C 2
+	csel	v1, v1, u1, cs		C 2 v = min(u,v)
+
+	clz	cnt, cnt		C 2
+	sub	tnc, xzr, cnt		C 3
+
+	lsr	u0, t0, cnt		C 3
+	lsl	x14, t1, tnc		C 4
+	lsr	u1, t1, cnt		C 3
+	orr	u0, u0, x14		C 5
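+C  (Editor's note: the lsr/lsl/orr trio above is a double-limb right shift
+C  of {t1,t0} by cnt; lsl by tnc = -cnt works because AArch64 register
+C  shift amounts are taken mod 64.)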
+
+	orr	x11, u1, v1
+	cbnz	x11, L(top)
+
+
+	subs	x4, u0, v0		C			0
+	b.eq	L(end1)			C
+
+	ALIGN(16)
+L(top1):rbit	x12, x4			C			1,5
+	clz	x12, x12		C			2
+	csneg	x4, x4, x4, cs		C v = abs(u-v), even	1
+	csel	u0, v0, u0, cs		C u = min(u,v)		1
+	lsr	v0, x4, x12		C			3
+	subs	x4, u0, v0		C			4
+	b.ne	L(top1)			C
+L(end1):mov	x0, u0
+	mov	x1, #0
+	ret
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
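+	C (Editor's note: since the low limbs cancel, u - v is (u1-v1)*B,
+	C so one whole limb of trailing zeros can be dropped for free;
+	C hence t1 = 0 below before re-entering the main loop at L(bck).)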
+	subs	t0, u1, v1
+	b.eq	L(end)
+	mov	t1, #0
+	rbit	cnt, t0			C 1
+	cneg	t0, t0, cc		C 2
+	b	L(bck)			C FIXME: make conditional
+
+L(end):	mov	x0, v0
+	mov	x1, v1
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm64/gmp-mparam.h b/third_party/gmp/mpn/arm64/gmp-mparam.h
new file mode 100644
index 0000000..7c0c193
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/gmp-mparam.h
@@ -0,0 +1,192 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1536 MHz Cortex-A53 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-09-29, gcc 5.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_1P_METHOD                      2  /* 2.08% faster than 1 */
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        20
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     21
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1  /* 38.26% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             13
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           40
+
+#define DIV_1_VS_MUL_1_PERCENT             159
+
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                49
+#define MUL_TOOM44_THRESHOLD                82
+#define MUL_TOOM6H_THRESHOLD               173
+#define MUL_TOOM8H_THRESHOLD               236
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      76
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      74
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                 67
+#define SQR_TOOM4_THRESHOLD                166
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                333
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD               10
+#define SQRMOD_BNM1_THRESHOLD               11
+
+#define MUL_FFT_MODF_THRESHOLD             316  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    316, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     17, 8}, \
+    {      9, 7}, {     20, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     83,10}, {     47, 9}, {     99,10}, {     55,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255, 9}, \
+    {    131,10}, {     71, 8}, {    287,10}, {     79, 9}, \
+    {    159, 8}, {    319,10}, {     87,11}, {     47,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    103, 9}, \
+    {    207, 8}, {    415,10}, {    111, 9}, {    223,12}, \
+    {     31,11}, {     63, 9}, {    255, 8}, {    511,10}, \
+    {    135, 9}, {    287, 8}, {    575,11}, {     79,10}, \
+    {    159, 9}, {    319, 8}, {    639,10}, {    175, 9}, \
+    {    351, 8}, {    703,11}, {     95,10}, {    191, 9}, \
+    {    383, 8}, {    767,10}, {    207, 9}, {    415,11}, \
+    {    111,10}, {    223, 9}, {    447,12}, {     63,10}, \
+    {    255, 9}, {    511, 8}, {   1023, 9}, {    543,10}, \
+    {    287, 9}, {    575, 8}, {   1151,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,10}, {    351, 9}, \
+    {    703, 8}, {   1407,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 118
+#define MUL_FFT_THRESHOLD                 3200
+
+#define SQR_FFT_MODF_THRESHOLD             272  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    272, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {      8, 5}, {     17, 6}, {     17, 7}, {     17, 8}, \
+    {      9, 7}, {     19, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     15, 7}, {     31, 8}, \
+    {     19, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     47,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95, 8}, {    191,10}, {     55,11}, {     31,10}, \
+    {     63, 8}, {    255,10}, {     71, 9}, {    143, 8}, \
+    {    287,10}, {     79, 9}, {    159,11}, {     47,10}, \
+    {     95, 9}, {    191, 8}, {    383, 7}, {    767,10}, \
+    {    103, 9}, {    207,12}, {     31,11}, {     63, 9}, \
+    {    255, 8}, {    511, 7}, {   1023, 9}, {    271,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831,10}, \
+    {    223,12}, {     63,10}, {    255, 9}, {    511, 8}, \
+    {   1023,10}, {    271,11}, {    143,10}, {    287, 9}, \
+    {    575, 8}, {   1151,11}, {    159,10}, {    319, 9}, \
+    {    639,11}, {    175,10}, {    351, 9}, {    703,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 112
+#define SQR_FFT_THRESHOLD                 2688
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD             6253
+#define SQRLO_BASECASE_THRESHOLD             4
+#define SQRLO_DC_THRESHOLD                  67
+#define SQRLO_SQR_THRESHOLD               5240
+
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             152
+#define DC_BDIV_QR_THRESHOLD                39
+#define DC_BDIV_Q_THRESHOLD                 93
+
+#define INV_MULMOD_BNM1_THRESHOLD           37
+#define INV_NEWTON_THRESHOLD               163
+#define INV_APPR_THRESHOLD                 162
+
+#define BINV_NEWTON_THRESHOLD              194
+#define REDC_1_TO_REDC_N_THRESHOLD          43
+
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD             998
+#define MUPI_DIV_QR_THRESHOLD               98
+#define MU_BDIV_QR_THRESHOLD               807
+#define MU_BDIV_Q_THRESHOLD                924
+
+#define POWM_SEC_TABLE  6,30,194,579,1730
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        29
+#define SET_STR_DC_THRESHOLD               788
+#define SET_STR_PRECOMPUTE_THRESHOLD      1816
+
+#define FAC_DSC_THRESHOLD                  236
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         10
+#define HGCD2_DIV1_METHOD                    1  /* 7.05% faster than 3 */
+#define HGCD_THRESHOLD                     101
+#define HGCD_APPR_THRESHOLD                104
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   330
+#define GCDEXT_DC_THRESHOLD                242
+#define JACOBI_BASE_METHOD                   4  /* 20.00% faster than 1 */
diff --git a/third_party/gmp/mpn/arm64/hamdist.asm b/third_party/gmp/mpn/arm64/hamdist.asm
new file mode 100644
index 0000000..c72ca55
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/hamdist.asm
@@ -0,0 +1,181 @@
+dnl  ARM64 Neon mpn_hamdist -- mpn bit Hamming distance.
+
+dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 4.5
+C Cortex-A57	 1.9
+C X-Gene	 4.36
+
+C TODO
+C  * Consider greater unrolling.
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+changecom(blah)
+
+C INPUT PARAMETERS
+define(`ap', x0)
+define(`bp', x1)
+define(`n',  x2)
+
+C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8*(2^16-1) bits, or
+C 8*(2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but one
+C which allows the huge count code to jump deep into the code (at L(chu)).
+
+define(`maxsize',  0x1fff)
+define(`chunksize',0x1ff0)
+
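+C  Editor's check of the bound (plain arithmetic, not from the source): the
+C  8 final 16-bit counters hold at most 8*(2^16-1) = 524280 set bits, while
+C  maxsize limbs contain 0x1fff*64 = 524224 bits, so no counter can wrap.
+C  chunksize is 0x1ff0 = 8176 limbs, a multiple of 8 as the L(chu) entry
+C  point requires.
+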
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+	mov	x11, #maxsize
+	cmp	n, x11
+	b.hi	L(gt8k)
+
+L(lt8k):
+	movi	v4.16b, #0			C clear summation register
+	movi	v5.16b, #0			C clear summation register
+
+	tbz	n, #0, L(xx0)
+	sub	n, n, #1
+	ld1	{v0.1d}, [ap], #8		C load 1 limb
+	ld1	{v16.1d}, [bp], #8		C load 1 limb
+	eor	v0.16b, v0.16b, v16.16b
+	cnt	v6.16b, v0.16b
+	uadalp	v4.8h,  v6.16b			C could also splat
+
+L(xx0):	tbz	n, #1, L(x00)
+	sub	n, n, #2
+	ld1	{v0.2d}, [ap], #16		C load 2 limbs
+	ld1	{v16.2d}, [bp], #16		C load 2 limbs
+	eor	v0.16b, v0.16b, v16.16b
+	cnt	v6.16b, v0.16b
+	uadalp	v4.8h,  v6.16b
+
+L(x00):	tbz	n, #2, L(000)
+	subs	n, n, #4
+	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
+	b.ls	L(sum)
+
+L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v1.16b, v1.16b, v17.16b
+	sub	n, n, #4
+	cnt	v6.16b, v0.16b
+	cnt	v7.16b, v1.16b
+	b	L(mid)
+
+L(000):	subs	n, n, #8
+	b.lo	L(e0)
+
+L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
+	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
+	eor	v2.16b, v2.16b, v18.16b
+	eor	v3.16b, v3.16b, v19.16b
+	cnt	v6.16b, v2.16b
+	cnt	v7.16b, v3.16b
+	subs	n, n, #8
+	b.lo	L(end)
+
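+C  (Editor's note: L(top)/L(mid) form a two-stage software pipeline; the
+C  population counts of one 4-limb group are accumulated with uadalp while
+C  the next group is being loaded and xored.)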
+L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v1.16b, v1.16b, v17.16b
+	uadalp	v4.8h,  v6.16b
+	cnt	v6.16b, v0.16b
+	uadalp	v5.8h,  v7.16b
+	cnt	v7.16b, v1.16b
+L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
+	eor	v2.16b, v2.16b, v18.16b
+	eor	v3.16b, v3.16b, v19.16b
+	subs	n, n, #8
+	uadalp	v4.8h,  v6.16b
+	cnt	v6.16b, v2.16b
+	uadalp	v5.8h,  v7.16b
+	cnt	v7.16b, v3.16b
+	b.hs	L(top)
+
+L(end):	uadalp	v4.8h,  v6.16b
+	uadalp	v5.8h,  v7.16b
+L(sum):	eor	v0.16b, v0.16b, v16.16b
+	eor	v1.16b, v1.16b, v17.16b
+	cnt	v6.16b, v0.16b
+	cnt	v7.16b, v1.16b
+	uadalp	v4.8h,  v6.16b
+	uadalp	v5.8h,  v7.16b
+	add	v4.8h, v4.8h, v5.8h
+					C we have 8 16-bit counts
+L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
+	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
+	mov	x0, v4.d[0]
+	mov	x1, v4.d[1]
+	add	x0, x0, x1
+	ret
+
+C Code for count > maxsize.  Splits operands and calls the code above.
+define(`ap2', x5)			C caller-saves reg not used above
+define(`bp2', x6)			C caller-saves reg not used above
+L(gt8k):
+	mov	x8, x30
+	mov	x7, n			C full count (caller-saves reg not used above)
+	mov	x4, #0			C total sum  (caller-saves reg not used above)
+	mov	x9, #chunksize*8	C caller-saves reg not used above
+	mov	x10, #chunksize		C caller-saves reg not used above
+
+1:	add	ap2, ap, x9		C point at subsequent block
+	add	bp2, bp, x9		C point at subsequent block
+	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
+	movi	v4.16b, #0		C clear chunk summation register
+	movi	v5.16b, #0		C clear chunk summation register
+	bl	L(chu)			C jump deep inside code
+	add	x4, x4, x0
+	mov	ap, ap2			C put chunk pointer in place for calls
+	mov	bp, bp2			C put chunk pointer in place for calls
+	sub	x7, x7, x10
+	cmp	x7, x11
+	b.hi	1b
+
+	mov	n, x7			C count for final invocation
+	bl	L(lt8k)
+	add	x0, x4, x0
+	mov	x30, x8
+	ret
+EPILOGUE()
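
For readers following the Neon code above: a minimal C reference model of the contract this file is assumed to implement, namely the number of differing bit positions between two n-limb operands. `ref_hamdist` and `popcount64` are illustrative names, not GMP API; the assembly computes the same sum with vector cnt/uadalp accumulation.

```c
#include <stdint.h>
#include <stddef.h>

/* Bit count of one 64-bit limb; clears the lowest set bit per iteration. */
static unsigned popcount64(uint64_t x)
{
    unsigned c = 0;
    while (x) { x &= x - 1; c++; }
    return c;
}

/* Reference model: count of bit positions where {ap,n} and {bp,n} differ. */
uint64_t ref_hamdist(const uint64_t *ap, const uint64_t *bp, size_t n)
{
    uint64_t sum = 0;
    for (size_t i = 0; i < n; i++)
        sum += popcount64(ap[i] ^ bp[i]);
    return sum;
}
```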
diff --git a/third_party/gmp/mpn/arm64/invert_limb.asm b/third_party/gmp/mpn/arm64/invert_limb.asm
new file mode 100644
index 0000000..a94b0e9
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/invert_limb.asm
@@ -0,0 +1,83 @@
+dnl  ARM64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C Cortex-A53     ?
+C Cortex-A57     ?
+
+C Compiler generated, mildly edited.  Could surely be further optimised.
+
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+	lsr	x2, x0, #54
+	adrp	x1, approx_tab
+	and	x2, x2, #0x1fe
+	add	x1, x1, :lo12:approx_tab
+	ldrh	w3, [x1,x2]
+	lsr	x4, x0, #24
+	add	x4, x4, #1
+	ubfiz	x2, x3, #11, #16
+	umull	x3, w3, w3
+	mul	x3, x3, x4
+	sub	x2, x2, #1
+	sub	x2, x2, x3, lsr #40
+	lsl	x3, x2, #60
+	mul	x1, x2, x2
+	msub	x1, x1, x4, x3
+	lsl	x2, x2, #13
+	add	x1, x2, x1, lsr #47
+	and	x2, x0, #1
+	neg	x3, x2
+	and	x3, x3, x1, lsr #1
+	add	x2, x2, x0, lsr #1
+	msub	x2, x1, x2, x3
+	umulh	x2, x2, x1
+	lsl	x1, x1, #31
+	add	x1, x1, x2, lsr #1
+	mul	x3, x1, x0
+	umulh	x2, x1, x0
+	adds	x4, x3, x0
+	adc	x0, x2, x0
+	sub	x0, x1, x0
+	ret
+EPILOGUE()
+
+	RODATA
+	ALIGN(2)
+	TYPE(   approx_tab, object)
+	SIZE(   approx_tab, 512)
+approx_tab:
+forloop(i,256,512-1,dnl
+`	.hword	eval(0x7fd00/i)
+')dnl
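
A hedged C model of what mpn_invert_limb is conventionally specified to return for a normalized divisor d (top bit set): floor((B^2-1)/d) - B with B = 2^64. The approx_tab entries eval(0x7fd00/i) seed the rough reciprocal that the multiply/shift steps above refine. `ref_invert_limb` is an illustrative name; GCC/Clang `__int128` support is assumed.

```c
#include <stdint.h>

typedef unsigned __int128 u128;    /* assumes GCC/Clang __int128 */

/* For d >= 2^63 the quotient floor((2^128-1)/d) lies in [2^64, 2^65),
   so its low 64 bits are exactly floor((2^128-1)/d) - 2^64. */
uint64_t ref_invert_limb(uint64_t d)
{
    u128 b2m1 = ~(u128)0;          /* 2^128 - 1 */
    return (uint64_t)(b2m1 / d);
}
```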
diff --git a/third_party/gmp/mpn/arm64/logops_n.asm b/third_party/gmp/mpn/arm64/logops_n.asm
new file mode 100644
index 0000000..ccaec9c
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/logops_n.asm
@@ -0,0 +1,139 @@
+dnl  ARM64 mpn_and_n, mpn_andn_n, mpn_nand_n, etc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb
+C	      nand,nior	      all other
+C Cortex-A53	3.25-3.5	2.75-3
+C Cortex-A57	 2.0		 1.5
+C X-Gene	 2.14		 2.0
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`vp', `x2')
+define(`n',  `x3')
+
+define(`POSTOP', `dnl')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',    `mpn_and_n')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',    `mpn_andn_n')
+  define(`LOGOP',   `bic	$1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',    `mpn_nand_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',    `mpn_ior_n')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',    `mpn_iorn_n')
+  define(`LOGOP',   `orn	$1, $2, $3')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',    `mpn_nior_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',    `mpn_xor_n')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',    `mpn_xnor_n')
+  define(`LOGOP',   `eon	$1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x7, [up]
+	ldr	x11, [vp]
+	LOGOP(	x15, x7, x11)
+	POSTOP(	x15)
+	str	x15, [rp],#8
+	tbnz	n, #1, L(b11)
+
+L(b01):	cbz	x18, L(ret)
+	ldp	x4, x5, [up,#8]
+	ldp	x8, x9, [vp,#8]
+	sub	up, up, #8
+	sub	vp, vp, #8
+	b	L(mid)
+
+L(b11):	ldp	x6, x7, [up,#8]
+	ldp	x10, x11, [vp,#8]
+	add	up, up, #8
+	add	vp, vp, #8
+	cbz	x18, L(end)
+	b	L(top)
+
+L(bx0):	tbnz	n, #1, L(b10)
+
+L(b00):	ldp	x4, x5, [up],#-16
+	ldp	x8, x9, [vp],#-16
+	b	L(mid)
+
+L(b10):	ldp	x6, x7, [up]
+	ldp	x10, x11, [vp]
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#16]
+	ldp	x8, x9, [vp,#16]
+	LOGOP(	x12, x6, x10)
+	LOGOP(	x13, x7, x11)
+	POSTOP(	x12)
+	POSTOP(	x13)
+	stp	x12, x13, [rp],#16
+L(mid):	ldp	x6, x7, [up,#32]!
+	ldp	x10, x11, [vp,#32]!
+	LOGOP(	x12, x4, x8)
+	LOGOP(	x13, x5, x9)
+	POSTOP(	x12)
+	POSTOP(	x13)
+	stp	x12, x13, [rp],#16
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	LOGOP(	x12, x6, x10)
+	LOGOP(	x13, x7, x11)
+	POSTOP(	x12)
+	POSTOP(	x13)
+	stp	x12, x13, [rp]
+L(ret):	ret
+EPILOGUE()
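
The ifdef blocks above assemble each entry point from a per-limb LOGOP plus an optional complementing POSTOP (mvn). As one concrete instance, a C sketch of the mpn_nand_n composition (`ref_nand_n` is an illustrative name, not GMP API):

```c
#include <stdint.h>
#include <stddef.h>

/* nand = LOGOP `and' followed by POSTOP `mvn' on every limb. */
void ref_nand_n(uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = ~(up[i] & vp[i]);
}
```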
diff --git a/third_party/gmp/mpn/arm64/lshift.asm b/third_party/gmp/mpn/arm64/lshift.asm
new file mode 100644
index 0000000..472b7ec
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/lshift.asm
@@ -0,0 +1,127 @@
+dnl  ARM64 mpn_lshift.
+
+dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.5-4.0		 3.25
+C Cortex-A57	 2.0		 2.0
+C X-Gene	 2.67		 2.5
+
+C TODO
+C  * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes.  These
+C    numbers should be 1 and 0, respectively.  The str in wind-down should also
+C    go.
+C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
+C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
+
+changecom(blah)
+
+define(`rp_arg', `x0')
+define(`up',     `x1')
+define(`n',      `x2')
+define(`cnt',    `x3')
+
+define(`rp',     `x16')
+
+define(`tnc',`x8')
+
+define(`PSHIFT', lsl)
+define(`NSHIFT', lsr)
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	add	rp, rp_arg, n, lsl #3
+	add	up, up, n, lsl #3
+	sub	tnc, xzr, cnt
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x4, [up,#-8]
+	tbnz	n, #1, L(b11)
+
+L(b01):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt1)
+	str	x2, [rp,#-8]
+	ret
+L(gt1):	ldp	x4, x5, [up,#-24]
+	sub	up, up, #8
+	add	rp, rp, #16
+	b	L(lo2)
+
+L(b11):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-24]!
+	b	L(lo3)
+
+L(bx0):	ldp	x4, x5, [up,#-16]
+	tbz	n, #1, L(b00)
+
+L(b10):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt2)
+	orr	x10, x10, x13
+	stp	x2, x10, [rp,#-16]
+	ret
+L(gt2):	ldp	x4, x5, [up,#-32]
+	orr	x10, x10, x13
+	str	x10, [rp,#-8]
+	sub	up, up, #16
+	add	rp, rp, #8
+	b	L(lo2)
+
+L(b00):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-32]!
+	orr	x10, x10, x13
+	str	x10, [rp,#-8]!
+	b	L(lo0)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#-16]
+	orr	x10, x10, x13
+	orr	x11, x12, x2
+	stp	x10, x11, [rp,#-16]
+	PSHIFT	x2, x6, cnt
+L(lo2):	NSHIFT	x10, x4, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x12, x5, tnc
+	ldp	x6, x7, [up,#-32]!
+	orr	x10, x10, x13
+	orr	x11, x12, x2
+	stp	x10, x11, [rp,#-32]!
+	PSHIFT	x2, x4, cnt
+L(lo0):	sub	x18, x18, #1
+L(lo3):	NSHIFT	x10, x6, tnc
+	PSHIFT	x13, x7, cnt
+	NSHIFT	x12, x7, tnc
+	cbnz	x18, L(top)
+
+L(end):	orr	x10, x10, x13
+	orr	x11, x12, x2
+	PSHIFT	x2, x6, cnt
+	stp	x10, x11, [rp,#-16]
+	str	x2, [rp,#-24]
+	ret
+EPILOGUE()
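
For orientation, a C reference of the usual mpn_lshift contract the unrolled loop above implements: shift n limbs left by cnt (1 to 63) and return the bits shifted out at the top. `ref_lshift` is an illustrative name.

```c
#include <stdint.h>
#include <stddef.h>

/* Shift {up,n} left by cnt bits into {rp,n}; returns the cnt bits shifted
   out at the top.  Requires 1 <= cnt <= 63 and n >= 1.  Runs high to low,
   like the asm, so an overlapping rp >= up is safe. */
uint64_t ref_lshift(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
    uint64_t retval = up[n - 1] >> (64 - cnt);
    for (size_t i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
    rp[0] = up[0] << cnt;
    return retval;
}
```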
diff --git a/third_party/gmp/mpn/arm64/lshiftc.asm b/third_party/gmp/mpn/arm64/lshiftc.asm
new file mode 100644
index 0000000..dd4c4ce
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/lshiftc.asm
@@ -0,0 +1,130 @@
+dnl  ARM64 mpn_lshiftc.
+
+dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.5-4.0		 3.25
+C Cortex-A57	 2.0		 2.0
+C X-Gene	 2.67		 2.5
+
+C TODO
+C  * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes.  These
+C    numbers should be 1 and 0, respectively.  The str in wind-down should also
+C    go.
+C  * Using extr and with 63 separate loops we might reach 1.5 c/l on A57.
+C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
+
+changecom(blah)
+
+define(`rp_arg', `x0')
+define(`up',     `x1')
+define(`n',      `x2')
+define(`cnt',    `x3')
+
+define(`rp',     `x16')
+
+define(`tnc',`x8')
+
+define(`PSHIFT', lsl)
+define(`NSHIFT', lsr)
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	add	rp, rp_arg, n, lsl #3
+	add	up, up, n, lsl #3
+	sub	tnc, xzr, cnt
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x4, [up,#-8]
+	tbnz	n, #1, L(b11)
+
+L(b01):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt1)
+	mvn	x2, x2
+	str	x2, [rp,#-8]
+	ret
+L(gt1):	ldp	x4, x5, [up,#-24]
+	sub	up, up, #8
+	add	rp, rp, #16
+	b	L(lo2)
+
+L(b11):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-24]!
+	b	L(lo3)
+
+L(bx0):	ldp	x4, x5, [up,#-16]
+	tbz	n, #1, L(b00)
+
+L(b10):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt2)
+	eon	x10, x10, x13
+	mvn	x2, x2
+	stp	x2, x10, [rp,#-16]
+	ret
+L(gt2):	ldp	x4, x5, [up,#-32]
+	eon	x10, x10, x13
+	str	x10, [rp,#-8]
+	sub	up, up, #16
+	add	rp, rp, #8
+	b	L(lo2)
+
+L(b00):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-32]!
+	eon	x10, x10, x13
+	str	x10, [rp,#-8]!
+	b	L(lo0)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#-16]
+	eon	x10, x10, x13
+	eon	x11, x12, x2
+	stp	x10, x11, [rp,#-16]
+	PSHIFT	x2, x6, cnt
+L(lo2):	NSHIFT	x10, x4, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x12, x5, tnc
+	ldp	x6, x7, [up,#-32]!
+	eon	x10, x10, x13
+	eon	x11, x12, x2
+	stp	x10, x11, [rp,#-32]!
+	PSHIFT	x2, x4, cnt
+L(lo0):	sub	x18, x18, #1
+L(lo3):	NSHIFT	x10, x6, tnc
+	PSHIFT	x13, x7, cnt
+	NSHIFT	x12, x7, tnc
+	cbnz	x18, L(top)
+
+L(end):	eon	x10, x10, x13
+	eon	x11, x12, x2
+	PSHIFT	x2, x6, cnt
+	stp	x10, x11, [rp,#-16]
+	mvn	x2, x2
+	str	x2, [rp,#-24]
+	ret
+EPILOGUE()
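
mpn_lshiftc is assumed to differ from mpn_lshift only in storing the one's complement of each result limb (the eon/mvn instructions above); the returned out-shifted bits stay uncomplemented, matching the x0 handling. A sketch under that assumption:

```c
#include <stdint.h>
#include <stddef.h>

/* As ref_lshift, but every stored limb is complemented. */
uint64_t ref_lshiftc(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
    uint64_t retval = up[n - 1] >> (64 - cnt);
    for (size_t i = n - 1; i > 0; i--)
        rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
    rp[0] = ~(up[0] << cnt);
    return retval;
}
```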
diff --git a/third_party/gmp/mpn/arm64/mod_34lsub1.asm b/third_party/gmp/mpn/arm64/mod_34lsub1.asm
new file mode 100644
index 0000000..7945fe7
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/mod_34lsub1.asm
@@ -0,0 +1,124 @@
+dnl  ARM64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl  Copyright 2012-2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 2
+C Cortex-A57	 1
+C X-Gene	 1.45
+
+define(`ap',	x0)
+define(`n',	x1)
+
+changecom(blah)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * An alternative inner loop which could run at 0.722 c/l on A57:
+C	adds	x8, x8, x2
+C	adcs	x9, x9, x3
+C	ldp	x2, x3, [ap, #-32]
+C	adcs	x10, x10, x4
+C	adc	x12, x12, xzr
+C	adds	x8, x8, x5
+C	ldp	x4, x5, [ap, #-16]
+C	sub	n, n, #6
+C	adcs	x9, x9, x6
+C	adcs	x10, x10, x7
+C	ldp	x6, x7, [ap], #48
+C	adc	x12, x12, xzr
+C	tbz	n, #63, L(top)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	subs	n, n, #3
+	mov	x8, #0
+	b.lt	L(le2)			C n <= 2
+
+	ldp	x2, x3, [ap, #0]
+	ldr	x4, [ap, #16]
+	add	ap, ap, #24
+	subs	n, n, #3
+	b.lt	L(sum)			C n <= 5
+	cmn	x0, #0			C clear carry
+
+L(top):	ldp	x5, x6, [ap, #0]
+	ldr	x7, [ap, #16]
+	add	ap, ap, #24
+	sub	n, n, #3
+	adcs	x2, x2, x5
+	adcs	x3, x3, x6
+	adcs	x4, x4, x7
+	tbz	n, #63, L(top)
+
+	adc	x8, xzr, xzr		C x8 <= 1
+
+L(sum):	cmn	n, #2
+	mov	x5, #0
+	b.lo	1f
+	ldr	x5, [ap], #8
+1:	mov	x6, #0
+	b.ls	1f
+	ldr	x6, [ap], #8
+1:	adds	x2, x2, x5
+	adcs	x3, x3, x6
+	adcs	x4, x4, xzr
+	adc	x8, x8, xzr		C x8 <= 2
+
+L(sum2):
+	and	x0, x2, #0xffffffffffff
+	add	x0, x0, x2, lsr #48
+	add	x0, x0, x8
+
+	lsl	x8, x3, #16
+	and	x1, x8, #0xffffffffffff
+	add	x0, x0, x1
+	add	x0, x0, x3, lsr #32
+
+	lsl	x8, x4, #32
+	and	x1, x8, #0xffffffffffff
+	add	x0, x0, x1
+	add	x0, x0, x4, lsr #16
+	ret
+
+L(le2):	cmn	n, #1
+	b.ne	L(1)
+	ldp	x2, x3, [ap]
+	mov	x4, #0
+	b	L(sum2)
+L(1):	ldr	x2, [ap]
+	and	x0, x2, #0xffffffffffff
+	add	x0, x0, x2, lsr #48
+	ret
+EPILOGUE()
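
A hedged C model of the assumed contract: return a value congruent to {ap,n} modulo 2^48-1, not necessarily fully reduced. It relies on 2^64 mod (2^48-1) = 2^16, which is exactly why L(sum2) above folds the three accumulators in with rotations of 0, 16 and 32 bits. Illustrative only; the 128-bit accumulator would overflow for absurdly large n.

```c
#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;    /* assumes GCC/Clang __int128 */

uint64_t ref_mod_34lsub1(const uint64_t *ap, size_t n)
{
    u128 acc = 0;
    /* Limb i has weight 2^(64*i), congruent to 2^(16*(i%3)) mod 2^48-1. */
    for (size_t i = 0; i < n; i++)
        acc += (u128)ap[i] << (16 * (i % 3));
    while (acc >> 48)                         /* fold 48 bits at a time */
        acc = (acc & (((u128)1 << 48) - 1)) + (acc >> 48);
    return (uint64_t)acc;
}
```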
diff --git a/third_party/gmp/mpn/arm64/mul_1.asm b/third_party/gmp/mpn/arm64/mul_1.asm
new file mode 100644
index 0000000..d1d3394
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/mul_1.asm
@@ -0,0 +1,127 @@
+dnl  ARM64 mpn_mul_1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013, 2015, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	7.5-8
+C Cortex-A57	 7
+C Cortex-A72
+C X-Gene	 4
+
+C TODO
+C  * Start first multiply earlier.
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`n',  `x2')
+define(`v0', `x3')
+
+
+PROLOGUE(mpn_mul_1c)
+	adds	xzr, xzr, xzr		C clear cy flag
+	b	L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1)
+	adds	x4, xzr, xzr		C clear register and cy flag
+L(com):	lsr	x18, n, #2
+	tbnz	n, #0, L(bx1)
+
+L(bx0):	mov	x11, x4
+	tbz	n, #1, L(b00)
+
+L(b10):	ldp	x4, x5, [up]
+	mul	x8, x4, v0
+	umulh	x10, x4, v0
+	cbz	x18, L(2)
+	ldp	x6, x7, [up,#16]!
+	mul	x9, x5, v0
+	b	L(mid)-8
+
+L(2):	mul	x9, x5, v0
+	b	L(2e)
+
+L(bx1):	ldr	x7, [up],#8
+	mul	x9, x7, v0
+	umulh	x11, x7, v0
+	adds	x9, x9, x4
+	str	x9, [rp],#8
+	tbnz	n, #1, L(b10)
+
+L(b01):	cbz	x18, L(1)
+
+L(b00):	ldp	x6, x7, [up]
+	mul	x8, x6, v0
+	umulh	x10, x6, v0
+	ldp	x4, x5, [up,#16]
+	mul	x9, x7, v0
+	adcs	x12, x8, x11
+	umulh	x11, x7, v0
+	add	rp, rp, #16
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	mul	x8, x4, v0
+	ldp	x6, x7, [up,#32]!
+	adcs	x13, x9, x10
+	umulh	x10, x4, v0
+	mul	x9, x5, v0
+	stp	x12, x13, [rp,#-16]
+	adcs	x12, x8, x11
+	umulh	x11, x5, v0
+L(mid):	mul	x8, x6, v0
+	ldp	x4, x5, [up,#16]
+	adcs	x13, x9, x10
+	umulh	x10, x6, v0
+	mul	x9, x7, v0
+	stp	x12, x13, [rp],#32
+	adcs	x12, x8, x11
+	umulh	x11, x7, v0
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	mul	x8, x4, v0
+	adcs	x13, x9, x10
+	umulh	x10, x4, v0
+	mul	x9, x5, v0
+	stp	x12, x13, [rp,#-16]
+L(2e):	adcs	x12, x8, x11
+	umulh	x11, x5, v0
+	adcs	x13, x9, x10
+	stp	x12, x13, [rp]
+L(1):	adc	x0, x11, xzr
+	ret
+EPILOGUE()
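
A C reference of the mpn_mul_1 contract the software-pipelined loop above implements: rp receives the n low product limbs of {up,n} times v0, and the top limb is returned; mpn_mul_1c is the same but starts from a caller-supplied carry. `ref_mul_1` is an illustrative name; `__int128` support is assumed.

```c
#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;    /* assumes GCC/Clang __int128 */

uint64_t ref_mul_1(uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
    uint64_t cy = 0;                      /* mpn_mul_1c would seed this */
    for (size_t i = 0; i < n; i++) {
        u128 p = (u128)up[i] * v0 + cy;   /* the mul/umulh pair plus carry */
        rp[i] = (uint64_t)p;
        cy = (uint64_t)(p >> 64);
    }
    return cy;
}
```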
diff --git a/third_party/gmp/mpn/arm64/popcount.asm b/third_party/gmp/mpn/arm64/popcount.asm
new file mode 100644
index 0000000..74de3fc
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/popcount.asm
@@ -0,0 +1,157 @@
+dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 2.5
+C Cortex-A57	 1.14
+C X-Gene	 3
+
+C TODO
+C  * Consider greater unrolling.
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore whether explicit align directives, e.g., "[ptr:128]", help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+changecom(blah)
+
+C INPUT PARAMETERS
+define(`ap', x0)
+define(`n',  x1)
+
+C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can absorb at most 8*(2^16-1)
+C bits, i.e., 8*(2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to
+C that, but one which allows the huge count code to jump deep into the code
+C (at L(chu)).
+
+define(`maxsize',  0x1fff)
+define(`chunksize',0x1ff0)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	mov	x11, #maxsize
+	cmp	n, x11
+	b.hi	L(gt8k)
+
+L(lt8k):
+	movi	v4.16b, #0			C clear summation register
+	movi	v5.16b, #0			C clear summation register
+
+	tbz	n, #0, L(xx0)
+	sub	n, n, #1
+	ld1	{v0.1d}, [ap], #8		C load 1 limb
+	cnt	v6.16b, v0.16b
+	uadalp	v4.8h,  v6.16b			C could also splat
+
+L(xx0):	tbz	n, #1, L(x00)
+	sub	n, n, #2
+	ld1	{v0.2d}, [ap], #16		C load 2 limbs
+	cnt	v6.16b, v0.16b
+	uadalp	v4.8h,  v6.16b
+
+L(x00):	tbz	n, #2, L(000)
+	subs	n, n, #4
+	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	b.ls	L(sum)
+
+L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	sub	n, n, #4
+	cnt	v6.16b, v0.16b
+	cnt	v7.16b, v1.16b
+	b	L(mid)
+
+L(000):	subs	n, n, #8
+	b.lo	L(e0)
+
+L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	cnt	v6.16b, v2.16b
+	cnt	v7.16b, v3.16b
+	subs	n, n, #8
+	b.lo	L(end)
+
+L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	uadalp	v4.8h,  v6.16b
+	cnt	v6.16b, v0.16b
+	uadalp	v5.8h,  v7.16b
+	cnt	v7.16b, v1.16b
+L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	subs	n, n, #8
+	uadalp	v4.8h,  v6.16b
+	cnt	v6.16b, v2.16b
+	uadalp	v5.8h,  v7.16b
+	cnt	v7.16b, v3.16b
+	b.hs	L(top)
+
+L(end):	uadalp	v4.8h,  v6.16b
+	uadalp	v5.8h,  v7.16b
+L(sum):	cnt	v6.16b, v0.16b
+	cnt	v7.16b, v1.16b
+	uadalp	v4.8h,  v6.16b
+	uadalp	v5.8h,  v7.16b
+	add	v4.8h, v4.8h, v5.8h
+					C we have 8 16-bit counts
+L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
+	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
+	mov	x0, v4.d[0]
+	mov	x1, v4.d[1]
+	add	x0, x0, x1
+	ret
+
+C Code for count > maxsize.  Splits operand and calls above code.
+define(`ap2', x5)			C caller-saves reg not used above
+L(gt8k):
+	mov	x8, x30
+	mov	x7, n			C full count (caller-saves reg not used above)
+	mov	x4, #0			C total sum  (caller-saves reg not used above)
+	mov	x9, #chunksize*8	C caller-saves reg not used above
+	mov	x10, #chunksize		C caller-saves reg not used above
+
+1:	add	ap2, ap, x9		C point at subsequent block
+	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
+	movi	v4.16b, #0		C clear chunk summation register
+	movi	v5.16b, #0		C clear chunk summation register
+	bl	L(chu)			C jump deep inside code
+	add	x4, x4, x0
+	mov	ap, ap2			C put chunk pointer in place for calls
+	sub	x7, x7, x10
+	cmp	x7, x11
+	b.hi	1b
+
+	mov	n, x7			C count for final invocation
+	bl	L(lt8k)
+	add	x0, x4, x0
+	mov	x30, x8
+	ret
+EPILOGUE()
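
Both this L(gt8k) path and the one in hamdist above follow the same shape: peel off chunksize-limb blocks so the 16-bit lanes can never saturate, accumulate each block's 64-bit result, then run once more on the tail. A hedged C sketch of that driver (all names illustrative; `count_chunk` stands in for the L(lt8k)/L(chu) body):

```c
#include <stdint.h>
#include <stddef.h>

#define MAXSIZE   0x1fff    /* most limbs the 16-bit lanes can absorb */
#define CHUNKSIZE 0x1ff0

uint64_t count_chunk(const uint64_t *ap, size_t n);  /* models L(lt8k) */

uint64_t count_big(const uint64_t *ap, size_t n)
{
    uint64_t total = 0;
    while (n > MAXSIZE) {               /* models the 1: ... b.hi 1b loop */
        total += count_chunk(ap, CHUNKSIZE);
        ap += CHUNKSIZE;
        n -= CHUNKSIZE;
    }
    return total + count_chunk(ap, n);  /* final invocation on the tail */
}
```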
diff --git a/third_party/gmp/mpn/arm64/rsh1aors_n.asm b/third_party/gmp/mpn/arm64/rsh1aors_n.asm
new file mode 100644
index 0000000..e0b760b
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/rsh1aors_n.asm
@@ -0,0 +1,168 @@
+dnl  ARM64 mpn_rsh1add_n and mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.25-3.75	 3.0 steady
+C Cortex-A57	 2.15		 1.75
+C X-Gene	 2.75		 2.5
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`up', `x1')
+define(`vp', `x2')
+define(`n',  `x3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`COND',	`cs')
+  define(`func_n',	mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`COND',	`cc')
+  define(`func_n',	mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func_n)
+	lsr	x18, n, #2
+
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x5, [up],#8
+	ldr	x9, [vp],#8
+	tbnz	n, #1, L(b11)
+
+L(b01):	ADDSUB	x13, x5, x9
+	and	x10, x13, #1
+	cbz	x18, L(1)
+	ldp	x4, x5, [up],#48
+	ldp	x8, x9, [vp],#48
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	ldp	x4, x5, [up,#-32]
+	ldp	x8, x9, [vp,#-32]
+	extr	x17, x14, x13, #1
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	str	x17, [rp], #24
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+	b	L(top)
+
+L(1):	cset	x14, COND
+	extr	x17, x14, x13, #1
+	str	x17, [rp]
+	mov	x0, x10
+	ret
+
+L(b11):	ADDSUB	x15, x5, x9
+	and	x10, x15, #1
+
+	ldp	x4, x5, [up],#32
+	ldp	x8, x9, [vp],#32
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	cbz	x18, L(3)
+	ldp	x4, x5, [up,#-16]
+	ldp	x8, x9, [vp,#-16]
+	extr	x17, x12, x15, #1
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	str	x17, [rp], #8
+	b	L(mid)
+
+L(3):	extr	x17, x12, x15, #1
+	str	x17, [rp], #8
+	b	L(2)
+
+L(bx0):	tbz	n, #1, L(b00)
+
+L(b10):	ldp	x4, x5, [up],#32
+	ldp	x8, x9, [vp],#32
+	ADDSUB	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	and	x10, x12, #1
+	cbz	x18, L(2)
+	ldp	x4, x5, [up,#-16]
+	ldp	x8, x9, [vp,#-16]
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	b	L(mid)
+
+L(b00):	ldp	x4, x5, [up],#48
+	ldp	x8, x9, [vp],#48
+	ADDSUB	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	and	x10, x14, #1
+	ldp	x4, x5, [up,#-32]
+	ldp	x8, x9, [vp,#-32]
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	add	rp, rp, #16
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#-16]
+	ldp	x8, x9, [vp,#-16]
+	extr	x16, x15, x14, #1
+	extr	x17, x12, x15, #1
+	ADDSUBC	x14, x4, x8
+	ADDSUBC	x15, x5, x9
+	stp	x16, x17, [rp,#-16]
+L(mid):	ldp	x4, x5, [up],#32
+	ldp	x8, x9, [vp],#32
+	extr	x16, x13, x12, #1
+	extr	x17, x14, x13, #1
+	ADDSUBC	x12, x4, x8
+	ADDSUBC	x13, x5, x9
+	stp	x16, x17, [rp],#32
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	extr	x16, x15, x14, #1
+	extr	x17, x12, x15, #1
+	stp	x16, x17, [rp,#-16]
+L(2):	cset	x14, COND
+	extr	x16, x13, x12, #1
+	extr	x17, x14, x13, #1
+	stp	x16, x17, [rp]
+
+L(ret):	mov	x0, x10
+	ret
+EPILOGUE()
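
A C model of the assumed rsh1add contract: rp = (up + vp) >> 1 over n limbs, with the carry out of the top word becoming the new most significant bit and the bit shifted out at the bottom returned (the asm keeps it in x10). rsh1sub is the same shape with subtraction and borrow. Names illustrative; `__int128` assumed.

```c
#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;    /* assumes GCC/Clang __int128 */

uint64_t ref_rsh1add_n(uint64_t *rp, const uint64_t *up,
                       const uint64_t *vp, size_t n)
{
    u128 s = (u128)up[0] + vp[0];
    uint64_t retval = (uint64_t)s & 1;     /* bit shifted out at the bottom */
    uint64_t prev = (uint64_t)s;
    uint64_t cy = (uint64_t)(s >> 64);

    for (size_t i = 1; i < n; i++) {
        s = (u128)up[i] + vp[i] + cy;
        uint64_t cur = (uint64_t)s;
        cy = (uint64_t)(s >> 64);
        rp[i - 1] = (prev >> 1) | (cur << 63);
        prev = cur;
    }
    rp[n - 1] = (prev >> 1) | (cy << 63);  /* carry becomes the top bit */
    return retval;
}
```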
diff --git a/third_party/gmp/mpn/arm64/rshift.asm b/third_party/gmp/mpn/arm64/rshift.asm
new file mode 100644
index 0000000..78ae960
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/rshift.asm
@@ -0,0 +1,125 @@
+dnl  ARM64 mpn_rshift.
+
+dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.5-4.0		 3.25
+C Cortex-A57	 2.0		 2.0
+C X-Gene	 2.67		 2.5
+
+C TODO
+C  * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes.  These
+C    numbers should be 1 and 0, respectively.  The str in wind-down should also
+C    go.
+C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
+C  * A53's speed depends on alignment, but not as simply as for lshift/lshiftc.
+
+changecom(blah)
+
+define(`rp_arg', `x0')
+define(`up',     `x1')
+define(`n',      `x2')
+define(`cnt',    `x3')
+
+define(`rp',     `x16')
+
+define(`tnc',`x8')
+
+define(`PSHIFT', lsr)
+define(`NSHIFT', lsl)
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	mov	rp, rp_arg
+	sub	tnc, xzr, cnt
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x5, [up]
+	tbnz	n, #1, L(b11)
+
+L(b01):	NSHIFT	x0, x5, tnc
+	PSHIFT	x2, x5, cnt
+	cbnz	x18, L(gt1)
+	str	x2, [rp]
+	ret
+L(gt1):	ldp	x4, x5, [up,#8]
+	sub	up, up, #8
+	sub	rp, rp, #32
+	b	L(lo2)
+
+L(b11):	NSHIFT	x0, x5, tnc
+	PSHIFT	x2, x5, cnt
+	ldp	x6, x7, [up,#8]!
+	sub	rp, rp, #16
+	b	L(lo3)
+
+L(bx0):	ldp	x4, x5, [up]
+	tbz	n, #1, L(b00)
+
+L(b10):	NSHIFT	x0, x4, tnc
+	PSHIFT	x13, x4, cnt
+	NSHIFT	x10, x5, tnc
+	PSHIFT	x2, x5, cnt
+	cbnz	x18, L(gt2)
+	orr	x10, x10, x13
+	stp	x10, x2, [rp]
+	ret
+L(gt2):	ldp	x4, x5, [up,#16]
+	orr	x10, x10, x13
+	str	x10, [rp],#-24
+	b	L(lo2)
+
+L(b00):	NSHIFT	x0, x4, tnc
+	PSHIFT	x13, x4, cnt
+	NSHIFT	x10, x5, tnc
+	PSHIFT	x2, x5, cnt
+	ldp	x6, x7, [up,#16]!
+	orr	x10, x10, x13
+	str	x10, [rp],#-8
+	b	L(lo0)
+
+	ALIGN(16)
+L(top):	ldp	x4, x5, [up,#16]
+	orr	x10, x10, x13
+	orr	x11, x12, x2
+	stp	x11, x10, [rp,#16]
+	PSHIFT	x2, x7, cnt
+L(lo2):	NSHIFT	x10, x5, tnc
+	NSHIFT	x12, x4, tnc
+	PSHIFT	x13, x4, cnt
+	ldp	x6, x7, [up,#32]!
+	orr	x10, x10, x13
+	orr	x11, x12, x2
+	stp	x11, x10, [rp,#32]!
+	PSHIFT	x2, x5, cnt
+L(lo0):	sub	x18, x18, #1
+L(lo3):	NSHIFT	x10, x7, tnc
+	NSHIFT	x12, x6, tnc
+	PSHIFT	x13, x6, cnt
+	cbnz	x18, L(top)
+
+L(end):	orr	x10, x10, x13
+	orr	x11, x12, x2
+	PSHIFT	x2, x7, cnt
+	stp	x11, x10, [rp,#16]
+	str	x2, [rp,#32]
+	ret
+EPILOGUE()
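
And the mirror-image reference for mpn_rshift: shift n limbs right by cnt (1 to 63), returning the discarded low bits left-aligned in a limb. `ref_rshift` is an illustrative name.

```c
#include <stdint.h>
#include <stddef.h>

/* Shift {up,n} right by cnt bits into {rp,n}; returns the cnt discarded
   bits, left-aligned.  Requires 1 <= cnt <= 63 and n >= 1.  Runs low to
   high, like the asm, so an overlapping rp <= up is safe. */
uint64_t ref_rshift(uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
    uint64_t retval = up[0] << (64 - cnt);
    for (size_t i = 0; i + 1 < n; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;
    return retval;
}
```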
diff --git a/third_party/gmp/mpn/arm64/sec_tabselect.asm b/third_party/gmp/mpn/arm64/sec_tabselect.asm
new file mode 100644
index 0000000..18a268a
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/sec_tabselect.asm
@@ -0,0 +1,122 @@
+dnl  ARM64 Neon mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C Cortex-A53	 2.25
+C Cortex-A57	 1.33
+C X-Gene	 2
+
+C void
+C mpn_sec_tabselect (mp_ptr rp, mp_srcptr *tab,
+C		     mp_size_t n, mp_size_t nents, mp_size_t which)
+
+changecom(blah)
+
+define(`rp',     `x0')
+define(`tp',     `x1')
+define(`n',      `x2')
+define(`nents',  `x3')
+define(`which',  `x4')
+
+define(`i',      `x5')
+define(`j',      `x6')
+
+define(`maskq',  `v4')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	dup	v7.2d, x4			C 2 `which' copies
+
+	mov	x10, #1
+	dup	v6.2d, x10			C 2 copies of 1
+
+	subs	j, n, #4
+	b.mi	L(outer_end)
+
+L(outer_top):
+	mov	i, nents
+	mov	x12, tp				C preserve tp
+	movi	v5.16b, #0			C zero 2 counter copies
+	movi	v2.16b, #0
+	movi	v3.16b, #0
+	ALIGN(16)
+L(tp4):	cmeq	maskq.2d, v5.2d, v7.2d		C compare idx copies to `which' copies
+	ld1	{v0.2d,v1.2d}, [tp]
+	add	v5.2d, v5.2d, v6.2d
+	bit	v2.16b, v0.16b, maskq.16b
+	bit	v3.16b, v1.16b, maskq.16b
+	add	tp, tp, n, lsl #3
+	sub	i, i, #1
+	cbnz	i, L(tp4)
+	st1	{v2.2d,v3.2d}, [rp], #32
+	add	tp, x12, #32			C restore tp, point to next slice
+	subs	j, j, #4
+	b.pl	L(outer_top)
+L(outer_end):
+
+	tbz	n, #1, L(b0x)
+	mov	i, nents
+	mov	x12, tp
+	movi	v5.16b, #0			C zero 2 counter copies
+	movi	v2.16b, #0
+	ALIGN(16)
+L(tp2):	cmeq	maskq.2d, v5.2d, v7.2d
+	ld1	{v0.2d}, [tp]
+	add	v5.2d, v5.2d, v6.2d
+	bit	v2.16b, v0.16b, maskq.16b
+	add	tp, tp, n, lsl #3
+	sub	i, i, #1
+	cbnz	i, L(tp2)
+	st1	{v2.2d}, [rp], #16
+	add	tp, x12, #16
+
+L(b0x):	tbz	n, #0, L(b00)
+	mov	i, nents
+	mov	x12, tp
+	movi	v5.16b, #0			C zero 2 counter copies
+	movi	v2.16b, #0
+	ALIGN(16)
+L(tp1):	cmeq	maskq.2d, v5.2d, v7.2d
+	ld1	{v0.1d}, [tp]
+	add	v5.2d, v5.2d, v6.2d		C FIXME size should be `1d'
+	bit	v2.8b, v0.8b, maskq.8b
+	add	tp, tp, n, lsl #3
+	sub	i, i, #1
+	cbnz	i, L(tp1)
+	st1	{v2.1d}, [rp], #8
+	add	tp, x12, #8
+
+L(b00):	ret
+EPILOGUE()
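
The cmeq/bit pairing above exists to make the selection data-independent: every table entry is read, and a full-width mask decides which one lands in rp. A C sketch of the same idea, assuming the flat entry layout that the `add tp, tp, n, lsl #3` stride implies (illustrative only; production constant-time code needs more care than this sketch takes, e.g. around the comparison):

```c
#include <stdint.h>
#include <stddef.h>

void ref_sec_tabselect(uint64_t *rp, const uint64_t *tab,
                       size_t n, size_t nents, size_t which)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = 0;
    for (size_t k = 0; k < nents; k++) {
        /* all-ones exactly when k == which, else all-zeros */
        uint64_t mask = (uint64_t)0 - (uint64_t)(k == which);
        for (size_t i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
    }
}
```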
diff --git a/third_party/gmp/mpn/arm64/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/arm64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..55b5ac7
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/sqr_diag_addlsh1.asm
@@ -0,0 +1,102 @@
+dnl  ARM64 mpn_sqr_diag_addlsh1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2016, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 5.65
+C Cortex-A57	 3.5
+C X-Gene	 3.38
+
+changecom(blah)
+
+define(`rp', `x0')
+define(`tp', `x1')
+define(`up', `x2')
+define(`n',  `x3')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	ldr	x15, [up],#8
+	lsr	x18, n, #1
+	tbz	n, #0, L(bx0)
+
+L(bx1):	adds	x7, xzr, xzr
+	mul	x12, x15, x15
+	ldr	x16, [up],#8
+	ldp	x4, x5, [tp],#16
+	umulh	x11, x15, x15
+	b	L(mid)
+
+L(bx0):	adds	x5, xzr, xzr
+	mul	x12, x15, x15
+	ldr	x17, [up],#16
+	ldp	x6, x7, [tp],#32
+	umulh	x11, x15, x15
+	sub	x18, x18, #1
+	cbz	x18, L(end)
+
+	ALIGN(16)
+L(top):	extr	x9, x6, x5, #63
+	mul	x10, x17, x17
+	ldr	x16, [up,#-8]
+	adcs	x13, x9, x11
+	ldp	x4, x5, [tp,#-16]
+	umulh	x11, x17, x17
+	extr	x8, x7, x6, #63
+	stp	x12, x13, [rp],#16
+	adcs	x12, x8, x10
+L(mid):	extr	x9, x4, x7, #63
+	mul	x10, x16, x16
+	ldr	x17, [up],#16
+	adcs	x13, x9, x11
+	ldp	x6, x7, [tp],#32
+	umulh	x11, x16, x16
+	extr	x8, x5, x4, #63
+	stp	x12, x13, [rp],#16
+	adcs	x12, x8, x10
+	sub	x18, x18, #1
+	cbnz	x18, L(top)
+
+L(end):	extr	x9, x6, x5, #63
+	mul	x10, x17, x17
+	adcs	x13, x9, x11
+	umulh	x11, x17, x17
+	extr	x8, x7, x6, #63
+	stp	x12, x13, [rp]
+	adcs	x12, x8, x10
+	extr	x9, xzr, x7, #63
+	adcs	x13, x9, x11
+	stp	x12, x13, [rp,#16]
+
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/arm64/xgene1/gmp-mparam.h b/third_party/gmp/mpn/arm64/xgene1/gmp-mparam.h
new file mode 100644
index 0000000..7cc3cb3
--- /dev/null
+++ b/third_party/gmp/mpn/arm64/xgene1/gmp-mparam.h
@@ -0,0 +1,181 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2400 MHz AppliedMicro X-Gene */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-09-28, gcc 4.8 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1  /* 2.00% faster than 2 */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1  /* 37.38% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD              14
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             249
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                61
+#define MUL_TOOM44_THRESHOLD               112
+#define MUL_TOOM6H_THRESHOLD               242
+#define MUL_TOOM8H_THRESHOLD               321
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     109
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      72
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     106
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                 81
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                214
+#define SQR_TOOM8_THRESHOLD                284
+
+#define MULMID_TOOM42_THRESHOLD             46
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             412  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    412, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {     19, 7}, {     12, 6}, {     25, 7}, {     17, 8}, \
+    {      9, 7}, {     20, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     99,10}, {     55,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     71, 9}, {    143,10}, {     79,11}, \
+    {     47,10}, {    103,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135, 9}, {    271,10}, \
+    {    143,11}, {     79, 9}, {    319,10}, {    167, 9}, \
+    {    351,11}, {     95, 9}, {    383, 8}, {    767,10}, \
+    {    207, 9}, {    415,11}, {    111,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    143,10}, {    287, 9}, {    575,10}, \
+    {    319, 9}, {    639,10}, {    351,12}, {     95,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 98
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     19, 7}, {     10, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95, 9}, {    191,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    135, 9}, {    271,11}, {     79, 9}, {    319, 8}, \
+    {    639,10}, {    175,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    319, 9}, {    639,11}, \
+    {    175,10}, {    351,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  45
+#define MULLO_MUL_N_THRESHOLD             8648
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 108
+#define SQRLO_SQR_THRESHOLD               6461
+
+#define DC_DIV_QR_THRESHOLD                 64
+#define DC_DIVAPPR_Q_THRESHOLD             222
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                132
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD               242
+#define INV_APPR_THRESHOLD                 222
+
+#define BINV_NEWTON_THRESHOLD              254
+#define REDC_1_TO_REDC_N_THRESHOLD          66
+
+#define MU_DIV_QR_THRESHOLD               1234
+#define MU_DIVAPPR_Q_THRESHOLD            1234
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1210
+#define MU_BDIV_Q_THRESHOLD               1234
+
+#define POWM_SEC_TABLE  3,23,194,712,2499
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        22
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      2503
+
+#define FAC_DSC_THRESHOLD                  216
+#define FAC_ODD_THRESHOLD                   26
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD2_DIV1_METHOD                    5  /* 2.01% faster than 3 */
+#define HGCD_THRESHOLD                     122
+#define HGCD_APPR_THRESHOLD                171
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   541
+#define GCDEXT_DC_THRESHOLD                386
+#define JACOBI_BASE_METHOD                   4  /* 7.46% faster than 1 */
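
These constants feed GMP's internal size-based algorithm dispatch. A hypothetical sketch of the shape in which such thresholds are consumed (`pick_mul_algorithm` and its hard-coded copies of two values above are illustrative, not GMP code):

```c
#include <stddef.h>

#define MUL_TOOM22_THRESHOLD 18    /* as tuned above for X-Gene */
#define MUL_TOOM33_THRESHOLD 61

const char *pick_mul_algorithm(size_t n)
{
    if (n < MUL_TOOM22_THRESHOLD) return "basecase";
    if (n < MUL_TOOM33_THRESHOLD) return "toom22";
    return "toom33 or larger";     /* further thresholds continue the ladder */
}
```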
diff --git a/third_party/gmp/mpn/asm-defs.m4 b/third_party/gmp/mpn/asm-defs.m4
new file mode 100644
index 0000000..7b7e53e
--- /dev/null
+++ b/third_party/gmp/mpn/asm-defs.m4
@@ -0,0 +1,1766 @@
+divert(-1)
+dnl
+dnl  m4 macros for gmp assembly code, shared by all CPUs.
+
+dnl  Copyright 1999-2006, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  These macros are designed for use with any m4 and have been used on
+dnl  GNU, FreeBSD, NetBSD, OpenBSD and SysV.
+dnl
+dnl  GNU m4 and OpenBSD 2.7 m4 will give filenames and line numbers in error
+dnl  messages.
+dnl
+dnl
+dnl  Macros:
+dnl
+dnl  Most new m4-specific macros have an "m4_" prefix to emphasise they're
+dnl  m4 expansions.  But new macros that define things, like deflit() and
+dnl  defreg(), are named like the builtin define(), and forloop() is named
+dnl  following the GNU m4 example on which it's based.
+dnl
+dnl  GNU m4 with the -P option uses "m4_" as a prefix for builtins, but that
+dnl  option isn't going to be used, so there's no conflict or confusion.
+dnl
+dnl
+dnl  Comments in output:
+dnl
+dnl  The m4 comment delimiters are left at # and \n, the normal assembler
+dnl  commenting for most CPUs.  m4 passes comment text through without
+dnl  expanding macros in it, which is generally a good thing since it stops
+dnl  unexpected expansions and possible resultant errors.
+dnl
+dnl  But note that when a quoted string is being read, a # isn't special, so
+dnl  apostrophes in comments in quoted strings must be avoided or they'll be
+dnl  interpreted as a closing quote mark.  But when the quoted text is
+dnl  re-read # will still act like a normal comment, suppressing macro
+dnl  expansion.
+dnl
+dnl  For example,
+dnl
+dnl          # apostrophes in comments that're outside quotes are ok
+dnl          # and using macro names like PROLOGUE is ok too
+dnl          ...
+dnl          ifdef(`PIC',`
+dnl                  # but apostrophes aren't ok inside quotes
+dnl                  #                     ^--wrong
+dnl                  ...
+dnl                  # though macro names like PROLOGUE are still ok
+dnl                  ...
+dnl          ')
+dnl
+dnl  If macro expansion in a comment is wanted, use `#' in the .asm (ie. a
+dnl  quoted hash symbol), which will turn into # in the .s but get
+dnl  expansions done on that line.  This can make the .s more readable to
+dnl  humans, but it won't make a blind bit of difference to the assembler.
+dnl
+dnl  All the above applies, mutatis mutandis, when changecom() is used to
+dnl  select @ ! ; or whatever other commenting.
+dnl
+dnl
+dnl  Variations in m4 affecting gmp:
+dnl
+dnl  $# - When a macro is called as "foo" with no brackets, BSD m4 sets $#
+dnl       to 1, whereas GNU or SysV m4 set it to 0.  In all cases though
+dnl       "foo()" sets $# to 1.  This is worked around in various places.
+dnl
+dnl  len() - When "len()" is given an empty argument, BSD m4 evaluates to
+dnl       nothing, whereas GNU, SysV, and the new OpenBSD, evaluate to 0.
+dnl       See m4_length() below which works around this.
+dnl
+dnl  translit() - GNU m4 accepts character ranges like A-Z, and the new
+dnl       OpenBSD m4 does under option -g, but basic BSD and SysV don't.
+dnl
+dnl  popdef() - in BSD and SysV m4 popdef() takes multiple arguments and
+dnl       pops each, but GNU m4 only takes one argument.
+dnl
+dnl  push back - BSD m4 has some limits on the amount of text that can be
+dnl       pushed back.  The limit is reasonably big and so long as macros
+dnl       don't gratuitously duplicate big arguments it isn't a problem.
+dnl       Normally an error message is given, but sometimes it just hangs.
+dnl
+dnl  eval() &,|,^ - GNU and SysV m4 have bitwise operators &,|,^ available,
+dnl       but BSD m4 doesn't (contrary to what the man page suggests) and
+dnl       instead ^ is exponentiation.
+dnl
+dnl  eval() ?: - The C ternary operator "?:" is available in BSD m4, but not
+dnl       in SysV or GNU m4 (as of GNU m4 1.4 and betas of 1.5).
+dnl
+dnl  eval() -2^31 - BSD m4 has a bug where an eval() resulting in -2^31
+dnl       (ie. -2147483648) gives "-(".  Using -2147483648 within an
+dnl       expression is ok, it just can't be a final result.  "-(" will of
+dnl       course upset parsing, with all sorts of strange effects.
+dnl
+dnl  eval() <<,>> - SysV m4 doesn't support shift operators in eval() (on
+dnl       Solaris 7 /usr/xpg4/m4 has them but /usr/ccs/m4 doesn't).  See
+dnl       m4_lshift() and m4_rshift() below for workarounds.
+dnl
+dnl  ifdef() - OSF 4.0 m4 considers a macro defined to a zero value `0' or
+dnl       `00' etc as not defined.  See m4_ifdef below for a workaround.
+dnl
+dnl  m4wrap() sequence - in BSD m4, m4wrap() replaces any previous m4wrap()
+dnl       string, in SysV m4 it appends to it, and in GNU m4 it prepends.
+dnl       See m4wrap_prepend() below which brings uniformity to this.
+dnl
+dnl  m4wrap() 0xFF - old versions of BSD m4 store EOF in a C "char" under an
+dnl       m4wrap() and on systems where char is unsigned by default a
+dnl       spurious 0xFF is output.  This has been observed on recent Cray
+dnl       Unicos Alpha, Apple MacOS X, and HPUX 11 systems.  An autoconf
+dnl       test is used to check for this, see the m4wrap handling below.  It
+dnl       might work to end the m4wrap string with a dnl to consume the
+dnl       0xFF, but that probably induces the offending m4's to read from an
+dnl       already closed "FILE *", which could be bad on a glibc style
+dnl       stdio.
+dnl
+dnl  __file__,__line__ - GNU m4 and OpenBSD 2.7 m4 provide these, and
+dnl       they're used here to make error messages more informative.  GNU m4
+dnl       gives an unhelpful "NONE 0" in an m4wrap(), but that's worked
+dnl       around.
+dnl
+dnl  __file__ quoting - OpenBSD m4, unlike GNU m4, doesn't quote the
+dnl       filename in __file__, so care should be taken that no macro has
+dnl       the same name as a file, or an unwanted expansion will occur when
+dnl       printing an error or warning.
+dnl
+dnl  changecom() - BSD m4 changecom doesn't quite work like the man page
+dnl       suggests, in particular "changecom" or "changecom()" doesn't
+dnl       disable the comment feature, and multi-character comment sequences
+dnl       don't seem to work.  If the default `#' and newline aren't
+dnl       suitable it's necessary to change it to something else,
+dnl       eg. changecom(;).
+dnl
+dnl  OpenBSD 2.6 m4 - in this m4, eval() rejects decimal constants containing
+dnl       an 8 or 9, making it pretty much unusable.  The bug is confined to
+dnl       version 2.6 (it's not in 2.5, and was fixed in 2.7).
+dnl
+dnl  SunOS /usr/bin/m4 - this m4 lacks a number of desired features,
+dnl       including $# and $@, defn(), m4exit(), m4wrap(), pushdef(),
+dnl       popdef().  /usr/5bin/m4 is a SysV style m4 which should always be
+dnl       available, and "configure" will reject /usr/bin/m4 in favour of
+dnl       /usr/5bin/m4 (if necessary).
+dnl
+dnl       The sparc code actually has modest m4 requirements currently and
+dnl       could manage with /usr/bin/m4, but there's no reason to put our
+dnl       macros through contortions when /usr/5bin/m4 is available or GNU
+dnl       m4 can be installed.
+
+
+ifdef(`__ASM_DEFS_M4_INCLUDED__',
+`m4_error(`asm-defs.m4 already included, dont include it twice
+')m4exit(1)')
+define(`__ASM_DEFS_M4_INCLUDED__')
+
+
+dnl  Detect and give a message about the unsuitable OpenBSD 2.6 m4.
+
+ifelse(eval(89),89,,
+`errprint(
+`This m4 doesnt accept 8 and/or 9 in constants in eval(), making it unusable.
+This is probably OpenBSD 2.6 m4 (September 1999).  Upgrade to OpenBSD 2.7,
+or get a bug fix from the CVS (expr.c rev 1.9), or get GNU m4.  Dont forget
+to configure with M4=/wherever/m4 if you install one of these in a directory
+not in $PATH.
+')m4exit(1)')
+
+
+dnl  Detect and give a message about the unsuitable SunOS /usr/bin/m4.
+dnl
+dnl  Unfortunately this test doesn't work when m4 is run in the normal way
+dnl  from mpn/Makefile with "m4 -DOPERATION_foo foo.asm", since the bad m4
+dnl  takes "-" in "-D..." to mean read stdin, so it will look like it just
+dnl  hangs.  But running "m4 asm-defs.m4" to try it out will work.
+dnl
+dnl  We'd like to abort immediately on finding a problem, but unfortunately
+dnl  the bad m4 doesn't have an m4exit(), nor does an invalid eval() kill
+dnl  it.  Unexpanded $#'s in some m4_assert_numargs() later on will comment
+dnl  out some closing parentheses and kill it with "m4: arg stack overflow".
+
+define(m4_dollarhash_works_test,``$#'')
+ifelse(m4_dollarhash_works_test(x),1,,
+`errprint(
+`This m4 doesnt support $# and cant be used for GMP asm processing.
+If this is on SunOS, ./configure should choose /usr/5bin/m4 if you have that
+or can get it, otherwise install GNU m4.  Dont forget to configure with
+M4=/wherever/m4 if you install in a directory not in $PATH.
+')')
+undefine(`m4_dollarhash_works_test')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Basic error handling things.
+
+
+dnl  Usage: m4_dollarhash_1_if_noparen_p
+dnl
+dnl  Expand to 1 if a call "foo" gives $# set to 1 (as opposed to 0 like GNU
+dnl  and SysV m4 give).
+
+define(m4_dollarhash_1_if_noparen_test,`$#')
+define(m4_dollarhash_1_if_noparen_p,
+eval(m4_dollarhash_1_if_noparen_test==1))
+undefine(`m4_dollarhash_1_if_noparen_test')
+
+
+dnl  Usage: m4wrap_prepend(string)
+dnl
+dnl  Prepend the given string to what will be expanded under m4wrap at the
+dnl  end of input.
+dnl
+dnl  This macro exists to work around variations in m4wrap() behaviour in
+dnl  the various m4s (notes at the start of this file).  Don't use m4wrap()
+dnl  directly since it will interfere with this scheme.
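+dnl
+dnl  For example, the end-of-input check for open PROLOGUEs later in this
+dnl  file is queued with
+dnl
+dnl         m4wrap_prepend(`PROLOGUE_check(PROLOGUE_list)')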
+
+define(m4wrap_prepend,
+m4_assert_numargs(1)
+`define(`m4wrap_string',`$1'defn(`m4wrap_string'))')
+
+define(m4wrap_string,`')
+
+define(m4wrap_works_p,
+`ifelse(M4WRAP_SPURIOUS,yes,0,1)')
+
+ifelse(m4wrap_works_p,1,
+`m4wrap(`m4wrap_string')')
+
+
+dnl  Usage: m4_file_and_line
+dnl
+dnl  Expand to the current file and line number, if the GNU m4 extensions
+dnl  __file__ and __line__ are available.
+dnl
+dnl  In GNU m4 1.4 at the end of input when m4wrap text is expanded,
+dnl  __file__ is NONE and __line__ is 0, which is not a helpful thing to
+dnl  print.  If m4_file_seen() has been called to note the last file seen,
+dnl  then that file at a big line number is used, otherwise "end of input"
+dnl  is used (although "end of input" won't parse as an error message).
+
+define(m4_file_and_line,
+`ifdef(`__file__',
+`ifelse(__file__`'__line__,`NONE0',
+`ifdef(`m4_file_seen_last',`m4_file_seen_last: 999999: ',`end of input: ')',
+`__file__: __line__: ')')')
+
+
+dnl  Usage: m4_errprint_commas(arg,...)
+dnl
+dnl  The same as errprint(), but commas are printed between arguments
+dnl  instead of spaces.
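+dnl
+dnl  For example, m4_errprint_commas(`a',`b',`c') prints a,b,c.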
+
+define(m4_errprint_commas,
+`errprint(`$1')dnl
+ifelse(eval($#>1),1,`errprint(`,')m4_errprint_commas(shift($@))')')
+
+
+dnl  Usage: m4_error(args...)
+dnl         m4_warning(args...)
+dnl
+dnl  Print an error message, using m4_errprint_commas, prefixed with the
+dnl  current filename and line number (if available).  m4_error sets up to
+dnl  give an error exit at the end of processing, m4_warning just prints.
+dnl  These macros are the recommended way to print errors.
+dnl
+dnl  The arguments here should be quoted in the usual way to prevent them
+dnl  being expanded when the macro call is read.  (m4_error takes care not
+dnl  to do any further expansion.)
+dnl
+dnl  For example,
+dnl
+dnl         m4_error(`some error message
+dnl         ')
+dnl
+dnl  which prints
+dnl
+dnl         foo.asm:123: some error message
+dnl
+dnl  or if __file__ and __line__ aren't available
+dnl
+dnl         some error message
+dnl
+dnl  The "file:line:" format is a basic style, used by gcc and GNU m4, so
+dnl  emacs and other editors will recognise it in their normal error message
+dnl  parsing.
+
+define(m4_warning,
+`m4_errprint_commas(m4_file_and_line`'$@)')
+
+define(m4_error,
+`define(`m4_error_occurred',1)m4_warning($@)dnl
+ifelse(m4wrap_works_p,0,`m4exit(1)')')
+
+define(`m4_error_occurred',0)
+
+dnl  This m4wrap_prepend() is first, so it'll be executed last.
+m4wrap_prepend(
+`ifelse(m4_error_occurred,1,
+`m4_error(`Errors occurred during m4 processing
+')m4exit(1)')')
+
+
+dnl  Usage: m4_assert_numargs(num)
+dnl
+dnl  Put this unquoted on a line on its own at the start of a macro
+dnl  definition to add some code to check that num many arguments get passed
+dnl  to the macro.  For example,
+dnl
+dnl         define(foo,
+dnl         m4_assert_numargs(2)
+dnl         `something `$1' and `$2' blah blah')
+dnl
+dnl  Then a call like foo(one,two,three) will provoke an error like
+dnl
+dnl         file:10: foo expected 2 arguments, got 3 arguments
+dnl
+dnl  Here are some calls and how many arguments they're interpreted as passing.
+dnl
+dnl         foo(abc,def)  2
+dnl         foo(xyz)      1
+dnl         foo()         0
+dnl         foo          -1
+dnl
+dnl  The -1 for no parentheses at all means a macro that's meant to be used
+dnl  that way can be checked with m4_assert_numargs(-1).  For example,
+dnl
+dnl         define(SPECIAL_SUFFIX,
+dnl         m4_assert_numargs(-1)
+dnl         `ifdef(`FOO',`_foo',`_bar')')
+dnl
+dnl  But as an alternative see also deflit() below where parenthesized
+dnl  expressions following a macro are passed through to the output.
+dnl
+dnl  Note that in BSD m4 there's no way to differentiate calls "foo" and
+dnl  "foo()", so in BSD m4 the distinction between the two isn't enforced.
+dnl  (In GNU and SysV m4 it can be checked, and is.)
+
+
+dnl  m4_assert_numargs is able to check its own arguments by calling
+dnl  assert_numargs_internal directly.
+dnl
+dnl  m4_doublequote($`'0) expands to ``$0'', whereas ``$`'0'' would expand
+dnl  to `$`'0' and do the wrong thing, and likewise for $1.  The same is
+dnl  done in other assert macros.
+dnl
+dnl  $`#' leaves $# in the new macro being defined, and stops # being
+dnl  interpreted as a comment character.
+dnl
+dnl  `dnl ' means an explicit dnl isn't necessary when m4_assert_numargs is
+dnl  used.  The space means that if there is a dnl it'll still work.
+
+dnl  Usage: m4_doublequote(x) expands to ``x''
+define(m4_doublequote,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))``$1''')
+
+define(m4_assert_numargs,
+`m4_assert_numargs_internal(`$0',1,$#,len(`$1'))dnl
+`m4_assert_numargs_internal'(m4_doublequote($`'0),$1,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl  Called: m4_assert_numargs_internal(`macroname',wantargs,$#,len(`$1'))
+define(m4_assert_numargs_internal,
+`m4_assert_numargs_internal_check(`$1',`$2',m4_numargs_count(`$3',`$4'))')
+
+dnl  Called: m4_assert_numargs_internal_check(`macroname',wantargs,gotargs)
+dnl
+dnl  If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl  should be -1.  If wantargs is -1 but gotargs is 0 and the two can't be
+dnl  distinguished then it's allowed to pass.
+dnl
+define(m4_assert_numargs_internal_check,
+`ifelse(eval($2 == $3
+             || ($2==-1 && $3==0 && m4_dollarhash_1_if_noparen_p)),0,
+`m4_error(`$1 expected 'm4_Narguments(`$2')`, got 'm4_Narguments(`$3')
+)')')
+
+dnl  Called: m4_numargs_count($#,len(`$1'))
+dnl  If $#==0 then -1 args, if $#==1 but len(`$1')==0 then 0 args, otherwise
+dnl  $# args.
+define(m4_numargs_count,
+`ifelse($1,0, -1,
+`ifelse(eval($1==1 && $2-0==0),1, 0, $1)')')
+
+dnl  Usage: m4_Narguments(N)
+dnl  "$1 argument" or "$1 arguments" with the plural according to $1.
+define(m4_Narguments,
+`$1 argument`'ifelse(`$1',1,,s)')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Additional error checking things.
+
+
+dnl  Usage: m4_file_seen()
+dnl
+dnl  Record __file__ for the benefit of m4_file_and_line in m4wrap text.
+dnl
+dnl  The basic __file__ macro comes out quoted in GNU m4, like `foo.asm',
+dnl  and m4_file_seen_last is defined like that too.
+dnl
+dnl  This is used by PROLOGUE, since that's normally in the main .asm file,
+dnl  and in particular it sets up m4wrap error checks for missing EPILOGUE.
+
+define(m4_file_seen,
+m4_assert_numargs(0)
+`ifelse(__file__,`NONE',,
+`define(`m4_file_seen_last',m4_doublequote(__file__))')')
+
+
+dnl  Usage: m4_assert_onearg()
+dnl
+dnl  Put this, unquoted, at the start of a macro definition to add some code
+dnl  to check that one argument is passed to the macro, but with that
+dnl  argument allowed to be empty.  For example,
+dnl
+dnl          define(foo,
+dnl          m4_assert_onearg()
+dnl          `blah blah $1 blah blah')
+dnl
+dnl  Calls "foo(xyz)" or "foo()" are accepted.  A call "foo(xyz,abc)" fails.
+dnl  A call "foo" fails too, but BSD m4 can't detect this case (GNU and SysV
+dnl  m4 can).
+
+define(m4_assert_onearg,
+m4_assert_numargs(0)
+`m4_assert_onearg_internal'(m4_doublequote($`'0),$`#')`dnl ')
+
+dnl  Called: m4_assert_onearg_internal(`macroname',$#)
+define(m4_assert_onearg_internal,
+`ifelse($2,1,,
+`m4_error(`$1 expected 1 argument, got 'm4_Narguments(`$2')
+)')')
+
+
+dnl  Usage: m4_assert_numargs_range(low,high)
+dnl
+dnl  Put this, unquoted, at the start of a macro definition to add some code
+dnl  to check that between low and high many arguments get passed to the
+dnl  macro.  For example,
+dnl
+dnl         define(foo,
+dnl         m4_assert_numargs_range(3,5)
+dnl         `mandatory $1 $2 $3 optional $4 $5 end')
+dnl
+dnl  See m4_assert_numargs() for more info.
+
+define(m4_assert_numargs_range,
+m4_assert_numargs(2)
+``m4_assert_numargs_range_internal'(m4_doublequote($`'0),$1,$2,$`#',`len'(m4_doublequote($`'1)))`dnl '')
+
+dnl  Called: m4_assert_numargs_range_internal(`name',low,high,$#,len(`$1'))
+define(m4_assert_numargs_range_internal,
+m4_assert_numargs(5)
+`m4_assert_numargs_range_check(`$1',`$2',`$3',m4_numargs_count(`$4',`$5'))')
+
+dnl  Called: m4_assert_numargs_range_check(`name',low,high,gotargs)
+dnl
+dnl  If m4_dollarhash_1_if_noparen_p (BSD m4) then gotargs can be 0 when it
+dnl  should be -1.  To ensure a `high' of -1 works, a fudge is applied to
+dnl  gotargs if it's 0 and the 0 and -1 cases can't be distinguished.
+dnl
+define(m4_assert_numargs_range_check,
+m4_assert_numargs(4)
+`ifelse(eval($2 <= $4 &&
+             ($4 - ($4==0 && m4_dollarhash_1_if_noparen_p) <= $3)),0,
+`m4_error(`$1 expected $2 to $3 arguments, got 'm4_Narguments(`$4')
+)')')
+
+
+dnl  Usage: m4_assert_defined(symbol)
+dnl
+dnl  Put this unquoted on a line of its own at the start of a macro
+dnl  definition to add some code to check that the given symbol is defined
+dnl  when the macro is used.  For example,
+dnl
+dnl          define(foo,
+dnl          m4_assert_defined(`FOO_PREFIX')
+dnl          `FOO_PREFIX whatever')
+dnl
+dnl  This is a convenient way to check that the user or ./configure or
+dnl  whatever has defined the things needed by a macro, as opposed to
+dnl  silently generating garbage.
+
+define(m4_assert_defined,
+m4_assert_numargs(1)
+``m4_assert_defined_internal'(m4_doublequote($`'0),``$1'')`dnl '')
+
+dnl  Called: m4_assert_defined_internal(`macroname',`define_required')
+define(m4_assert_defined_internal,
+m4_assert_numargs(2)
+`m4_ifdef(`$2',,
+`m4_error(`$1 needs $2 defined
+')')')
+
+
+dnl  Usage: m4_not_for_expansion(`SYMBOL')
+dnl         define_not_for_expansion(`SYMBOL')
+dnl
+dnl  m4_not_for_expansion turns SYMBOL, if defined, into something which
+dnl  will give an error if expanded.  For example,
+dnl
+dnl         m4_not_for_expansion(`PIC')
+dnl
+dnl  define_not_for_expansion is the same, but always makes a definition.
+dnl
+dnl  These are for symbols that should be tested with ifdef(`FOO',...)
+dnl  rather than be expanded as such.  They guard against accidentally
+dnl  omitting the quotes, as in ifdef(FOO,...).  Note though that they only
+dnl  catch this when FOO is defined, so be sure to test code both with and
+dnl  without each definition.
+
+define(m4_not_for_expansion,
+m4_assert_numargs(1)
+`ifdef(`$1',`define_not_for_expansion(`$1')')')
+
+define(define_not_for_expansion,
+m4_assert_numargs(1)
+`ifelse(defn(`$1'),,,
+`m4_error(``$1' has a non-empty value, maybe it shouldnt be munged with m4_not_for_expansion()
+')')dnl
+define(`$1',`m4_not_for_expansion_internal(`$1')')')
+
+define(m4_not_for_expansion_internal,
+`m4_error(``$1' is not meant to be expanded, perhaps you mean `ifdef(`$1',...)'
+')')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Various generic m4 things.
+
+
+dnl  Usage: m4_unquote(macro)
+dnl
+dnl  Allow the argument text to be re-evaluated.  This is useful for "token
+dnl  pasting" like m4_unquote(foo`'bar).
+
+define(m4_unquote,
+m4_assert_onearg()
+`$1')
+
+
+dnl  Usage: m4_ifdef(name,yes[,no])
+dnl
+dnl  Expand to the yes argument if name is defined, or to the no argument if
+dnl  not.
+dnl
+dnl  This is the same as the builtin "ifdef", but avoids an OSF 4.0 m4 bug
+dnl  in which a macro with a zero value `0' or `00' etc is considered not
+dnl  defined.
+dnl
+dnl  There's no particular need to use this everywhere, only if there might
+dnl  be a zero value.
+
+define(m4_ifdef,
+m4_assert_numargs_range(2,3)
+`ifelse(eval(ifdef(`$1',1,0)+m4_length(defn(`$1'))),0,
+`$3',`$2')')
+
+
+dnl  Usage: m4_ifdef_anyof_p(`symbol',...)
+dnl
+dnl  Expand to 1 if any of the symbols in the argument list are defined, or
+dnl  to 0 if not.
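+dnl
+dnl  For example, m4_ifdef_anyof_p(`PIC',`DLL_EXPORT') is 1 if either of
+dnl  those symbols is defined.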
+
+define(m4_ifdef_anyof_p,
+`ifelse(eval($#<=1 && m4_length(`$1')==0),1, 0,
+`ifdef(`$1', 1,
+`m4_ifdef_anyof_p(shift($@))')')')
+
+
+dnl  Usage: m4_length(string)
+dnl
+dnl  Determine the length of a string.  This is the same as len(), but
+dnl  always expands to a number, working around the BSD len() which
+dnl  evaluates to nothing given an empty argument.
+
+define(m4_length,
+m4_assert_onearg()
+`eval(len(`$1')-0)')
+
+
+dnl  Usage: m4_stringequal_p(x,y)
+dnl
+dnl  Expand to 1 or 0 according as strings x and y are equal or not.
+
+define(m4_stringequal_p,
+`ifelse(`$1',`$2',1,0)')
+
+
+dnl  Usage: m4_incr_or_decr(n,last)
+dnl
+dnl  Do an incr(n) or decr(n), whichever is in the direction of "last".
+dnl  Both n and last must be numbers of course.
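+dnl
+dnl  For example, m4_incr_or_decr(5,10) gives 6, and m4_incr_or_decr(5,0)
+dnl  gives 4.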
+
+define(m4_incr_or_decr,
+m4_assert_numargs(2)
+`ifelse(eval($1<$2),1,incr($1),decr($1))')
+
+
+dnl  Usage: forloop(i, first, last, statement)
+dnl
+dnl  Based on GNU m4 examples/forloop.m4, but extended.
+dnl
+dnl  statement is expanded repeatedly, with i successively defined as
+dnl
+dnl         first, first+1, ..., last-1, last
+dnl
+dnl  Or if first > last, then it's
+dnl
+dnl         first, first-1, ..., last+1, last
+dnl
+dnl  If first == last, then one expansion is done.
+dnl
+dnl  A pushdef/popdef of i is done to preserve any previous definition (or
+dnl  lack of definition).  first and last are eval()ed and so can be
+dnl  expressions.
+dnl
+dnl  forloop_first is defined to 1 on the first iteration, 0 on the rest.
+dnl  forloop_last is defined to 1 on the last iteration, 0 on the others.
+dnl  Nested forloops are allowed, in which case forloop_first and
+dnl  forloop_last apply to the innermost loop that's open.
+dnl
+dnl  A simple example,
+dnl
+dnl         forloop(i, 1, 2*2+1, `dnl
+dnl         iteration number i ... ifelse(forloop_first,1,FIRST)
+dnl         ')
+
+
+dnl  "i" and "statement" are carefully quoted, but "first" and "last" are
+dnl  just plain numbers once eval()ed.
+
+define(`forloop',
+m4_assert_numargs(4)
+`pushdef(`$1',eval(`$2'))dnl
+pushdef(`forloop_first',1)dnl
+pushdef(`forloop_last',0)dnl
+forloop_internal(`$1',eval(`$3'),`$4')`'dnl
+popdef(`forloop_first')dnl
+popdef(`forloop_last')dnl
+popdef(`$1')')
+
+dnl  Called: forloop_internal(`var',last,statement)
+define(`forloop_internal',
+m4_assert_numargs(3)
+`ifelse($1,$2,
+`define(`forloop_last',1)$3',
+`$3`'dnl
+define(`forloop_first',0)dnl
+define(`$1',m4_incr_or_decr($1,$2))dnl
+forloop_internal(`$1',$2,`$3')')')
+
+
+dnl  Usage: foreach(var,body, item1,item2,...,itemN)
+dnl
+dnl  For each "item" argument, define "var" to that value and expand "body".
+dnl  For example,
+dnl
+dnl         foreach(i, `something i
+dnl         ', one, two)
+dnl  gives
+dnl         something one
+dnl         something two
+dnl
+dnl  Any previous definition of "var", or lack thereof, is saved and
+dnl  restored.  Empty "item"s are not allowed.
+
+define(foreach,
+m4_assert_numargs_range(2,1000)
+`ifelse(`$3',,,
+`pushdef(`$1',`$3')$2`'popdef(`$1')dnl
+foreach(`$1',`$2',shift(shift(shift($@))))')')
+
+
+dnl  Usage: m4_toupper(x)
+dnl         m4_tolower(x)
+dnl
+dnl  Convert the argument string to upper or lower case, respectively.
+dnl  Only one argument accepted.
+dnl
+dnl  BSD m4 doesn't take ranges like a-z in translit(), so the full alphabet
+dnl  is written out.
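+dnl
+dnl  For example, m4_toupper(`gmp') expands to GMP.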
+
+define(m4_alphabet_lower, `abcdefghijklmnopqrstuvwxyz')
+define(m4_alphabet_upper, `ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
+define(m4_toupper,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_lower, m4_alphabet_upper)')
+
+define(m4_tolower,
+m4_assert_onearg()
+`translit(`$1', m4_alphabet_upper, m4_alphabet_lower)')
+
+
+dnl  Usage: m4_empty_if_zero(x)
+dnl
+dnl  Evaluate to x, or to nothing if x is 0.  x is eval()ed and so can be an
+dnl  expression.
+dnl
+dnl  This is useful for x86 addressing mode displacements since forms like
+dnl  (%ebx) are one byte shorter than 0(%ebx).  A macro `foo' for use as
+dnl  foo(%ebx) could be defined with the following so it'll be empty if the
+dnl  expression comes out zero.
+dnl
+dnl	   deflit(`foo', `m4_empty_if_zero(a+b*4-c)')
+dnl
+dnl  Naturally this shouldn't be done if, say, a computed jump depends on
+dnl  the code being a particular size.
+
+define(m4_empty_if_zero,
+m4_assert_onearg()
+`ifelse(eval($1),0,,eval($1))')
+
+
+dnl  Usage: m4_log2(x)
+dnl
+dnl  Calculate a logarithm to base 2.
+dnl  x must be an integral power of 2, between 2**0 and 2**30.
+dnl  x is eval()ed, so it can be an expression.
+dnl  An error results if x is invalid.
+dnl
+dnl  2**31 isn't supported, because an unsigned 2147483648 is out of range
+dnl  of a 32-bit signed int.  Also, the bug in BSD m4 where an eval()
+dnl  resulting in 2147483648 (or -2147483648 as the case may be) gives `-('
+dnl  means tests like eval(1<<31==(x)) would be necessary, but that then
+dnl  gives an unattractive explosion of eval() error messages if x isn't
+dnl  numeric.
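+dnl
+dnl  For example, m4_log2(8) expands to 3.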
+
+define(m4_log2,
+m4_assert_numargs(1)
+`m4_log2_internal(0,1,eval(`$1'))')
+
+dnl  Called: m4_log2_internal(n,2**n,target)
+define(m4_log2_internal,
+m4_assert_numargs(3)
+`ifelse($2,$3,$1,
+`ifelse($1,30,
+`m4_error(`m4_log2() argument too big or not a power of two: $3
+')',
+`m4_log2_internal(incr($1),eval(2*$2),$3)')')')
+
+
+dnl  Usage:  m4_div2_towards_zero
+dnl
+dnl  m4 division is probably whatever a C signed division is, and C before
+dnl  C99 doesn't specify what rounding gets used on negatives, so this
+dnl  expression forces a rounding towards zero.
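+dnl
+dnl  For example, m4_div2_towards_zero(-5) gives -2, where a rounding
+dnl  towards minus infinity would give -3.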
+
+define(m4_div2_towards_zero,
+m4_assert_numargs(1)
+`eval((($1) + ((($1)<0) & ($1))) / 2)')
+
+
+dnl  Usage: m4_lshift(n,count)
+dnl         m4_rshift(n,count)
+dnl
+dnl  Calculate n shifted left or right by count many bits.  Both n and count
+dnl  are eval()ed and so can be expressions.
+dnl
+dnl  Negative counts are allowed and mean a shift in the opposite direction.
+dnl  Negative n is allowed and right shifts will be arithmetic (meaning
+dnl  divide by 2**count, rounding towards zero, also meaning the sign bit is
+dnl  duplicated).
+dnl
+dnl  Use these macros instead of << and >> in eval() since the basic ccs
+dnl  SysV m4 doesn't have those operators.
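+dnl
+dnl  For example, m4_lshift(1,4) gives 16, and m4_rshift(-8,1) gives -4.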
+
+define(m4_rshift,
+m4_assert_numargs(2)
+`m4_lshift(`$1',-(`$2'))')
+
+define(m4_lshift,
+m4_assert_numargs(2)
+`m4_lshift_internal(eval(`$1'),eval(`$2'))')
+
+define(m4_lshift_internal,
+m4_assert_numargs(2)
+`ifelse(eval($2-0==0),1,$1,
+`ifelse(eval($2>0),1,
+`m4_lshift_internal(eval($1*2),decr($2))',
+`m4_lshift_internal(m4_div2_towards_zero($1),incr($2))')')')
+
+
+dnl  Usage: m4_popcount(n)
+dnl
+dnl  Expand to the number of 1 bits in n.
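+dnl
+dnl  For example, m4_popcount(6) expands to 2.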
+
+define(m4_popcount,
+m4_assert_numargs(1)
+`m4_popcount_internal(0,eval(`$1'))')
+
+dnl  Called: m4_popcount_internal(count,rem)
+define(m4_popcount_internal,
+m4_assert_numargs(2)
+`ifelse($2,0,$1,
+`m4_popcount_internal(eval($1+($2%2)),eval($2/2))')')
+
+
+dnl  Usage: m4_count_trailing_zeros(N)
+dnl
+dnl  Determine the number of trailing zero bits on N.  N is eval()ed and so
+dnl  can be an expression.  If N is zero an error is generated.
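+dnl
+dnl  For example, m4_count_trailing_zeros(12) expands to 2.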
+
+define(m4_count_trailing_zeros,
+m4_assert_numargs(1)
+`m4_count_trailing_zeros_internal(eval(`$1'),0)')
+
+dnl  Called: m4_count_trailing_zeros_internal(val,count)
+define(m4_count_trailing_zeros_internal,
+m4_assert_numargs(2)
+`ifelse($1,0,
+`m4_error(`m4_count_trailing_zeros() given a zero value')',
+`ifelse(eval(($1)%2),1,`$2',
+`m4_count_trailing_zeros_internal(eval($1/2),incr($2))')')')
+
+
+dnl  Usage: deflit(name,value)
+dnl
+dnl  Like define(), but "name" expands like a literal, rather than taking
+dnl  arguments.  For example "name(%eax)" expands to "value(%eax)".
+dnl
+dnl  Limitations:
+dnl
+dnl  $ characters in the value part must have quotes to stop them looking
+dnl  like macro parameters.  For example, deflit(reg,`123+$`'4+567').  See
+dnl  defreg() below for handling simple register definitions like $7 etc.
+dnl
+dnl  "name()" is turned into "name", unfortunately.  In GNU and SysV m4 an
+dnl  error is generated when this happens, but in BSD m4 it will happen
+dnl  silently.  The problem is that in BSD m4 $# is 1 in both "name" or
+dnl  "name()", so there's no way to differentiate them.  Because we want
+dnl  plain "name" to turn into plain "value", we end up with "name()"
+dnl  turning into plain "value" too.
+dnl
+dnl  "name(foo)" will lose any whitespace after commas in "foo", for example
+dnl  "disp(%eax, %ecx)" would become "128(%eax,%ecx)".
+dnl
+dnl  These parentheses oddities shouldn't matter in assembler text, but if
+dnl  they do the suggested workaround is to write "name ()" or "name (foo)"
+dnl  to stop the parentheses looking like a macro argument list.  If a space
+dnl  isn't acceptable in the output, then write "name`'()" or "name`'(foo)".
+dnl  The `' is stripped when read, but again stops the parentheses looking
+dnl  like parameters.
+
+dnl  Quoting for deflit_emptyargcheck is similar to m4_assert_numargs.  The
+dnl  stuff in the ifelse gives a $#, $1 and $@ evaluated in the new macro
+dnl  created, not in deflit.
+define(deflit,
+m4_assert_numargs(2)
+`define(`$1',
+`deflit_emptyargcheck'(``$1'',$`#',m4_doublequote($`'1))`dnl
+$2`'dnl
+ifelse(eval($'`#>1 || m4_length('m4_doublequote($`'1)`)!=0),1,($'`@))')')
+
+dnl  Called: deflit_emptyargcheck(macroname,$#,`$1')
+define(deflit_emptyargcheck,
+`ifelse(eval($2==1 && !m4_dollarhash_1_if_noparen_p && m4_length(`$3')==0),1,
+`m4_error(`dont use a deflit as $1() because it loses the brackets (see deflit in asm-defs.m4 for more information)
+')')')
+
+
+dnl  Usage: m4_assert(`expr')
+dnl
+dnl  Test a compile-time requirement with an m4 expression.  The expression
+dnl  should be quoted, and will be eval()ed and expected to be non-zero.
+dnl  For example,
+dnl
+dnl         m4_assert(`FOO*2+6 < 14')
+
+define(m4_assert,
+m4_assert_numargs(1)
+`ifelse(eval($1),1,,
+`m4_error(`assertion failed: $1
+')')')
+
+
+dnl  Usage: m4_repeat(count,text)
+dnl
+dnl  Expand to the given repetitions of the given text.  A zero count is
+dnl  allowed, and expands to nothing.
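+dnl
+dnl  For example, m4_repeat(3,`x') expands to xxx.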
+
+define(m4_repeat,
+m4_assert_numargs(2)
+`m4_repeat_internal(eval($1),`$2')')
+
+define(m4_repeat_internal,
+m4_assert_numargs(2)
+`ifelse(`$1',0,,
+`forloop(m4_repeat_internal_counter,1,$1,``$2'')')')
+
+
+dnl  Usage: m4_hex_lowmask(bits)
+dnl
+dnl  Generate a hex constant which is a low mask of the given number of
+dnl  bits.  For example m4_hex_lowmask(10) would give 0x3ff.
+
+define(m4_hex_lowmask,
+m4_assert_numargs(1)
+`m4_cpu_hex_constant(m4_hex_lowmask_internal1(eval(`$1')))')
+
+dnl  Called: m4_hex_lowmask_internal1(bits)
+define(m4_hex_lowmask_internal1,
+m4_assert_numargs(1)
+`ifelse($1,0,`0',
+`m4_hex_lowmask_internal2(eval(($1)%4),eval(($1)/4))')')
+
+dnl  Called: m4_hex_lowmask_internal2(remainder,digits)
+define(m4_hex_lowmask_internal2,
+m4_assert_numargs(2)
+`ifelse($1,1,`1',
+`ifelse($1,2,`3',
+`ifelse($1,3,`7')')')dnl
+m4_repeat($2,`f')')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  The following m4_list functions take a list as multiple arguments.
+dnl  Arguments are evaluated multiple times, there's no attempt at strict
+dnl  quoting.  Empty list elements are not allowed, since an empty final
+dnl  argument is ignored.  These restrictions don't affect the current uses,
+dnl  and make the implementation easier.
+
+
+dnl  Usage: m4_list_quote(list,...)
+dnl
+dnl  Produce a list with quoted commas, so it can be a single argument
+dnl  string.  For instance m4_list_quote(a,b,c) gives
+dnl
+dnl         a`,'b`,'c`,'
+dnl
+dnl  This can be used to put a list in a define,
+dnl
+dnl         define(foolist, m4_list_quote(a,b,c))
+dnl
+dnl  Which can then be used for instance as
+dnl
+dnl         m4_list_find(target, foolist)
+
+define(m4_list_quote,
+`ifelse(`$1',,,
+`$1`,'m4_list_quote(shift($@))')')
+
+
+dnl  Usage: m4_list_find(key,list,...)
+dnl
+dnl  Evaluate to 1 or 0 according to whether key is in the list elements.
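+dnl
+dnl  For example, m4_list_find(b,a,b,c) gives 1, and m4_list_find(d,a,b,c)
+dnl  gives 0.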
+
+define(m4_list_find,
+m4_assert_numargs_range(1,1000)
+`ifelse(`$2',,0,
+`ifelse(`$1',`$2',1,
+`m4_list_find(`$1',shift(shift($@)))')')')
+
+
+dnl  Usage: m4_list_remove(key,list,...)
+dnl
+dnl  Evaluate to the given list with `key' removed (if present).
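+dnl
+dnl  For example, m4_list_remove(b,a,b,c) gives a,c, where the trailing
+dnl  comma is harmless since an empty final argument is ignored.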
+
+define(m4_list_remove,
+m4_assert_numargs_range(1,1000)
+`ifelse(`$2',,,
+`ifelse(`$1',`$2',,`$2,')dnl
+m4_list_remove(`$1',shift(shift($@)))')')
+
+
+dnl  Usage: m4_list_first(list,...)
+dnl
+dnl  Evaluate to the first element of the list (if any).
+
+define(m4_list_first,`$1')
+
+
+dnl  Usage: m4_list_count(list,...)
+dnl
+dnl  Evaluate to the number of elements in the list.  This can't just use $#
+dnl  because the last element might be empty.
+
+define(m4_list_count,
+`m4_list_count_internal(0,$@)')
+
+dnl  Called: m4_list_count_internal(count,list,...)
+define(m4_list_count_internal,
+m4_assert_numargs_range(1,1000)
+`ifelse(`$2',,$1,
+`m4_list_count_internal(eval($1+1),shift(shift($@)))')')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Various assembler things, not specific to any particular CPU.
+dnl
+
+
+dnl  Usage: include_mpn(`filename')
+dnl
+dnl  Like include(), but adds a path to the mpn source directory.  For
+dnl  example,
+dnl
+dnl         include_mpn(`sparc64/addmul_1h.asm')
+
+define(include_mpn,
+m4_assert_numargs(1)
+m4_assert_defined(`CONFIG_TOP_SRCDIR')
+`include(CONFIG_TOP_SRCDIR`/mpn/$1')')
+
+
+dnl  Usage: C comment ...
+dnl
+dnl  This works like a FORTRAN-style comment character.  It can be used for
+dnl  comments to the right of assembly instructions, where just dnl would
+dnl  remove the newline and concatenate adjacent lines.
+dnl
+dnl  C and/or dnl are useful when an assembler doesn't support comments, or
+dnl  where different assemblers for a particular CPU need different styles.
+dnl  The intermediate ".s" files will end up with no comments, just code.
+dnl
+dnl  Using C is not intended to cause offence to anyone who doesn't like
+dnl  FORTRAN; but if that happens it's an unexpected bonus.
+dnl
+dnl  During development, if comments are wanted in the .s files to help see
+dnl  what's expanding where, C can be redefined with something like
+dnl
+dnl         define(`C',`#')
+
+define(C, `
+dnl')
+
+
+dnl  Normally PIC is defined (or not) by libtool, but it doesn't set it on
+dnl  systems which are always PIC.  PIC_ALWAYS established in config.m4
+dnl  identifies these for us.
+
+ifelse(`PIC_ALWAYS',`yes',`define(`PIC')')
+
+
+dnl  Various possible defines passed from the Makefile that are to be tested
+dnl  with ifdef() rather than be expanded.
+
+m4_not_for_expansion(`PIC')
+m4_not_for_expansion(`DLL_EXPORT')
+
+dnl  aors_n
+m4_not_for_expansion(`OPERATION_add_n')
+m4_not_for_expansion(`OPERATION_sub_n')
+
+dnl  aors_err1_n
+m4_not_for_expansion(`OPERATION_add_err1_n')
+m4_not_for_expansion(`OPERATION_sub_err1_n')
+
+dnl  aors_err2_n
+m4_not_for_expansion(`OPERATION_add_err2_n')
+m4_not_for_expansion(`OPERATION_sub_err2_n')
+
+dnl  aors_err3_n
+m4_not_for_expansion(`OPERATION_add_err3_n')
+m4_not_for_expansion(`OPERATION_sub_err3_n')
+
+dnl  aorsmul_1
+m4_not_for_expansion(`OPERATION_addmul_1')
+m4_not_for_expansion(`OPERATION_submul_1')
+
+dnl  logops_n
+m4_not_for_expansion(`OPERATION_and_n')
+m4_not_for_expansion(`OPERATION_andn_n')
+m4_not_for_expansion(`OPERATION_nand_n')
+m4_not_for_expansion(`OPERATION_ior_n')
+m4_not_for_expansion(`OPERATION_iorn_n')
+m4_not_for_expansion(`OPERATION_nior_n')
+m4_not_for_expansion(`OPERATION_xor_n')
+m4_not_for_expansion(`OPERATION_xnor_n')
+
+dnl  popham
+m4_not_for_expansion(`OPERATION_popcount')
+m4_not_for_expansion(`OPERATION_hamdist')
+
+dnl  lorrshift
+m4_not_for_expansion(`OPERATION_lshift')
+m4_not_for_expansion(`OPERATION_rshift')
+
+dnl  aorslsh1_n
+m4_not_for_expansion(`OPERATION_addlsh1_n')
+m4_not_for_expansion(`OPERATION_sublsh1_n')
+m4_not_for_expansion(`OPERATION_rsblsh1_n')
+
+dnl  aorslsh2_n
+m4_not_for_expansion(`OPERATION_addlsh2_n')
+m4_not_for_expansion(`OPERATION_sublsh2_n')
+m4_not_for_expansion(`OPERATION_rsblsh2_n')
+
+dnl  rsh1aors_n
+m4_not_for_expansion(`OPERATION_rsh1add_n')
+m4_not_for_expansion(`OPERATION_rsh1sub_n')
+
+
+dnl  Usage: m4_config_gmp_mparam(`symbol')
+dnl
+dnl  Check that `symbol' is defined.  If it isn't, issue an error and
+dnl  terminate immediately.  The error message explains that the symbol
+dnl  should be in config.m4, copied from gmp-mparam.h.
+dnl
+dnl  Termination is immediate since a missing SQR_TOOM2_THRESHOLD, say, can
+dnl  lead to infinite loops and endless error messages.
+
+define(m4_config_gmp_mparam,
+m4_assert_numargs(1)
+`ifdef(`$1',,
+`m4_error(`$1 is not defined.
+	"configure" should have extracted this from gmp-mparam.h and put it
+	in config.m4 (or in <cpu>_<file>.asm for a fat binary), but somehow
+	this has failed.
+')m4exit(1)')')
+
+
+dnl  Usage: defreg(name,reg)
+dnl
+dnl  Give a name to a $ style register.  For example,
+dnl
+dnl         defreg(foo,$12)
+dnl
+dnl  defreg() inserts an extra pair of quotes after the $ so that it's not
+dnl  interpreted as an m4 macro parameter, ie. foo is actually $`'12.  m4
+dnl  strips those quotes when foo is expanded.
+dnl
+dnl  deflit() is used to make the new definition, so it will expand
+dnl  literally even if followed by parentheses ie. foo(99) will become
+dnl  $12(99).  (But there's nowhere that would be used is there?)
+dnl
+dnl  When making further definitions from existing defreg() macros, remember
+dnl  to use defreg() again to protect the $ in the new definitions too.  For
+dnl  example,
+dnl
+dnl         defreg(a0,$4)
+dnl         defreg(a1,$5)
+dnl         ...
+dnl
+dnl         defreg(PARAM_DST,a0)
+dnl
+dnl  This is only because a0 is expanding at the time the PARAM_DST
+dnl  definition is made, leaving a literal $4 that must be re-quoted.  On
+dnl  the other hand in something like the following ra is only expanded when
+dnl  ret is used and its $`'31 protection will have its desired effect at
+dnl  that time.
+dnl
+dnl         defreg(ra,$31)
+dnl         ...
+dnl         define(ret,`j ra')
+dnl
+dnl  Note that only $n forms are meant to be used here, and something like
+dnl  128($30) doesn't get protected and will come out wrong.
+
+define(defreg,
+m4_assert_numargs(2)
+`deflit(`$1',
+substr(`$2',0,1)``''substr(`$2',1))')
+
+
+dnl  Usage: m4_instruction_wrapper()
+dnl
+dnl  Put this, unquoted, on a line on its own, at the start of a macro
+dnl  that's a wrapper around an assembler instruction.  It adds code to give
+dnl  a descriptive error message if the macro is invoked without arguments.
+dnl
+dnl  For example, suppose jmp needs to be wrapped,
+dnl
+dnl         define(jmp,
+dnl         m4_instruction_wrapper()
+dnl         m4_assert_numargs(1)
+dnl                 `.byte 0x42
+dnl                 .long  $1
+dnl                 nop')
+dnl
+dnl  The point of m4_instruction_wrapper is to get a better error message
+dnl  than m4_assert_numargs would give if jmp is accidentally used as plain
+dnl  "jmp foo" instead of the intended "jmp( foo)".  "jmp()" with no
+dnl  argument also provokes the error message.
+dnl
+dnl  m4_instruction_wrapper should only be used with wrapped instructions
+dnl  that take arguments, since obviously something meant to be used as say
+dnl  plain "ret" doesn't want to give an error when used that way.
+
+define(m4_instruction_wrapper,
+m4_assert_numargs(0)
+``m4_instruction_wrapper_internal'(m4_doublequote($`'0),dnl
+ifdef(`__file__',`m4_doublequote(__file__)',``the m4 sources''),dnl
+$`#',m4_doublequote($`'1))`dnl'')
+
+dnl  Called: m4_instruction_wrapper_internal($0,`filename',$#,$1)
+define(m4_instruction_wrapper_internal,
+`ifelse(eval($3<=1 && m4_length(`$4')==0),1,
+`m4_error(`$1 is a macro replacing that instruction and needs arguments, see $2 for details
+')')')
+
+
+dnl  Usage: m4_cpu_hex_constant(string)
+dnl
+dnl  Expand to the string prefixed by a suitable `0x' hex marker.  This
+dnl  should be redefined as necessary for CPUs with different conventions.
+
+define(m4_cpu_hex_constant,
+m4_assert_numargs(1)
+`0x`$1'')
+
+
+dnl  Usage: UNROLL_LOG2, UNROLL_MASK, UNROLL_BYTES
+dnl         CHUNK_LOG2, CHUNK_MASK, CHUNK_BYTES
+dnl
+dnl  When code supports a variable amount of loop unrolling, the convention
+dnl  is to define UNROLL_COUNT to the number of limbs processed per loop.
+dnl  When testing code this can be varied to see how much the loop overhead
+dnl  is costing.  For example,
+dnl
+dnl         deflit(UNROLL_COUNT, 32)
+dnl
+dnl  If the forloop() generating the unrolled loop has a pattern processing
+dnl  more than one limb, the convention is to express this with CHUNK_COUNT.
+dnl  For example,
+dnl
+dnl         deflit(CHUNK_COUNT, 2)
+dnl
+dnl  The LOG2, MASK and BYTES definitions below are derived from these COUNT
+dnl  definitions.  If COUNT is redefined, the LOG2, MASK and BYTES follow
+dnl  the new definition automatically.
+dnl
+dnl  LOG2 is the log base 2 of COUNT.  MASK is COUNT-1, which can be used as
+dnl  a bit mask.  BYTES is GMP_LIMB_BYTES*COUNT, the number of bytes
+dnl  processed in each unrolled loop.
+dnl
+dnl  GMP_LIMB_BYTES is defined in a CPU specific m4 include file.  It
+dnl  exists only so the BYTES definitions here can be common to all CPUs.
+dnl  In the actual code for a given CPU, an explicit 4 or 8 may as well be
+dnl  used because the code is only for a particular CPU; it doesn't need to
+dnl  be general.
+dnl
+dnl  Note that none of these macros do anything except give conventional
+dnl  names to commonly used things.  You still have to write your own
+dnl  expressions for a forloop() and the resulting address displacements.
+dnl  Something like the following would be typical for 4 bytes per limb.
+dnl
+dnl         forloop(`i',0,UNROLL_COUNT-1,`
+dnl                 deflit(`disp',eval(i*4))
+dnl                 ...
+dnl         ')
+dnl
+dnl  Or when using CHUNK_COUNT,
+dnl
+dnl         forloop(`i',0,UNROLL_COUNT/CHUNK_COUNT-1,`
+dnl                 deflit(`disp0',eval(i*CHUNK_COUNT*4))
+dnl                 deflit(`disp1',eval(disp0+4))
+dnl                 ...
+dnl         ')
+dnl
+dnl  Clearly `i' can be run starting from 1, or from high to low or whatever
+dnl  best suits.
+
+deflit(UNROLL_LOG2,
+m4_assert_defined(`UNROLL_COUNT')
+`m4_log2(UNROLL_COUNT)')
+
+deflit(UNROLL_MASK,
+m4_assert_defined(`UNROLL_COUNT')
+`eval(UNROLL_COUNT-1)')
+
+deflit(UNROLL_BYTES,
+m4_assert_defined(`UNROLL_COUNT')
+m4_assert_defined(`GMP_LIMB_BYTES')
+`eval(UNROLL_COUNT * GMP_LIMB_BYTES)')
+
+deflit(CHUNK_LOG2,
+m4_assert_defined(`CHUNK_COUNT')
+`m4_log2(CHUNK_COUNT)')
+
+deflit(CHUNK_MASK,
+m4_assert_defined(`CHUNK_COUNT')
+`eval(CHUNK_COUNT-1)')
+
+deflit(CHUNK_BYTES,
+m4_assert_defined(`CHUNK_COUNT')
+m4_assert_defined(`GMP_LIMB_BYTES')
+`eval(CHUNK_COUNT * GMP_LIMB_BYTES)')
+
+
+dnl  Usage: MPN(name)
+dnl
+dnl  Add MPN_PREFIX to a name.
+dnl  MPN_PREFIX defaults to "__gmpn_" if not defined.
+dnl
+dnl  m4_unquote is used in MPN so that when it expands to say __gmpn_foo,
+dnl  that identifier will be subject to further macro expansion.  This is
+dnl  used by some of the fat binary support for renaming symbols.
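+dnl
+dnl  For example, with the default prefix, MPN(`add_n') gives __gmpn_add_n.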
+
+ifdef(`MPN_PREFIX',,
+`define(`MPN_PREFIX',`__gmpn_')')
+
+define(MPN,
+m4_assert_numargs(1)
+`m4_unquote(MPN_PREFIX`'$1)')
+
+
+dnl  Usage: mpn_add_n, etc
+dnl
+dnl  Convenience definitions using MPN(), like the #defines in gmp.h.  Each
+dnl  function that might be implemented in assembler is here.
+
+define(define_mpn,
+m4_assert_numargs(1)
+`deflit(`mpn_$1',`MPN(`$1')')')
+
+define_mpn(add)
+define_mpn(add_1)
+define_mpn(add_err1_n)
+define_mpn(add_err2_n)
+define_mpn(add_err3_n)
+define_mpn(add_n)
+define_mpn(add_nc)
+define_mpn(addlsh1_n)
+define_mpn(addlsh1_nc)
+define_mpn(addlsh2_n)
+define_mpn(addlsh2_nc)
+define_mpn(addlsh_n)
+define_mpn(addlsh_nc)
+define_mpn(addlsh1_n_ip1)
+define_mpn(addlsh1_nc_ip1)
+define_mpn(addlsh2_n_ip1)
+define_mpn(addlsh2_nc_ip1)
+define_mpn(addlsh_n_ip1)
+define_mpn(addlsh_nc_ip1)
+define_mpn(addlsh1_n_ip2)
+define_mpn(addlsh1_nc_ip2)
+define_mpn(addlsh2_n_ip2)
+define_mpn(addlsh2_nc_ip2)
+define_mpn(addlsh_n_ip2)
+define_mpn(addlsh_nc_ip2)
+define_mpn(addmul_1)
+define_mpn(addmul_1c)
+define_mpn(addmul_2)
+define_mpn(addmul_3)
+define_mpn(addmul_4)
+define_mpn(addmul_5)
+define_mpn(addmul_6)
+define_mpn(addmul_7)
+define_mpn(addmul_8)
+define_mpn(addmul_2s)
+define_mpn(add_n_sub_n)
+define_mpn(add_n_sub_nc)
+define_mpn(addaddmul_1msb0)
+define_mpn(and_n)
+define_mpn(andn_n)
+define_mpn(bdiv_q_1)
+define_mpn(pi1_bdiv_q_1)
+define_mpn(bdiv_dbm1c)
+define_mpn(cmp)
+define_mpn(cnd_add_n)
+define_mpn(cnd_sub_n)
+define_mpn(com)
+define_mpn(copyd)
+define_mpn(copyi)
+define_mpn(count_leading_zeros)
+define_mpn(count_trailing_zeros)
+define_mpn(div_qr_1n_pi1)
+define_mpn(div_qr_2)
+define_mpn(div_qr_2n_pi1)
+define_mpn(div_qr_2u_pi1)
+define_mpn(div_qr_2n_pi2)
+define_mpn(div_qr_2u_pi2)
+define_mpn(divexact_1)
+define_mpn(divexact_by3c)
+define_mpn(divrem)
+define_mpn(divrem_1)
+define_mpn(divrem_1c)
+define_mpn(divrem_2)
+define_mpn(divrem_classic)
+define_mpn(divrem_newton)
+define_mpn(dump)
+define_mpn(gcd)
+define_mpn(gcd_1)
+define_mpn(gcd_11)
+define_mpn(gcd_22)
+define_mpn(gcdext)
+define_mpn(get_str)
+define_mpn(hamdist)
+define_mpn(invert_limb)
+define_mpn(invert_limb_table)
+define_mpn(ior_n)
+define_mpn(iorn_n)
+define_mpn(lshift)
+define_mpn(lshiftc)
+define_mpn(mod_1_1p)
+define_mpn(mod_1_1p_cps)
+define_mpn(mod_1s_2p)
+define_mpn(mod_1s_2p_cps)
+define_mpn(mod_1s_3p)
+define_mpn(mod_1s_3p_cps)
+define_mpn(mod_1s_4p)
+define_mpn(mod_1s_4p_cps)
+define_mpn(mod_1)
+define_mpn(mod_1c)
+define_mpn(mod_34lsub1)
+define_mpn(modexact_1_odd)
+define_mpn(modexact_1c_odd)
+define_mpn(mul)
+define_mpn(mul_1)
+define_mpn(mul_1c)
+define_mpn(mul_2)
+define_mpn(mul_3)
+define_mpn(mul_4)
+define_mpn(mul_5)
+define_mpn(mul_6)
+define_mpn(mul_basecase)
+define_mpn(mul_n)
+define_mpn(mullo_basecase)
+define_mpn(mulmid_basecase)
+define_mpn(perfect_square_p)
+define_mpn(popcount)
+define_mpn(preinv_divrem_1)
+define_mpn(preinv_mod_1)
+define_mpn(nand_n)
+define_mpn(neg)
+define_mpn(nior_n)
+define_mpn(powm)
+define_mpn(powlo)
+define_mpn(random)
+define_mpn(random2)
+define_mpn(redc_1)
+define_mpn(redc_2)
+define_mpn(rsblsh1_n)
+define_mpn(rsblsh1_nc)
+define_mpn(rsblsh2_n)
+define_mpn(rsblsh2_nc)
+define_mpn(rsblsh_n)
+define_mpn(rsblsh_nc)
+define_mpn(rsh1add_n)
+define_mpn(rsh1add_nc)
+define_mpn(rsh1sub_n)
+define_mpn(rsh1sub_nc)
+define_mpn(rshift)
+define_mpn(rshiftc)
+define_mpn(sbpi1_bdiv_q)
+define_mpn(sbpi1_bdiv_qr)
+define_mpn(sbpi1_bdiv_r)
+define_mpn(scan0)
+define_mpn(scan1)
+define_mpn(set_str)
+define_mpn(sqr_basecase)
+define_mpn(sqr_diagonal)
+define_mpn(sqr_diag_addlsh1)
+define_mpn(sublsh1_n)
+define_mpn(sublsh1_nc)
+define_mpn(sublsh1_n_ip1)
+define_mpn(sublsh1_nc_ip1)
+define_mpn(sublsh2_n)
+define_mpn(sublsh2_nc)
+define_mpn(sublsh2_n_ip1)
+define_mpn(sublsh2_nc_ip1)
+define_mpn(sublsh_n)
+define_mpn(sublsh_nc)
+define_mpn(sublsh_n_ip1)
+define_mpn(sublsh_nc_ip1)
+define_mpn(sqrtrem)
+define_mpn(sub)
+define_mpn(sub_1)
+define_mpn(sub_err1_n)
+define_mpn(sub_err2_n)
+define_mpn(sub_err3_n)
+define_mpn(sub_n)
+define_mpn(sub_nc)
+define_mpn(submul_1)
+define_mpn(submul_1c)
+define_mpn(sec_tabselect)
+define_mpn(umul_ppmm)
+define_mpn(umul_ppmm_r)
+define_mpn(udiv_qrnnd)
+define_mpn(udiv_qrnnd_r)
+define_mpn(xnor_n)
+define_mpn(xor_n)
+
+
+dnl  Defines for C global arrays and variables, with names matching what's
+dnl  used in the C code.
+dnl
+dnl  Notice that GSYM_PREFIX is included, unlike with the function defines
+dnl  above.  Also, "deflit" is used so that something like __clz_tab(%ebx)
+dnl  comes out as __gmpn_clz_tab(%ebx), for the benefit of CPUs with that
+dnl  style assembler syntax.
+
+deflit(__clz_tab,
+m4_assert_defined(`GSYM_PREFIX')
+`GSYM_PREFIX`'MPN(`clz_tab')')
+
+deflit(binvert_limb_table,
+m4_assert_defined(`GSYM_PREFIX')
+`GSYM_PREFIX`'__gmp_binvert_limb_table')
+
+
+dnl  Usage: ASM_START()
+dnl
+dnl  Emit any directives needed once at the start of an assembler file, like
+dnl  ".set noreorder" or whatever.  The default for this is nothing, but
+dnl  it's redefined by CPU specific m4 files.
+
+define(ASM_START)
+
+
+dnl  Usage: ASM_END()
+dnl
+dnl  Emit any directives needed once at the end of an assembler file.  The
+dnl  default for this is nothing, but it's redefined by CPU specific m4 files.
+
+define(ASM_END)
+
+
+dnl  Usage: PROLOGUE(foo[,param])
+dnl         EPILOGUE(foo)
+dnl
+dnl  Emit directives to start or end a function.  GSYM_PREFIX is added by
+dnl  these macros if necessary, so the given "foo" is what the function will
+dnl  be called in C.
+dnl
+dnl  The second parameter to PROLOGUE is used only for some CPUs and should
+dnl  be omitted if not required.
+dnl
+dnl  Nested or overlapping PROLOGUE/EPILOGUE pairs are allowed, if that
+dnl  makes sense for the system.  The name given to EPILOGUE must be a
+dnl  currently open PROLOGUE.
+dnl
+dnl  If only one PROLOGUE is open then the name can be omitted from
+dnl  EPILOGUE.  This is encouraged, since it means the name only has to
+dnl  appear in one place, not two.
+dnl
+dnl  The given name "foo" is not fully quoted here, it will be macro
+dnl  expanded more than once.  This is the way the m4_list macros work, and
+dnl  it also helps the tune/many.pl program do a renaming like
+dnl  -D__gmpn_add_n=mpn_add_n_foo when GSYM_PREFIX is not empty.
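+dnl
+dnl  A typical use is simply
+dnl
+dnl         PROLOGUE(mpn_add_n)
+dnl                 ...
+dnl         EPILOGUE()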
+
+define(PROLOGUE,
+m4_assert_numargs_range(1,2)
+`m4_file_seen()dnl
+define(`PROLOGUE_list',m4_list_quote($1,PROLOGUE_list))dnl
+ifelse(`$2',,
+`PROLOGUE_cpu(GSYM_PREFIX`'$1)',
+`PROLOGUE_cpu(GSYM_PREFIX`'$1,`$2')')')
+
+define(EPILOGUE,
+m4_assert_numargs_range(0,1)
+`ifelse(`$1',,
+`ifelse(m4_list_count(PROLOGUE_list),0,
+`m4_error(`no open functions for EPILOGUE
+')',
+`ifelse(m4_list_count(PROLOGUE_list),1,
+`EPILOGUE_internal(PROLOGUE_current_function)',
+`m4_error(`more than one open function for EPILOGUE
+')')')',
+`EPILOGUE_internal(`$1')')')
+
+define(EPILOGUE_internal,
+m4_assert_numargs(1)
+m4_assert_defined(`EPILOGUE_cpu')
+`ifelse(m4_list_find($1,PROLOGUE_list),0,
+`m4_error(`EPILOGUE without PROLOGUE: $1
+')')dnl
+define(`PROLOGUE_list',m4_list_quote(m4_list_remove($1,PROLOGUE_list)))dnl
+EPILOGUE_cpu(GSYM_PREFIX`$1')')
+
+dnl  Currently open PROLOGUEs, as a comma-separated list.
+define(PROLOGUE_list)
+
+
+dnl  Called: PROLOGUE_check(list,...)
+dnl  Check there are no remaining open PROLOGUEs at the end of input.
+define(PROLOGUE_check,
+`ifelse($1,,,
+`m4_error(`no EPILOGUE for: $1
+')dnl
+PROLOGUE_check(shift($@))')')
+
+m4wrap_prepend(`PROLOGUE_check(PROLOGUE_list)')
+
+
+dnl  Usage: PROLOGUE_current_function
+dnl
+dnl  This macro expands to the current PROLOGUE/EPILOGUE function, or the
+dnl  most recent PROLOGUE if such pairs are nested or overlapped.
+
+define(PROLOGUE_current_function,
+m4_assert_numargs(-1)
+`m4_list_first(PROLOGUE_list)')
+
+
+dnl  Usage: PROLOGUE_cpu(GSYM_PREFIX`'foo[,param])
+dnl         EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  These macros hold the CPU-specific parts of PROLOGUE and EPILOGUE.
+dnl  Both are called with the function name, with GSYM_PREFIX already
+dnl  prepended.
+dnl
+dnl  The definitions here are something typical and sensible, but CPU or
+dnl  system specific m4 files should redefine them as necessary.  The
+dnl  optional extra parameter to PROLOGUE_cpu is not expected and not
+dnl  accepted here.
+
+define(PROLOGUE_cpu,
+m4_assert_numargs(1)
+`	TEXT
+	ALIGN(8)
+	GLOBL	`$1' GLOBL_ATTR
+	TYPE(`$1',`function')
+`$1'LABEL_SUFFIX')
+
+define(EPILOGUE_cpu,
+`	SIZE(`$1',.-`$1')')
+
+
+dnl  Usage: L(name)
+dnl
+dnl  Generate a local label with the given name.  This is simply a
+dnl  convenient way to add LSYM_PREFIX.
+dnl
+dnl  LSYM_PREFIX might be L$, so defn() must be used to quote it or the L
+dnl  will expand again as the L macro, making an infinite recursion.
+
+define(`L',
+m4_assert_numargs(1)
+`defn(`LSYM_PREFIX')$1')
+
+
+dnl  Usage: LDEF(name)
+dnl
+dnl  Generate a directive to define a local label.
+dnl
+dnl  On systems with a fixed syntax for defining labels there's no need to
+dnl  use this macro, it's only meant for systems where the syntax varies,
+dnl  like hppa which is "L(foo):" with gas, but just "L(foo)" in column 0
+dnl  with the system `as'.
+dnl
+dnl  The extra `' after LABEL_SUFFIX avoids any chance of a following
+dnl  "(...)"  being interpreted as an argument list.  Not that it'd be
+dnl  sensible to write anything like that after an LDEF(), but just in case.
+
+define(LDEF,
+m4_assert_numargs(1)
+m4_assert_defined(`LABEL_SUFFIX')
+`L(`$1')`'LABEL_SUFFIX`'')
+
+
+dnl  Usage: INT32(label,value)
+dnl         INT64(label,first,second)
+
+define(`INT32',
+m4_assert_defined(`W32')
+`	ALIGN(4)
+LDEF(`$1')
+	W32	$2')
+
+define(`INT64',
+m4_assert_defined(`W32')
+`	ALIGN(8)
+LDEF(`$1')
+	W32	$2
+	W32	$3')
+
+
+dnl  Usage: ALIGN(bytes)
+dnl
+dnl  Emit a ".align" directive.  The alignment is specified in bytes, and
+dnl  will normally need to be a power of 2.  The actual ".align" generated
+dnl  is either bytes or logarithmic according to what ./configure finds the
+dnl  assembler needs.
+dnl
+dnl  If ALIGN_FILL_0x90 is defined and equal to "yes", then ", 0x90" is
+dnl  appended.  This is for x86, see mpn/x86/README.
+
+define(ALIGN,
+m4_assert_numargs(1)
+m4_assert_defined(`ALIGN_LOGARITHMIC')
+`.align	ifelse(ALIGN_LOGARITHMIC,yes,`m4_log2($1)',`eval($1)')dnl
+ifelse(ALIGN_FILL_0x90,yes,`, 0x90')')
+
+
+dnl  Usage: MULFUNC_PROLOGUE(function function...)
+dnl
+dnl  A dummy macro which is grepped for by ./configure to know what
+dnl  functions a multi-function file is providing.  Use this if there aren't
+dnl  explicit PROLOGUE()s for each possible function.
+dnl
+dnl  Multiple MULFUNC_PROLOGUEs can be used, or just one with the function
+dnl  names separated by spaces.
+
+define(`MULFUNC_PROLOGUE',
+m4_assert_numargs(1)
+)
+
+
+dnl  Usage: NAILS_SUPPORT(spec spec ...)
+dnl
+dnl  A dummy macro which is grepped for by ./configure to know what nails
+dnl  are supported in an asm file.
+dnl
+dnl  Ranges can be given, or just individual values.  Multiple values or
+dnl  ranges can be given, separated by spaces.  Multiple NAILS_SUPPORT
+dnl  declarations work too.  Some examples,
+dnl
+dnl         NAILS_SUPPORT(1-20)
+dnl         NAILS_SUPPORT(1 6 9-12)
+dnl         NAILS_SUPPORT(1-10 16-20)
+
+define(NAILS_SUPPORT,
+m4_assert_numargs(1)
+)
+
+
+dnl  Usage: ABI_SUPPORT(abi)
+dnl
+dnl  A dummy macro which is grepped for by ./configure to know what ABIs
+dnl  are supported in an asm file.
+dnl
+dnl  If multiple non-standard ABIs are supported, several ABI_SUPPORT
+dnl  declarations should be used:
+dnl
+dnl         ABI_SUPPORT(FOOABI)
+dnl         ABI_SUPPORT(BARABI)
+
+define(ABI_SUPPORT,
+m4_assert_numargs(1)
+)
+
+
+dnl  Usage: GMP_NUMB_MASK
+dnl
+dnl  A bit mask for the number part of a limb.  Eg. with 6 bit nails in a
+dnl  32 bit limb, GMP_NUMB_MASK would be 0x3ffffff.
+
+define(GMP_NUMB_MASK,
+m4_assert_numargs(-1)
+m4_assert_defined(`GMP_NUMB_BITS')
+`m4_hex_lowmask(GMP_NUMB_BITS)')
+
+
+dnl  Usage: m4append(`variable',`value-to-append')
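+dnl
+dnl  For example, after define(`acc',`x'), a call m4append(`acc',`y')
+dnl  leaves acc expanding to xy.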
+
+define(`m4append',
+`define(`$1',  defn(`$1')`$2')
+'
+)
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/cpp-ccas b/third_party/gmp/mpn/cpp-ccas
new file mode 100755
index 0000000..25f7cdc
--- /dev/null
+++ b/third_party/gmp/mpn/cpp-ccas
@@ -0,0 +1,118 @@
+#!/bin/sh
+#
+# A helper script for Makeasm.am .S.lo rule.
+
+# Copyright 2001 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cpp-ccas --cpp=CPP CC ... file.S ...
+#
+# Process file.S with the given CPP command plus any -D options in the
+# rest of the arguments, then assemble with the given CC plus all
+# arguments.
+#
+# The CPP command must be in a single --cpp= argument, and will be
+# split on whitespace.  It should include any -I options required.
+#
+# When CC is invoked, file.S is replaced with a temporary .s file
+# which is the CPP output.
+#
+# Any lines starting with "#" are removed from the CPP output; usually
+# these will be #line and #file markers from CPP, but they might also
+# be comments from the .S.
+#
+# To allow parallel builds, the temp file name is based on the .S file
+# name, which will be the output object filename for all uses we put
+# this script to.
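+#
+# For example (a hypothetical direct invocation),
+#
+#   ./cpp-ccas --cpp="cc -E -I.." cc -O -c -DOPERATION_add_n add_n.S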
+
+CPP=
+CPPDEFS=
+CC=
+S=
+SEEN_O=no
+
+for i in "$@"; do
+  case $i in
+    --cpp=*)
+      CPP=`echo "$i" | sed 's/^--cpp=//'`
+      ;;
+    -D*)
+      CPPDEFS="$CPPDEFS $i"
+      CC="$CC $i"
+      ;;
+    *.S)
+      if test -n "$S"; then
+        echo "Only one .S file permitted"
+        exit 1
+      fi
+      BASENAME=`echo "$i" | sed -e 's/\.S$//' -e 's/^.*[\\/:]//'`
+      S=$i
+      TMP_I=tmp-$BASENAME.i
+      TMP_S=tmp-$BASENAME.s
+      CC="$CC $TMP_S"
+      ;;
+    -o)
+      SEEN_O=yes
+      CC="$CC $i"
+      ;;
+    *)
+      CC="$CC $i"
+      ;;
+  esac
+done
+
+if test -z "$CPP"; then
+  echo "No --cpp specified"
+  exit 1
+fi
+
+if test -z "$S"; then
+  echo "No .S specified"
+  exit 1
+fi
+
+# Libtool adds its own -o when sending output to .libs/foo.o, but not
+# when just wanting foo.o in the current directory.  We need an
+# explicit -o in both cases since we're assembling tmp-foo.s.
+#
+if test $SEEN_O = no; then
+  CC="$CC -o $BASENAME.o"
+fi
+
+echo "$CPP $CPPDEFS $S >$TMP_I"
+$CPP $CPPDEFS $S >$TMP_I || exit
+
+echo "grep -v '^#' $TMP_I >$TMP_S"
+grep -v '^#' $TMP_I >$TMP_S
+
+echo "$CC"
+$CC || exit
+
+# Comment this out to preserve the .i and .s intermediates
+rm -f $TMP_I $TMP_S
diff --git a/third_party/gmp/mpn/cray/README b/third_party/gmp/mpn/cray/README
new file mode 100644
index 0000000..3a347d2
--- /dev/null
+++ b/third_party/gmp/mpn/cray/README
@@ -0,0 +1,121 @@
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+
+The code in this directory works for Cray vector systems such as C90,
+J90, T90 (both the CFP variant and the IEEE variant) and SV1.  (For
+the T3E and T3D systems, see the `alpha' subdirectory at the same
+level as the directory containing this file.)
+
+The cfp subdirectory is for systems utilizing the traditional Cray
+floating-point format, and the ieee subdirectory is for the newer
+systems that use the IEEE floating-point format.
+
+There are several issues that reduce speed on Cray systems.  For
+systems with cfp floating point, the main obstacle is the forming of
+128-bit products.  For IEEE systems, adding, and in particular
+computing the carry, is the main issue.  There are no vectorizing
+unsigned-less-than instructions, and the sequence that implements that
+operation is very long.
+
+Shifting is the only operation that is simple to make fast.  All Cray
+systems have bitblt instructions (Vi Vj,Vj<Ak and Vi Vj,Vj>Ak) that
+should be really useful.
+
+For best speed for cfp systems, we need a mul_basecase, since that
+reduces the need for carry propagation to a minimum.  Depending on the
+size (vn) of the smaller of the two operands (V), we should split U and V
+in different chunk sizes:
+
+U split in 2 32-bit parts
+V split according to the table:
+parts			4	5	6	7	8
+bits/part		16	13	11	10	8
+max allowed vn		1	8	32	64	256
+number of multiplies	8	10	12	14	16
+peak cycles/limb	4	5	6	7	8
+
+U split in 3 22-bit parts
+V split according to the table:
+parts			3	4	5
+bits/part		22	16	13
+max allowed vn		16	1024	8192
+number of multiplies	9	12	15
+peak cycles/limb	4.5	6	7.5
+
+U split in 4 16-bit parts
+V split according to the table:
+parts			4
+bits/part		16
+max allowed vn		65536
+number of multiplies	16
+peak cycles/limb	8
+
+(A T90 CPU can accumulate two products per cycle.)
+
+IDEA:
+* Rewrite mpn_add_n:
+    short cy[n + 1];
+    #pragma _CRI ivdep
+      for (i = 0; i < n; i++)
+	{ s = up[i] + vp[i];
+	  rp[i] = s;
+	  cy[i + 1] = s < up[i]; }
+      more_carries = 0;
+    #pragma _CRI ivdep
+      for (i = 1; i < n; i++)
+	{ s = rp[i] + cy[i];
+	  rp[i] = s;
+	  more_carries += s < cy[i]; }
+      cys = 0;
+      if (more_carries)
+	{
+	  cys = rp[1] < cy[1];
+	  for (i = 2; i < n; i++)
+	    { rp[i] += cys;
+	      cys = rp[i] < cys; }
+	}
+      return cys + cy[n];
+
+* Write mpn_add3_n for adding three operands.  First add operands 1
+  and 2, and generate cy[].  Then add operand 3 to the partial result,
+  and accumulate carry into cy[].  Finally propagate carry just like
+  in the new mpn_add_n.
+
+IDEA:
+
+Store fewer bits, perhaps 62, per limb.  That brings mpn_add_n time
+down to 2.5 cycles/limb and mpn_addmul_1 times to 4 cycles/limb.  By
+storing even fewer bits per limb, perhaps 56, it would be possible to
+write an mpn_mul_basecase that would run at effectively 1 cycle/limb.
+(Use VM here to better handle the rhomb-shaped multiply area, perhaps
+rounding operand sizes up to the next power of 2.)
diff --git a/third_party/gmp/mpn/cray/add_n.c b/third_party/gmp/mpn/cray/add_n.c
new file mode 100644
index 0000000..af49159
--- /dev/null
+++ b/third_party/gmp/mpn/cray/add_n.c
@@ -0,0 +1,90 @@
+/* Cray PVP mpn_add_n -- add two limb vectors and store their sum in a third
+   limb vector.
+
+Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at 4 cycles/limb.  It may be possible to bring it down
+   to 3 cycles/limb.  */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  /* Main add loop.  Generate a raw output sum in rp[] and a carry vector
+     in cy[].  */
+#pragma _CRI ivdep
+  for (i = 0; i < n; i++)
+    {
+      a = up[i];
+      b = vp[i];
+      s0 = a + b;
+      rp[i] = s0;
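+      /* Carry out of a + b: the majority function maj(a,b,~s0) of the
+	 sign bits, isolated by the shift.  */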
+      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+      cy[i] = c0;
+    }
+  /* Carry add loop.  Add the carry vector cy[] to the raw sum rp[] and
+     store the new sum back to rp[].  If this generates further carry, set
+     more_carries.  */
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r + c0;
+      rp[i] = s0;
+      c0 = (r & ~s0) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in a scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      /* Look for places where rp[k] is zero and cy[k-1] is non-zero.
+	 These are where we got a recurrence carry.  */
+      for (i = 1; i < n; i++)
+	{
+	  r = rp[i];
+	  c0 = (r == 0 && cy[i - 1] != 0);
+	  s0 = r + cyrec;
+	  rp[i] = s0;
+	  c1 = (r & ~s0) >> 63;
+	  cyrec = c0 | c1;
+	}
+      return cyrec | cy[n - 1];
+    }
+
+  return cy[n - 1];
+}
diff --git a/third_party/gmp/mpn/cray/cfp/addmul_1.c b/third_party/gmp/mpn/cray/cfp/addmul_1.c
new file mode 100644
index 0000000..9c7f383
--- /dev/null
+++ b/third_party/gmp/mpn/cray/cfp/addmul_1.c
@@ -0,0 +1,48 @@
+/* mpn_addmul_1 for Cray PVP.
+
+Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_limb_t p0[n], p1[n], tp[n];
+  mp_limb_t cy_limb;
+
+  GMPN_MULWW (p1, p0, up, &n, &limb);
+  cy_limb = mpn_add_n (tp, rp, p0, n);
+  rp[0] = tp[0];
+  if (n != 1)
+    cy_limb += mpn_add_n (rp + 1, tp + 1, p1, n - 1);
+  cy_limb += p1[n - 1];
+
+  return cy_limb;
+}
diff --git a/third_party/gmp/mpn/cray/cfp/mul_1.c b/third_party/gmp/mpn/cray/cfp/mul_1.c
new file mode 100644
index 0000000..33a6a05
--- /dev/null
+++ b/third_party/gmp/mpn/cray/cfp/mul_1.c
@@ -0,0 +1,47 @@
+/* mpn_mul_1 for Cray PVP.
+
+Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_limb_t p0[n], p1[n];
+  mp_limb_t cy_limb;
+
+  GMPN_MULWW (p1, p0, up, &n, &limb);
+  rp[0] = p0[0];
+  cy_limb = p1[n - 1];
+  if (n != 1)
+    cy_limb += mpn_add_n (rp + 1, p0 + 1, p1, n - 1);
+
+  return cy_limb;
+}
diff --git a/third_party/gmp/mpn/cray/cfp/mulwwc90.s b/third_party/gmp/mpn/cray/cfp/mulwwc90.s
new file mode 100644
index 0000000..71d2285
--- /dev/null
+++ b/third_party/gmp/mpn/cray/cfp/mulwwc90.s
@@ -0,0 +1,254 @@
+*    Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+*    Copyright 1996, 2000 Free Software Foundation, Inc.
+*    This file is generated from mulww.f in this same directory.
+
+*  This file is part of the GNU MP Library.
+*
+*  The GNU MP Library is free software; you can redistribute it and/or modify
+*  it under the terms of either:
+*
+*    * the GNU Lesser General Public License as published by the Free
+*      Software Foundation; either version 3 of the License, or (at your
+*      option) any later version.
+*
+*  or
+*
+*    * the GNU General Public License as published by the Free Software
+*      Foundation; either version 2 of the License, or (at your option) any
+*      later version.
+*
+*  or both in parallel, as here.
+*
+*  The GNU MP Library is distributed in the hope that it will be useful, but
+*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+*  for more details.
+*
+*  You should have received copies of the GNU General Public License and the
+*  GNU Lesser General Public License along with the GNU MP Library.  If not,
+*  see https://www.gnu.org/licenses/.
+
+            IDENT           GMPN_MULWW
+**********************************************
+*      Assemble with Cal Version 2.0         *
+*                                            *
+* Generated by CFT77   6.0.4.19              *
+*           on 06/27/00 at 04:34:13          *
+*                                            *
+**********************************************
+* ALLOW UNDERSCORES IN IDENTIFIERS
+            EDIT            OFF
+            FORMAT          NEW
+@DATA       SECTION         DATA,CM
+@DATA       =               W.*
+            CON             O'0000000000040000000000
+            CON             O'0435152404713723252514
+            CON             O'0535270000000000000000
+            CON             O'0000000000000001200012
+            VWD             32/0,32/P.GMPN_MULWW
+            CON             O'0014003000000000001416
+            CON             O'0000000000000000000011
+            CON             O'0000000000000000000215
+            BSSZ            1
+@CODE       SECTION         CODE
+@CODE       =               P.*
+L3          =               P.*
+            A0              A6
+            A5              6
+            B03,A5          0,A0
+            A0              A1+A2
+            A5              1
+            0,A0            T00,A5
+            B02             A2
+            B66             A3
+            B01             A6
+            A7              P.L4
+            B00             A7
+            A6              @DATA
+            J               $STKOFEN
+GMPN_MULWW  =               P.*
+            A0              @DATA+3
+            B77             A0
+            A1              13
+            A0              B66
+            A2              B66
+            A4              B67
+            0,A0            B77,A1
+            A7              782
+            A3              A2+A7
+            A0              A4-A3
+            JAM             L3
+            A0              A6
+            A5              6
+            B03,A5          0,A0
+            A0              A1+A2
+            A5              1
+            0,A0            T00,A5
+            B02             A2
+            B66             A3
+            B01             A6
+L4          =               P.*
+            A7              B07
+            S7              0,A7
+            A6              B10
+            S6              0,A6
+            S5              1
+            S4              <22
+            S7              S7-S5
+            S5              #S7
+            T00             S6
+            S6              S6>22
+            S7              T00
+            S7              S7>44
+            S3              T00
+            S3              S3&S4
+            S6              S6&S4
+            S7              S7&S4
+            S3              S3<24
+            S6              S6<24
+            S7              S7<24
+            S0              S5
+            S4              S5
+            S1              S6
+            S2              S3
+            S3              S7
+            JSP             L5
+L6          =               P.*
+            S7              -S4
+            A2              S7
+            VL              A2
+            A3              B06
+            A5              B05
+            A4              B04
+            A1              VL
+            A2              S4
+L7          =               P.*
+            A0              A3
+            VL              A1
+            V7              ,A0,1
+            B11             A5
+            A7              22
+            B12             A4
+            V6              V7>A7
+            B13             A3
+            S7              <22
+            A3              B02
+            V5              S7&V6
+            A6              24
+            V4              V5<A6
+            V3              S1*FV4
+            V2              S7&V7
+            V1              V2<A6
+            V0              S3*FV1
+            V6              V0+V3
+            A5              44
+            V5              V7>A5
+            V2              S1*FV1
+            V3              S7&V5
+            A0              14
+            B77             A0
+            A4              B77
+            A0              A4+A3
+            ,A0,1           V2
+            V0              V3<A6
+            V7              S2*FV1
+            A4              142
+            A0              A4+A3
+            ,A0,1           V7
+            V5              V7>A7
+            V2              S2*FV0
+            V3              V6+V2
+            S7              <20
+            V1              S7&V3
+            A4              270
+            A0              A4+A3
+            ,A0,1           V0
+            A4              14
+            A0              A4+A3
+            V7              ,A0,1
+            V6              V1<A7
+            V2              S2*FV4
+            V0              V7+V2
+            S7              <42
+            V1              S7&V0
+            A4              398
+            A0              A4+A3
+            ,A0,1           V0
+            V7              S3*FV4
+            V2              V5+V1
+            V0              V3<A5
+            A5              526
+            A0              A5+A3
+            ,A0,1           V0
+            A5              270
+            A0              A5+A3
+            V4              ,A0,1
+            V5              V2+V6
+            A5              20
+            V1              V3>A5
+            V0              S1*FV4
+            A5              654
+            A0              A5+A3
+            ,A0,1           V1
+            V6              V7+V0
+            A5              2
+            V2              V6<A5
+            V3              S3*FV4
+            A5              142
+            A0              A5+A3
+            V1              ,A0,1
+            A5              526
+            A0              A5+A3
+            V7              ,A0,1
+            V0              V1+V7
+            V6              V3<A6
+            V4              V6+V2
+            A6              42
+            V7              V5>A6
+            A5              654
+            CPW
+            A0              A5+A3
+            V1              ,A0,1
+            A5              398
+            A0              A5+A3
+            V3              ,A0,1
+            V6              V4+V1
+            V2              V3>A6
+            V5              V6+V2
+            A6              B12
+            V4              V3<A7
+            A7              B13
+            A3              A7+A1
+            A7              B11
+            A5              A7+A1
+            A4              A6+A1
+            A7              A2+A1
+            A0              A2+A1
+            A2              128
+            B13             A0
+            V1              V0+V4
+            A0              B11
+            ,A0,1           V1
+            V6              V5+V7
+            A0              A6
+            ,A0,1           V6
+            A0              B13
+            A1              A2
+            A2              A7
+            JAN             L7
+L8          =               P.*
+L5          =               P.*
+            S1              0
+            A0              B02
+            A2              B02
+            A1              13
+            B66             A0
+            B77,A1          0,A0
+            A0              A2+A1
+            A1              1
+            T00,A1          0,A0
+            J               B00
+            EXT             $STKOFEN:p
+            ENTRY           GMPN_MULWW
+            END
diff --git a/third_party/gmp/mpn/cray/cfp/mulwwj90.s b/third_party/gmp/mpn/cray/cfp/mulwwj90.s
new file mode 100644
index 0000000..1c2c7cd
--- /dev/null
+++ b/third_party/gmp/mpn/cray/cfp/mulwwj90.s
@@ -0,0 +1,253 @@
+*    Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+*    Copyright 1996, 2000 Free Software Foundation, Inc.
+*    This file is generated from mulww.f in this same directory.
+
+*  This file is part of the GNU MP Library.
+*
+*  The GNU MP Library is free software; you can redistribute it and/or modify
+*  it under the terms of either:
+*
+*    * the GNU Lesser General Public License as published by the Free
+*      Software Foundation; either version 3 of the License, or (at your
+*      option) any later version.
+*
+*  or
+*
+*    * the GNU General Public License as published by the Free Software
+*      Foundation; either version 2 of the License, or (at your option) any
+*      later version.
+*
+*  or both in parallel, as here.
+*
+*  The GNU MP Library is distributed in the hope that it will be useful, but
+*  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+*  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+*  for more details.
+*
+*  You should have received copies of the GNU General Public License and the
+*  GNU Lesser General Public License along with the GNU MP Library.  If not,
+*  see https://www.gnu.org/licenses/.
+
+            IDENT           GMPN_MULWW
+**********************************************
+*      Assemble with Cal Version 2.0         *
+*                                            *
+* Generated by CFT77   6.0.4.19              *
+*           on 06/27/00 at 04:34:13          *
+*                                            *
+**********************************************
+* ALLOW UNDERSCORES IN IDENTIFIERS
+            EDIT            OFF
+            FORMAT          NEW
+@DATA       SECTION         DATA,CM
+@DATA       =               W.*
+            CON             O'0000000000040000000000
+            CON             O'0435152404713723252514
+            CON             O'0535270000000000000000
+            CON             O'0000000000000001200012
+            VWD             32/0,32/P.GMPN_MULWW
+            CON             O'0014003000000000001416
+            CON             O'0000000000000000000011
+            CON             O'0000000000000000000215
+            BSSZ            1
+@CODE       SECTION         CODE
+@CODE       =               P.*
+L3          =               P.*
+            A0              A6
+            A5              6
+            B03,A5          0,A0
+            A0              A1+A2
+            A5              1
+            0,A0            T00,A5
+            B02             A2
+            B66             A3
+            B01             A6
+            A7              P.L4
+            B00             A7
+            A6              @DATA
+            J               $STKOFEN
+GMPN_MULWW  =               P.*
+            A0              @DATA+3
+            B77             A0
+            A1              13
+            A0              B66
+            A2              B66
+            A4              B67
+            0,A0            B77,A1
+            A7              782
+            A3              A2+A7
+            A0              A4-A3
+            JAM             L3
+            A0              A6
+            A5              6
+            B03,A5          0,A0
+            A0              A1+A2
+            A5              1
+            0,A0            T00,A5
+            B02             A2
+            B66             A3
+            B01             A6
+L4          =               P.*
+            A7              B07
+            S7              0,A7
+            A6              B10
+            S6              0,A6
+            S5              1
+            S4              <22
+            S7              S7-S5
+            S5              #S7
+            T00             S6
+            S6              S6>22
+            S7              T00
+            S7              S7>44
+            S3              T00
+            S3              S3&S4
+            S6              S6&S4
+            S7              S7&S4
+            S3              S3<24
+            S6              S6<24
+            S7              S7<24
+            S0              S5
+            S4              S5
+            S1              S6
+            S2              S3
+            S3              S7
+            JSP             L5
+L6          =               P.*
+            S7              -S4
+            A2              S7
+            VL              A2
+            A3              B06
+            A5              B05
+            A4              B04
+            A1              VL
+            A2              S4
+L7          =               P.*
+            A0              A3
+            VL              A1
+            V7              ,A0,1
+            B11             A5
+            A7              22
+            B12             A4
+            V6              V7>A7
+            B13             A3
+            S7              <22
+            A3              B02
+            V5              S7&V6
+            A6              24
+            V4              V5<A6
+            V3              S1*FV4
+            V2              S7&V7
+            V1              V2<A6
+            V0              S3*FV1
+            V6              V0+V3
+            A5              44
+            V5              V7>A5
+            V2              S1*FV1
+            V3              S7&V5
+            A0              14
+            B77             A0
+            A4              B77
+            A0              A4+A3
+            ,A0,1           V2
+            V0              V3<A6
+            V7              S2*FV1
+            A4              142
+            A0              A4+A3
+            ,A0,1           V7
+            V5              V7>A7
+            V2              S2*FV0
+            V3              V6+V2
+            S7              <20
+            V1              S7&V3
+            A4              270
+            A0              A4+A3
+            ,A0,1           V0
+            A4              14
+            A0              A4+A3
+            V7              ,A0,1
+            V6              V1<A7
+            V2              S2*FV4
+            V0              V7+V2
+            S7              <42
+            V1              S7&V0
+            A4              398
+            A0              A4+A3
+            ,A0,1           V0
+            V7              S3*FV4
+            V2              V5+V1
+            V0              V3<A5
+            A5              526
+            A0              A5+A3
+            ,A0,1           V0
+            A5              270
+            A0              A5+A3
+            V4              ,A0,1
+            V5              V2+V6
+            A5              20
+            V1              V3>A5
+            V0              S1*FV4
+            A5              654
+            A0              A5+A3
+            ,A0,1           V1
+            V6              V7+V0
+            A5              2
+            V2              V6<A5
+            V3              S3*FV4
+            A5              142
+            A0              A5+A3
+            V1              ,A0,1
+            A5              526
+            A0              A5+A3
+            V7              ,A0,1
+            V0              V1+V7
+            V6              V3<A6
+            V4              V6+V2
+            A6              42
+            V7              V5>A6
+            A5              654
+            A0              A5+A3
+            V1              ,A0,1
+            A5              398
+            A0              A5+A3
+            V3              ,A0,1
+            V6              V4+V1
+            V2              V3>A6
+            V5              V6+V2
+            A6              B12
+            V4              V3<A7
+            A7              B13
+            A3              A7+A1
+            A7              B11
+            A5              A7+A1
+            A4              A6+A1
+            A7              A2+A1
+            A0              A2+A1
+            A2              64
+            B13             A0
+            V1              V0+V4
+            A0              B11
+            ,A0,1           V1
+            V6              V5+V7
+            A0              A6
+            ,A0,1           V6
+            A0              B13
+            A1              A2
+            A2              A7
+            JAN             L7
+L8          =               P.*
+L5          =               P.*
+            S1              0
+            A0              B02
+            A2              B02
+            A1              13
+            B66             A0
+            B77,A1          0,A0
+            A0              A2+A1
+            A1              1
+            T00,A1          0,A0
+            J               B00
+            EXT             $STKOFEN:p
+            ENTRY           GMPN_MULWW
+            END
diff --git a/third_party/gmp/mpn/cray/cfp/submul_1.c b/third_party/gmp/mpn/cray/cfp/submul_1.c
new file mode 100644
index 0000000..622c275
--- /dev/null
+++ b/third_party/gmp/mpn/cray/cfp/submul_1.c
@@ -0,0 +1,48 @@
+/* mpn_submul_1 for Cray PVP.
+
+Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t limb)
+{
+  mp_limb_t p0[n], p1[n], tp[n];
+  mp_limb_t cy_limb;
+
+  GMPN_MULWW (p1, p0, up, &n, &limb);
+  cy_limb = mpn_sub_n (tp, rp, p0, n);
+  rp[0] = tp[0];
+  if (n != 1)
+    cy_limb += mpn_sub_n (rp + 1, tp + 1, p1, n - 1);
+  cy_limb += p1[n - 1];
+
+  return cy_limb;
+}
diff --git a/third_party/gmp/mpn/cray/gmp-mparam.h b/third_party/gmp/mpn/cray/gmp-mparam.h
new file mode 100644
index 0000000..1baed1e
--- /dev/null
+++ b/third_party/gmp/mpn/cray/gmp-mparam.h
@@ -0,0 +1,74 @@
+/* Cray T90 CFP gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1996, 2000-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* T90 Unicos 10.0.X in CFP mode */
+
+/* Generated by tuneup.c, 2004-02-07, system compiler */
+
+#define MUL_TOOM22_THRESHOLD             71
+#define MUL_TOOM33_THRESHOLD            131
+
+#define SQR_BASECASE_THRESHOLD           32
+#define SQR_TOOM2_THRESHOLD             199
+#define SQR_TOOM3_THRESHOLD             363
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* (preinv always) */
+#define DIV_DC_THRESHOLD                996
+#define POWM_THRESHOLD                  601
+
+#define HGCD_THRESHOLD                  964
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD               2874
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD         0  /* always */
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define USE_PREINV_DIVREM_1               1  /* preinv always */
+#define USE_PREINV_MOD_1                  1  /* preinv always */
+#define DIVREM_2_THRESHOLD                0  /* preinv always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             26
+#define GET_STR_PRECOMPUTE_THRESHOLD     42
+#define SET_STR_THRESHOLD            145756
+
+#define MUL_FFT_TABLE  { 272, 544, 1088, 2304, 5120, 12288, 49152, 0 }
+#define MUL_FFT_MODF_THRESHOLD          200
+#define MUL_FFT_THRESHOLD              1664
+
+#define SQR_FFT_TABLE  { 1008, 2080, 3904, 7936, 17408, 45056, 0 }
+#define SQR_FFT_MODF_THRESHOLD          600
+#define SQR_FFT_THRESHOLD              2976
diff --git a/third_party/gmp/mpn/cray/hamdist.c b/third_party/gmp/mpn/cray/hamdist.c
new file mode 100644
index 0000000..e7f177a
--- /dev/null
+++ b/third_party/gmp/mpn/cray/hamdist.c
@@ -0,0 +1,42 @@
+/* Cray mpn_hamdist -- hamming distance count.
+
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+unsigned long int
+mpn_hamdist (mp_srcptr p1, mp_srcptr p2, mp_size_t n)
+{
+  unsigned long int result = 0;
+  mp_size_t i;
+  for (i = 0; i < n; i++)
+    result += _popcnt (p1[i] ^ p2[i]);
+  return result;
+}
diff --git a/third_party/gmp/mpn/cray/ieee/addmul_1.c b/third_party/gmp/mpn/cray/ieee/addmul_1.c
new file mode 100644
index 0000000..ce7dfbb
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/addmul_1.c
@@ -0,0 +1,111 @@
+/* Cray PVP/IEEE mpn_addmul_1 -- multiply a limb vector with a limb and add the
+   result to a second limb vector.
+
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at just under 9 cycles/limb on a T90.  That is not perfect,
+   mainly due to vector register shortage in the main loop.  Assembly code
+   should bring it down to perhaps 7 cycles/limb.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, s1, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  if (up == rp)
+    {
+      /* The algorithm used below cannot handle overlap.  Handle it here by
+	 making a temporary copy of the source vector, then calling ourselves.  */
+      mp_limb_t xp[n];
+      MPN_COPY (xp, up, n);
+      return mpn_addmul_1 (rp, xp, n, vl);
+    }
+
+  a = up[0] * vl;
+  r = rp[0];
+  s0 = a + r;
+  rp[0] = s0;
+  c0 = ((a & r) | ((a | r) & ~s0)) >> 63;
+  cy[0] = c0;
+
+  /* Main multiply loop.  Generate a raw accumulated output product in rp[]
+     and a carry vector in cy[].  */
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      a = up[i] * vl;
+      b = _int_mult_upper (up[i - 1], vl);
+      s0 = a + b;
+      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+      r = rp[i];
+      s1 = s0 + r;
+      rp[i] = s1;
+      c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63;
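+      /* Both c0 and c1 can be 1, so cy[] entries range over 0..2.  */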
+      cy[i] = c0 + c1;
+    }
+  /* Carry add loop.  Add the carry vector cy[] to the raw result rp[] and
+     store the new result back to rp[].  */
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r + c0;
+      rp[i] = s0;
+      c0 = (r & ~s0) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in a scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      /* Look for places where rp[k] == 0 and cy[k-1] == 1 or
+	 rp[k] == 1 and cy[k-1] == 2.
+	 These are where we got a recurrence carry.  */
+      for (i = 1; i < n; i++)
+	{
+	  r = rp[i];
+	  c0 = r < cy[i - 1];
+	  s0 = r + cyrec;
+	  rp[i] = s0;
+	  c1 = (r & ~s0) >> 63;
+	  cyrec = c0 | c1;
+	}
+      return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
+    }
+
+  return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
+}
diff --git a/third_party/gmp/mpn/cray/ieee/gmp-mparam.h b/third_party/gmp/mpn/cray/ieee/gmp-mparam.h
new file mode 100644
index 0000000..1fdc286
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/gmp-mparam.h
@@ -0,0 +1,73 @@
+/* Cray T90 IEEE gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1996, 2000-2002, 2004 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Generated by tuneup.c, 2004-02-07, system compiler */
+
+#define MUL_TOOM22_THRESHOLD            130
+#define MUL_TOOM33_THRESHOLD            260
+
+#define SQR_BASECASE_THRESHOLD            9  /* karatsuba */
+#define SQR_TOOM2_THRESHOLD               0  /* never sqr_basecase */
+#define SQR_TOOM3_THRESHOLD              34
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* preinv always */
+#define DIV_DC_THRESHOLD                390
+#define POWM_THRESHOLD                  656
+
+#define HGCD_THRESHOLD                  964
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                964
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           0  /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD         0  /* always */
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define USE_PREINV_DIVREM_1               1  /* preinv always */
+#define USE_PREINV_MOD_1                  1  /* preinv always */
+#define DIVREM_2_THRESHOLD                0  /* preinv always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             45
+#define GET_STR_PRECOMPUTE_THRESHOLD     77
+#define SET_STR_THRESHOLD            145756
+
+#define MUL_FFT_TABLE  { 1104, 2208, 4416, 8960, 19456, 45056, 0 }
+#define MUL_FFT_MODF_THRESHOLD         1168
+#define MUL_FFT_THRESHOLD              6528
+
+#define SQR_FFT_TABLE  { 368, 736, 1600, 2816, 7168, 12288, 0 }
+#define SQR_FFT_MODF_THRESHOLD          296
+#define SQR_FFT_THRESHOLD              1312
diff --git a/third_party/gmp/mpn/cray/ieee/invert_limb.c b/third_party/gmp/mpn/cray/ieee/invert_limb.c
new file mode 100644
index 0000000..774a27b
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/invert_limb.c
@@ -0,0 +1,127 @@
+/* mpn_invert_limb -- Invert a normalized limb.
+
+Copyright 1991, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/*
+  This is needed to make configure define HAVE_NATIVE_mpn_invert_limb:
+  PROLOGUE(mpn_invert_limb)
+*/
+
+static const unsigned short int approx_tab[0x100] =
+{
+  /* 0x400, */
+  0x3ff,
+         0x3fc, 0x3f8, 0x3f4, 0x3f0, 0x3ec, 0x3e8, 0x3e4,
+  0x3e0, 0x3dd, 0x3d9, 0x3d5, 0x3d2, 0x3ce, 0x3ca, 0x3c7,
+  0x3c3, 0x3c0, 0x3bc, 0x3b9, 0x3b5, 0x3b2, 0x3ae, 0x3ab,
+  0x3a8, 0x3a4, 0x3a1, 0x39e, 0x39b, 0x397, 0x394, 0x391,
+  0x38e, 0x38b, 0x387, 0x384, 0x381, 0x37e, 0x37b, 0x378,
+  0x375, 0x372, 0x36f, 0x36c, 0x369, 0x366, 0x364, 0x361,
+  0x35e, 0x35b, 0x358, 0x355, 0x353, 0x350, 0x34d, 0x34a,
+  0x348, 0x345, 0x342, 0x340, 0x33d, 0x33a, 0x338, 0x335,
+  0x333, 0x330, 0x32e, 0x32b, 0x329, 0x326, 0x324, 0x321,
+  0x31f, 0x31c, 0x31a, 0x317, 0x315, 0x313, 0x310, 0x30e,
+  0x30c, 0x309, 0x307, 0x305, 0x303, 0x300, 0x2fe, 0x2fc,
+  0x2fa, 0x2f7, 0x2f5, 0x2f3, 0x2f1, 0x2ef, 0x2ec, 0x2ea,
+  0x2e8, 0x2e6, 0x2e4, 0x2e2, 0x2e0, 0x2de, 0x2dc, 0x2da,
+  0x2d8, 0x2d6, 0x2d4, 0x2d2, 0x2d0, 0x2ce, 0x2cc, 0x2ca,
+  0x2c8, 0x2c6, 0x2c4, 0x2c2, 0x2c0, 0x2be, 0x2bc, 0x2bb,
+  0x2b9, 0x2b7, 0x2b5, 0x2b3, 0x2b1, 0x2b0, 0x2ae, 0x2ac,
+  0x2aa, 0x2a8, 0x2a7, 0x2a5, 0x2a3, 0x2a1, 0x2a0, 0x29e,
+  0x29c, 0x29b, 0x299, 0x297, 0x295, 0x294, 0x292, 0x291,
+  0x28f, 0x28d, 0x28c, 0x28a, 0x288, 0x287, 0x285, 0x284,
+  0x282, 0x280, 0x27f, 0x27d, 0x27c, 0x27a, 0x279, 0x277,
+  0x276, 0x274, 0x273, 0x271, 0x270, 0x26e, 0x26d, 0x26b,
+  0x26a, 0x268, 0x267, 0x265, 0x264, 0x263, 0x261, 0x260,
+  0x25e, 0x25d, 0x25c, 0x25a, 0x259, 0x257, 0x256, 0x255,
+  0x253, 0x252, 0x251, 0x24f, 0x24e, 0x24d, 0x24b, 0x24a,
+  0x249, 0x247, 0x246, 0x245, 0x243, 0x242, 0x241, 0x240,
+  0x23e, 0x23d, 0x23c, 0x23b, 0x239, 0x238, 0x237, 0x236,
+  0x234, 0x233, 0x232, 0x231, 0x230, 0x22e, 0x22d, 0x22c,
+  0x22b, 0x22a, 0x229, 0x227, 0x226, 0x225, 0x224, 0x223,
+  0x222, 0x220, 0x21f, 0x21e, 0x21d, 0x21c, 0x21b, 0x21a,
+  0x219, 0x218, 0x216, 0x215, 0x214, 0x213, 0x212, 0x211,
+  0x210, 0x20f, 0x20e, 0x20d, 0x20c, 0x20b, 0x20a, 0x209,
+  0x208, 0x207, 0x206, 0x205, 0x204, 0x203, 0x202, 0x201,
+};
+
+/* iteration: z = 2z-(z**2)d */
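+
+/* Added note: each step of this Newton iteration roughly doubles the
+   number of correct bits in the estimate z, so the ~10-bit table value
+   reaches a full limb after the few steps below.  */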
+
+mp_limb_t
+mpn_invert_limb (mp_limb_t d)
+{
+  mp_limb_t z, z2l, z2h, tl, th;
+  mp_limb_t xh, xl;
+  mp_limb_t zh, zl;
+
+#if GMP_LIMB_BITS == 32
+  z = approx_tab[(d >> 23) - 0x100] << 6;	/* z < 2^16 */
+
+  z2l = z * z;					/* z2l < 2^32 */
+  umul_ppmm (th, tl, z2l, d);
+  z = (z << 17) - (th << 1);
+#endif
+#if GMP_LIMB_BITS == 64
+  z = approx_tab[(d >> 55) - 0x100] << 6;	/* z < 2^16 */
+
+  z2l = z * z;					/* z2l < 2^32 */
+  th = z2l * (d >> 32);				/* th < 2^64 */
+  z = (z << 17) - (th >> 31);			/* z < 2^32 */
+
+  z2l = z * z;
+  umul_ppmm (th, tl, z2l, d);
+  z = (z << 33) - (th << 1);
+#endif
+
+  umul_ppmm (z2h, z2l, z, z);
+  umul_ppmm (th, tl, z2h, d);
+  umul_ppmm (xh, xl, z2l, d);
+  tl += xh;
+  th += tl < xh;
+  th = (th << 2) | (tl >> (GMP_LIMB_BITS - 2));
+  tl = tl << 2;
+  sub_ddmmss (zh, zl, z << 2, 0, th, tl);
+
+  umul_ppmm (xh, xl, d, zh);
+  xh += d;		/* add_ssaaaa (xh, xl, xh, xl, d, 0); */
+  if (~xh != 0)
+    {
+      add_ssaaaa (xh, xl, xh, xl, 0, d);
+      zh++;
+    }
+
+  add_ssaaaa (xh, xl, xh, xl, 0, d);
+  if (xh != 0)
+    zh++;
+
+  return zh;
+}
diff --git a/third_party/gmp/mpn/cray/ieee/mul_1.c b/third_party/gmp/mpn/cray/ieee/mul_1.c
new file mode 100644
index 0000000..40139fb
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/mul_1.c
@@ -0,0 +1,103 @@
+/* Cray PVP/IEEE mpn_mul_1 -- multiply a limb vector with a limb and store the
+   result in a second limb vector.
+
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at 5 cycles/limb on a T90.  That would probably
+   be hard to improve upon, even with assembly code.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, s1, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  if (up == rp)
+    {
+      /* The algorithm used below cannot handle overlap.  Handle it here by
+	 making a temporary copy of the source vector, then calling ourselves.  */
+      mp_limb_t xp[n];
+      MPN_COPY (xp, up, n);
+      return mpn_mul_1 (rp, xp, n, vl);
+    }
+
+  a = up[0] * vl;
+  rp[0] = a;
+  cy[0] = 0;
+
+  /* Main multiply loop.  Generate a raw accumulated output product in rp[]
+     and a carry vector in cy[].  */
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      a = up[i] * vl;
+      b = _int_mult_upper (up[i - 1], vl);
+      s0 = a + b;
+      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+      rp[i] = s0;
+      cy[i] = c0;
+    }
+  /* Carry add loop.  Add the carry vector cy[] to the raw sum rp[] and
+     store the new sum back to rp[].  */
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 2; i < n; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r + c0;
+      rp[i] = s0;
+      c0 = (r & ~s0) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in a scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      /* Look for places where rp[k] is zero and cy[k-1] is non-zero.
+	 These are where we got a recurrence carry.  */
+      for (i = 2; i < n; i++)
+	{
+	  r = rp[i];
+	  c0 = (r == 0 && cy[i - 1] != 0);
+	  s0 = r + cyrec;
+	  rp[i] = s0;
+	  c1 = (r & ~s0) >> 63;
+	  cyrec = c0 | c1;
+	}
+      return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
+    }
+
+  return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
+}
diff --git a/third_party/gmp/mpn/cray/ieee/mul_basecase.c b/third_party/gmp/mpn/cray/ieee/mul_basecase.c
new file mode 100644
index 0000000..72628f7
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/mul_basecase.c
@@ -0,0 +1,107 @@
+/* Cray PVP/IEEE mpn_mul_basecase.
+
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* The most critical loop of this code runs at about 5 cycles/limb on a T90.
+   That is not perfect, mainly due to vector register shortage.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+void
+mpn_mul_basecase (mp_ptr rp,
+		  mp_srcptr up, mp_size_t un,
+		  mp_srcptr vp, mp_size_t vn)
+{
+  mp_limb_t cy[un + vn];
+  mp_limb_t vl;
+  mp_limb_t a, b, r, s0, s1, c0, c1;
+  mp_size_t i, j;
+  int more_carries;
+
+  for (i = 0; i < un + vn; i++)
+    {
+      rp[i] = 0;
+      cy[i] = 0;
+    }
+
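+  /* The outer loop over vp[] stays scalar (novector below); only the
+     inner loop over up[] is vectorized.  */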
+#pragma _CRI novector
+  for (j = 0; j < vn; j++)
+    {
+      vl = vp[j];
+
+      a = up[0] * vl;
+      r = rp[j];
+      s0 = a + r;
+      rp[j] = s0;
+      c0 = ((a & r) | ((a | r) & ~s0)) >> 63;
+      cy[j] += c0;
+
+#pragma _CRI ivdep
+      for (i = 1; i < un; i++)
+	{
+	  a = up[i] * vl;
+	  b = _int_mult_upper (up[i - 1], vl);
+	  s0 = a + b;
+	  c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+	  r = rp[j + i];
+	  s1 = s0 + r;
+	  rp[j + i] = s1;
+	  c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63;
+	  cy[j + i] += c0 + c1;
+	}
+      rp[j + un] = _int_mult_upper (up[un - 1], vl);
+    }
+
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < un + vn; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r + c0;
+      rp[i] = s0;
+      c0 = (r & ~s0) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in a scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      for (i = 1; i < un + vn; i++)
+	{
+	  r = rp[i];
+	  c0 = (r < cy[i - 1]);
+	  s0 = r + cyrec;
+	  rp[i] = s0;
+	  c1 = (r & ~s0) >> 63;
+	  cyrec = c0 | c1;
+	}
+    }
+}
diff --git a/third_party/gmp/mpn/cray/ieee/sqr_basecase.c b/third_party/gmp/mpn/cray/ieee/sqr_basecase.c
new file mode 100644
index 0000000..5bd4e56
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/sqr_basecase.c
@@ -0,0 +1,105 @@
+/* Cray PVP/IEEE mpn_sqr_basecase.
+
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This is just mpn_mul_basecase with trivial modifications.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+void
+mpn_sqr_basecase (mp_ptr rp,
+		  mp_srcptr up, mp_size_t un)
+{
+  mp_limb_t cy[un + un];
+  mp_limb_t ul;
+  mp_limb_t a, b, r, s0, s1, c0, c1;
+  mp_size_t i, j;
+  int more_carries;
+
+  for (i = 0; i < un + un; i++)
+    {
+      rp[i] = 0;
+      cy[i] = 0;
+    }
+
+#pragma _CRI novector
+  for (j = 0; j < un; j++)
+    {
+      ul = up[j];
+
+      a = up[0] * ul;
+      r = rp[j];
+      s0 = a + r;
+      rp[j] = s0;
+      c0 = ((a & r) | ((a | r) & ~s0)) >> 63;
+      cy[j] += c0;
+
+#pragma _CRI ivdep
+      for (i = 1; i < un; i++)
+	{
+	  a = up[i] * ul;
+	  b = _int_mult_upper (up[i - 1], ul);
+	  s0 = a + b;
+	  c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+	  r = rp[j + i];
+	  s1 = s0 + r;
+	  rp[j + i] = s1;
+	  c1 = ((s0 & r) | ((s0 | r) & ~s1)) >> 63;
+	  cy[j + i] += c0 + c1;
+	}
+      rp[j + un] = _int_mult_upper (up[un - 1], ul);
+    }
+
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < un + un; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r + c0;
+      rp[i] = s0;
+      c0 = (r & ~s0) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in a scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      for (i = 1; i < un + un; i++)
+	{
+	  r = rp[i];
+	  c0 = (r < cy[i - 1]);
+	  s0 = r + cyrec;
+	  rp[i] = s0;
+	  c1 = (r & ~s0) >> 63;
+	  cyrec = c0 | c1;
+	}
+    }
+}
diff --git a/third_party/gmp/mpn/cray/ieee/submul_1.c b/third_party/gmp/mpn/cray/ieee/submul_1.c
new file mode 100644
index 0000000..2b3ca21
--- /dev/null
+++ b/third_party/gmp/mpn/cray/ieee/submul_1.c
@@ -0,0 +1,111 @@
+/* Cray PVP/IEEE mpn_submul_1 -- multiply a limb vector with a limb and
+   subtract the result from a second limb vector.
+
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at just under 9 cycles/limb on a T90.  That is not perfect,
+   mainly due to vector register shortage in the main loop.  Assembly code
+   should bring it down to perhaps 7 cycles/limb.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, s1, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  if (up == rp)
+    {
+      /* The algorithm used below cannot handle overlap.  Handle it here by
+	 making a temporary copy of the source vector, then calling ourselves.  */
+      mp_limb_t xp[n];
+      MPN_COPY (xp, up, n);
+      return mpn_submul_1 (rp, xp, n, vl);
+    }
+
+  a = up[0] * vl;
+  r = rp[0];
+  s0 = r - a;
+  rp[0] = s0;
+  c1 = ((s0 & a) | ((s0 | a) & ~r)) >> 63;
+  cy[0] = c1;
+
+  /* Main multiply loop.  Generate a raw accumulated output product in rp[]
+     and a carry vector in cy[].  */
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      a = up[i] * vl;
+      b = _int_mult_upper (up[i - 1], vl);
+      s0 = a + b;
+      c0 = ((a & b) | ((a | b) & ~s0)) >> 63;
+      r = rp[i];
+      s1 = r - s0;
+      rp[i] = s1;
+      c1 = ((s1 & s0) | ((s1 | s0) & ~r)) >> 63;
+      cy[i] = c0 + c1;
+    }
+  /* Carry subtract loop.  Subtract the carry vector cy[] from the raw result
+     rp[] and store the new result back to rp[].  */
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r - c0;
+      rp[i] = s0;
+      c0 = (s0 & ~r) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated carry, handle that in a scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      /* Look for places where rp[k] == ~0 and cy[k-1] == 1 or
+	 rp[k] == ~1 and cy[k-1] == 2.
+	 These are where we got a recurrence carry.  */
+      for (i = 1; i < n; i++)
+	{
+	  r = rp[i];
+	  c0 = ~r < cy[i - 1];
+	  s0 = r - cyrec;
+	  rp[i] = s0;
+	  c1 = (s0 & ~r) >> 63;
+	  cyrec = c0 | c1;
+	}
+      return _int_mult_upper (up[n - 1], vl) + cyrec + cy[n - 1];
+    }
+
+  return _int_mult_upper (up[n - 1], vl) + cy[n - 1];
+}
diff --git a/third_party/gmp/mpn/cray/lshift.c b/third_party/gmp/mpn/cray/lshift.c
new file mode 100644
index 0000000..8534e93
--- /dev/null
+++ b/third_party/gmp/mpn/cray/lshift.c
@@ -0,0 +1,58 @@
+/* mpn_lshift -- Shift left low level for Cray vector processors.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
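+/* Note: as for all mpn shift routines, 1 <= cnt < GMP_LIMB_BITS is
+   assumed; cnt == 0 would make the sh_2 shifts below undefined.  */
+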
+mp_limb_t
+mpn_lshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt)
+{
+  unsigned sh_1, sh_2;
+  mp_size_t i;
+  mp_limb_t retval;
+
+  sh_1 = cnt;
+  sh_2 = GMP_LIMB_BITS - sh_1;
+  retval = up[n - 1] >> sh_2;
+
+#pragma _CRI ivdep
+  for (i = n - 1; i > 0; i--)
+    {
+#if 1
+      wp[i] = (up[i] << sh_1) | (up[i - 1] >> sh_2);
+#else
+      /* This is the recommended way, but at least on SV1 it is slower.  */
+      wp[i] = _dshiftl (up[i], up[i - 1], sh_1);
+#endif
+    }
+
+  wp[0] = up[0] << sh_1;
+  return retval;
+}
diff --git a/third_party/gmp/mpn/cray/mulww.f b/third_party/gmp/mpn/cray/mulww.f
new file mode 100644
index 0000000..6885dfc
--- /dev/null
+++ b/third_party/gmp/mpn/cray/mulww.f
@@ -0,0 +1,63 @@
+c    Helper for mpn_mul_1, mpn_addmul_1, and mpn_submul_1 for Cray PVP.
+
+c    Copyright 1996, 2000 Free Software Foundation, Inc.
+
+c    This file is part of the GNU MP Library.
+c
+c    The GNU MP Library is free software; you can redistribute it and/or modify
+c    it under the terms of either:
+c
+c      * the GNU Lesser General Public License as published by the Free
+c        Software Foundation; either version 3 of the License, or (at your
+c        option) any later version.
+c
+c    or
+c
+c      * the GNU General Public License as published by the Free Software
+c        Foundation; either version 2 of the License, or (at your option) any
+c        later version.
+c
+c    or both in parallel, as here.
+c
+c    The GNU MP Library is distributed in the hope that it will be useful, but
+c    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+c    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+c    for more details.
+c
+c    You should have received copies of the GNU General Public License and the
+c    GNU Lesser General Public License along with the GNU MP Library.  If not,
+c    see https://www.gnu.org/licenses/.
+
+c    p1[] = hi(a[]*s); the upper limbs of each product
+c    p0[] = low(a[]*s); the corresponding lower limbs
+c    n is the number of limbs in the vectors
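+c
+c    Added note: 4194303 = 2**22-1 and 4398046511103 = 2**42-1; t0..t4
+c    below are the column sums of the products of the 22-bit pieces,
+c    recombined into the exact 128-bit product p1(i):p0(i) = a(i)*s.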
+
+      subroutine gmpn_mulww(p1,p0,a,n,s)
+      integer*8 p1(0:*),p0(0:*),a(0:*),s
+      integer n
+
+      integer*8 a0,a1,a2,s0,s1,s2,c
+      integer*8 ai,t0,t1,t2,t3,t4
+
+      s0 = shiftl(and(s,4194303),24)
+      s1 = shiftl(and(shiftr(s,22),4194303),24)
+      s2 = shiftl(and(shiftr(s,44),4194303),24)
+
+      do i = 0,n-1
+         ai = a(i)
+         a0 = shiftl(and(ai,4194303),24)
+         a1 = shiftl(and(shiftr(ai,22),4194303),24)
+         a2 = shiftl(and(shiftr(ai,44),4194303),24)
+
+         t0 = i24mult(a0,s0)
+         t1 = i24mult(a0,s1)+i24mult(a1,s0)
+         t2 = i24mult(a0,s2)+i24mult(a1,s1)+i24mult(a2,s0)
+         t3 = i24mult(a1,s2)+i24mult(a2,s1)
+         t4 = i24mult(a2,s2)
+
+         p0(i)=shiftl(t2,44)+shiftl(t1,22)+t0
+         c=shiftr(shiftr(t0,22)+and(t1,4398046511103)+
+     $        shiftl(and(t2,1048575),22),42)
+         p1(i)=shiftl(t4,24)+shiftl(t3,2)+shiftr(t2,20)+shiftr(t1,42)+c
+      end do
+      end
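
For reference, the arithmetic the Fortran performs: each 64-bit operand is
split into 22-bit digits, a = a0 + a1*2^22 + a2*2^44, the five digit products
t0..t4 are formed, and the 128-bit result is reassembled from them.  A
portable C sketch of the same reassembly, assuming plain 64-bit multiplies in
place of the Cray i24mult intrinsic (illustration only, not GMP code):

    #include <stdio.h>
    #include <stdint.h>

    static void mulww (uint64_t *hi, uint64_t *lo, uint64_t a, uint64_t s)
    {
      const uint64_t M = (1ull << 22) - 1;
      uint64_t a0 = a & M, a1 = (a >> 22) & M, a2 = a >> 44;
      uint64_t s0 = s & M, s1 = (s >> 22) & M, s2 = s >> 44;

      uint64_t t0 = a0 * s0;                      /* weight 2^0  */
      uint64_t t1 = a0 * s1 + a1 * s0;            /* weight 2^22 */
      uint64_t t2 = a0 * s2 + a1 * s1 + a2 * s0;  /* weight 2^44 */
      uint64_t t3 = a1 * s2 + a2 * s1;            /* weight 2^66 */
      uint64_t t4 = a2 * s2;                      /* weight 2^88 */

      uint64_t acc = t0 + (t1 << 22);     /* low word, tracking wraparound */
      uint64_t c = acc < t0;
      *lo = acc + (t2 << 44);
      c += *lo < acc;
      *hi = (t1 >> 42) + (t2 >> 20) + (t3 << 2) + (t4 << 24) + c;
    }

    int main (void)
    {
      uint64_t hi, lo;
      mulww (&hi, &lo, 1ull << 32, 1ull << 32);   /* 2^32 * 2^32 = 2^64 */
      printf ("%llu %llu\n", (unsigned long long) hi,
              (unsigned long long) lo);           /* prints "1 0" */
      return 0;
    }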
diff --git a/third_party/gmp/mpn/cray/popcount.c b/third_party/gmp/mpn/cray/popcount.c
new file mode 100644
index 0000000..a79211f
--- /dev/null
+++ b/third_party/gmp/mpn/cray/popcount.c
@@ -0,0 +1,42 @@
+/* Cray mpn_popcount -- population count.
+
+Copyright 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+unsigned long int
+mpn_popcount (mp_srcptr p, mp_size_t n)
+{
+  unsigned long int result = 0;
+  mp_size_t i;
+  for (i = 0; i < n; i++)
+    result += _popcnt (p[i]);
+  return result;
+}
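
_popcnt is a Cray intrinsic; a portable sketch of the same loop, with the
classic clear-lowest-set-bit count standing in for it (illustrative names,
not GMP code):

    #include <stdint.h>

    static unsigned popcount64 (uint64_t x)
    {
      unsigned c = 0;
      for (; x != 0; x &= x - 1)   /* clears the lowest set bit */
        c++;
      return c;
    }

    static unsigned long popcount_vec (const uint64_t *p, long n)
    {
      unsigned long result = 0;
      for (long i = 0; i < n; i++)
        result += popcount64 (p[i]);
      return result;
    }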
diff --git a/third_party/gmp/mpn/cray/rshift.c b/third_party/gmp/mpn/cray/rshift.c
new file mode 100644
index 0000000..9c4aa22
--- /dev/null
+++ b/third_party/gmp/mpn/cray/rshift.c
@@ -0,0 +1,58 @@
+/* mpn_rshift -- Shift right low level for Cray vector processors.
+
+Copyright (C) 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <intrinsics.h>
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_rshift (mp_ptr wp, mp_srcptr up, mp_size_t n, unsigned int cnt)
+{
+  unsigned sh_1, sh_2;
+  mp_size_t i;
+  mp_limb_t retval;
+
+  sh_1 = cnt;
+  sh_2 = GMP_LIMB_BITS - sh_1;
+  retval = up[0] << sh_2;
+
+#pragma _CRI ivdep
+  for (i = 0; i < n - 1; i++)
+    {
+#if 1
+      wp[i] = (up[i] >> sh_1) | (up[i + 1] << sh_2);
+#else
+      /* This is the recommended way, but at least on SV1 it is slower.  */
+      wp[i] = _dshiftr (up[i + 1], up[i], sh_1);
+#endif
+    }
+
+  wp[n - 1] = up[n - 1] >> sh_1;
+  return retval;
+}
diff --git a/third_party/gmp/mpn/cray/sub_n.c b/third_party/gmp/mpn/cray/sub_n.c
new file mode 100644
index 0000000..f518764
--- /dev/null
+++ b/third_party/gmp/mpn/cray/sub_n.c
@@ -0,0 +1,90 @@
+/* Cray PVP mpn_sub_n -- subtract two limb vectors and store their difference
+   in a third limb vector.
+
+Copyright 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* This code runs at 4 cycles/limb.  It may be possible to bring it down
+   to 3 cycles/limb.  */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t cy[n];
+  mp_limb_t a, b, r, s0, c0, c1;
+  mp_size_t i;
+  int more_carries;
+
+  /* Main subtract loop.  Generate a raw output difference in rp[] and a
+     borrow vector in cy[].  */
+#pragma _CRI ivdep
+  for (i = 0; i < n; i++)
+    {
+      a = up[i];
+      b = vp[i];
+      s0 = a - b;		/* a = s0 + b */
+      rp[i] = s0;
+      c0 = ((s0 & b) | ((s0 | b) & ~a)) >> 63;
+      cy[i] = c0;
+    }
+  /* Borrow subtract loop.  Subtract the borrow vector cy[] from the raw
+     difference rp[] and store the new difference back to rp[].  If this
+     generates further borrow, set more_carries.  */
+  more_carries = 0;
+#pragma _CRI ivdep
+  for (i = 1; i < n; i++)
+    {
+      r = rp[i];
+      c0 = cy[i - 1];
+      s0 = r - c0;		/* r = s0 + c0 */
+      rp[i] = s0;
+      c0 = (s0 & ~r) >> 63;
+      more_carries += c0;
+    }
+  /* If that second loop generated borrow, handle that in scalar loop.  */
+  if (more_carries)
+    {
+      mp_limb_t cyrec = 0;
+      /* Look for places where rp[k] contains just ones and cy[k-1] is
+	 non-zero.  These are where a recurring borrow was generated.  */
+      for (i = 1; i < n; i++)
+	{
+	  r = rp[i];
+	  c0 = (~r == 0 && cy[i - 1] != 0);
+	  s0 = r - cyrec;
+	  rp[i] = s0;
+	  c1 = (s0 & ~r) >> 63;
+	  cyrec = c0 | c1;
+	}
+      return cyrec | cy[n - 1];
+    }
+
+  return cy[n - 1];
+}
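
The borrow expression in the main loop is the standard full-subtractor
formula: with s = a - b, the borrow out is the top bit of
(s & b) | ((s | b) & ~a).  A quick exhaustive check at 8-bit width
(illustration only, not GMP code):

    #include <stdio.h>
    #include <stdint.h>

    int main (void)
    {
      for (unsigned a = 0; a < 256; a++)
        for (unsigned b = 0; b < 256; b++)
          {
            uint8_t s = (uint8_t) (a - b);
            unsigned formula = ((s & b) | ((s | b) & (uint8_t) ~a)) >> 7;
            unsigned expect = a < b;   /* borrow iff a < b */
            if (formula != expect)
              {
                printf ("mismatch a=%u b=%u\n", a, b);
                return 1;
              }
          }
      printf ("borrow formula verified for all 8-bit pairs\n");
      return 0;
    }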
diff --git a/third_party/gmp/mpn/generic/add.c b/third_party/gmp/mpn/generic/add.c
new file mode 100644
index 0000000..4a6e3ba
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add.c
@@ -0,0 +1,33 @@
+/* mpn_add - add mpn to mpn.
+
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_FORCE_mpn_add 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/generic/add_1.c b/third_party/gmp/mpn/generic/add_1.c
new file mode 100644
index 0000000..1745aed
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add_1.c
@@ -0,0 +1,33 @@
+/* mpn_add_1 - add limb to mpn.
+
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_FORCE_mpn_add_1 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/generic/add_err1_n.c b/third_party/gmp/mpn/generic/add_err1_n.c
new file mode 100644
index 0000000..b247f19
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add_err1_n.c
@@ -0,0 +1,100 @@
+/* mpn_add_err1_n -- add_n with one error term
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  Computes:
+
+  (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy,
+  return value is carry out.
+
+  (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy).
+  Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep.
+
+  Requires n >= 1.
+
+  None of the outputs may overlap each other or any of the inputs, except
+  that {rp,n} may be equal to {up,n} or {vp,n}.
+*/
+mp_limb_t
+mpn_add_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		mp_ptr ep, mp_srcptr yp,
+                mp_size_t n, mp_limb_t cy)
+{
+  mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, up, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n));
+
+  yp += n - 1;
+  el = eh = 0;
+
+  do
+    {
+      yl = *yp--;
+      ul = *up++;
+      vl = *vp++;
+
+      /* ordinary add_n */
+      ADDC_LIMB (cy1, sl, ul, vl);
+      ADDC_LIMB (cy2, rl, sl, cy);
+      cy = cy1 | cy2;
+      *rp++ = rl;
+
+      /* update (eh:el) */
+      zl = (-cy) & yl;
+      el += zl;
+      eh += el < zl;
+    }
+  while (--n);
+
+#if GMP_NAIL_BITS != 0
+  eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS);
+  el &= GMP_NUMB_MASK;
+#endif
+
+  ep[0] = el;
+  ep[1] = eh;
+
+  return cy;
+}
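
A naive reference model of the bookkeeping above: c[i+1] is the carry out of
limb i, and the two-limb accumulator (eh:el) sums c[i+1] * yp[n-1-i].  Sketch
assuming 64-bit limbs and GMP_NAIL_BITS == 0; illustrative names, not GMP
code:

    #include <stdint.h>

    static uint64_t add_err1_model (uint64_t *rp, const uint64_t *up,
                                    const uint64_t *vp, uint64_t ep[2],
                                    const uint64_t *yp, long n, uint64_t cy)
    {
      uint64_t el = 0, eh = 0;
      for (long i = 0; i < n; i++)
        {
          uint64_t s = up[i] + vp[i];
          uint64_t c1 = s < up[i];
          uint64_t r = s + cy;
          uint64_t c2 = r < s;
          rp[i] = r;
          cy = c1 | c2;                           /* carry out of limb i */
          uint64_t z = (0 - cy) & yp[n - 1 - i];  /* cy ? yp[n-1-i] : 0  */
          el += z;
          eh += el < z;                           /* carry into high limb */
        }
      ep[0] = el;
      ep[1] = eh;
      return cy;
    }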
diff --git a/third_party/gmp/mpn/generic/add_err2_n.c b/third_party/gmp/mpn/generic/add_err2_n.c
new file mode 100644
index 0000000..d584d6d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add_err2_n.c
@@ -0,0 +1,116 @@
+/* mpn_add_err2_n -- add_n with two error terms
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  Computes:
+
+  (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy,
+  return value is carry out.
+
+  (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy).
+  Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
+           c[1]*yp2[n-1] + ... + c[n]*yp2[0],
+  stores two-limb results at {ep,2} and {ep+2,2} respectively.
+
+  Requires n >= 1.
+
+  None of the outputs may overlap each other or any of the inputs, except
+  that {rp,n} may be equal to {up,n} or {vp,n}.
+*/
+mp_limb_t
+mpn_add_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+                mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2,
+                mp_size_t n, mp_limb_t cy)
+{
+  mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, up, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n));
+
+  yp1 += n - 1;
+  yp2 += n - 1;
+  el1 = eh1 = 0;
+  el2 = eh2 = 0;
+
+  do
+    {
+      yl1 = *yp1--;
+      yl2 = *yp2--;
+      ul = *up++;
+      vl = *vp++;
+
+      /* ordinary add_n */
+      ADDC_LIMB (cy1, sl, ul, vl);
+      ADDC_LIMB (cy2, rl, sl, cy);
+      cy = cy1 | cy2;
+      *rp++ = rl;
+
+      /* update (eh1:el1) */
+      zl1 = (-cy) & yl1;
+      el1 += zl1;
+      eh1 += el1 < zl1;
+
+      /* update (eh2:el2) */
+      zl2 = (-cy) & yl2;
+      el2 += zl2;
+      eh2 += el2 < zl2;
+    }
+  while (--n);
+
+#if GMP_NAIL_BITS != 0
+  eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
+  el1 &= GMP_NUMB_MASK;
+  eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
+  el2 &= GMP_NUMB_MASK;
+#endif
+
+  ep[0] = el1;
+  ep[1] = eh1;
+  ep[2] = el2;
+  ep[3] = eh2;
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/add_err3_n.c b/third_party/gmp/mpn/generic/add_err3_n.c
new file mode 100644
index 0000000..a6ed4dc
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add_err3_n.c
@@ -0,0 +1,131 @@
+/* mpn_add_err3_n -- add_n with three error terms
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  Computes:
+
+  (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy,
+  return value is carry out.
+
+  (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy).
+  Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
+           c[1]*yp2[n-1] + ... + c[n]*yp2[0],
+           c[1]*yp3[n-1] + ... + c[n]*yp3[0],
+  stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively.
+
+  Requires n >= 1.
+
+  None of the outputs may overlap each other or any of the inputs, except
+  that {rp,n} may be equal to {up,n} or {vp,n}.
+*/
+mp_limb_t
+mpn_add_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+                mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3,
+                mp_size_t n, mp_limb_t cy)
+{
+  mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, up, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n));
+
+  yp1 += n - 1;
+  yp2 += n - 1;
+  yp3 += n - 1;
+  el1 = eh1 = 0;
+  el2 = eh2 = 0;
+  el3 = eh3 = 0;
+
+  do
+    {
+      yl1 = *yp1--;
+      yl2 = *yp2--;
+      yl3 = *yp3--;
+      ul = *up++;
+      vl = *vp++;
+
+      /* ordinary add_n */
+      ADDC_LIMB (cy1, sl, ul, vl);
+      ADDC_LIMB (cy2, rl, sl, cy);
+      cy = cy1 | cy2;
+      *rp++ = rl;
+
+      /* update (eh1:el1) */
+      zl1 = (-cy) & yl1;
+      el1 += zl1;
+      eh1 += el1 < zl1;
+
+      /* update (eh2:el2) */
+      zl2 = (-cy) & yl2;
+      el2 += zl2;
+      eh2 += el2 < zl2;
+
+      /* update (eh3:el3) */
+      zl3 = (-cy) & yl3;
+      el3 += zl3;
+      eh3 += el3 < zl3;
+    }
+  while (--n);
+
+#if GMP_NAIL_BITS != 0
+  eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
+  el1 &= GMP_NUMB_MASK;
+  eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
+  el2 &= GMP_NUMB_MASK;
+  eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS);
+  el3 &= GMP_NUMB_MASK;
+#endif
+
+  ep[0] = el1;
+  ep[1] = eh1;
+  ep[2] = el2;
+  ep[3] = eh2;
+  ep[4] = el3;
+  ep[5] = eh3;
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/add_n.c b/third_party/gmp/mpn/generic/add_n.c
new file mode 100644
index 0000000..f62ac87
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add_n.c
@@ -0,0 +1,89 @@
+/* mpn_add_n -- Add equal length limb vectors.
+
+Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, sl, rl, cy, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++;
+      sl = ul + vl;
+      cy1 = sl < ul;
+      rl = sl + cy;
+      cy2 = rl < sl;
+      cy = cy1 | cy2;
+      *rp++ = rl;
+    }
+  while (--n != 0);
+
+  return cy;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 1
+
+mp_limb_t
+mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, rl, cy;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++;
+      rl = ul + vl + cy;
+      cy = rl >> GMP_NUMB_BITS;
+      *rp++ = rl & GMP_NUMB_MASK;
+    }
+  while (--n != 0);
+
+  return cy;
+}
+
+#endif
diff --git a/third_party/gmp/mpn/generic/add_n_sub_n.c b/third_party/gmp/mpn/generic/add_n_sub_n.c
new file mode 100644
index 0000000..1e72b5d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/add_n_sub_n.c
@@ -0,0 +1,172 @@
+/* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1999-2001, 2006 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#ifndef L1_CACHE_SIZE
+#define L1_CACHE_SIZE 8192	/* only 68040 has less than this */
+#endif
+
+#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6)
+
+
+/* mpn_add_n_sub_n.
+   r1[] = s1[] + s2[]
+   r2[] = s1[] - s2[]
+   All operands have n limbs.
+   In-place operations allowed.  */
+mp_limb_t
+mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
+{
+  mp_limb_t acyn, acyo;		/* carry for add */
+  mp_limb_t scyn, scyo;		/* carry for subtract */
+  mp_size_t off;		/* offset in operands */
+  mp_size_t this_n;		/* size of current chunk */
+
+  /* We alternatingly add and subtract in chunks that fit into the (L1)
+     cache.  Since the chunks are several hundred limbs, the function call
+     overhead is insignificant, but we get much better locality.  */
+
+  /* We have three variants of the inner loop; the proper one is chosen
+     depending on whether r1 or r2 is the same operand as s1 or s2.  */
+
+  if (r1p != s1p && r1p != s2p)
+    {
+      /* r1 is not identical to either input operand.  We can therefore write
+	 to r1 directly, without using temporary storage.  */
+      acyo = 0;
+      scyo = 0;
+      for (off = 0; off < n; off += PART_SIZE)
+	{
+	  this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc
+	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc
+	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+	}
+    }
+  else if (r2p != s1p && r2p != s2p)
+    {
+      /* r2 is not identical to either input operand.  We can therefore write
+	 to r2 directly, without using temporary storage.  */
+      acyo = 0;
+      scyo = 0;
+      for (off = 0; off < n; off += PART_SIZE)
+	{
+	  this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_sub_nc
+	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+#if HAVE_NATIVE_mpn_add_nc
+	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
+#else
+	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
+	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
+#endif
+	}
+    }
+  else
+    {
+      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice
+	 versa), so we need temporary storage.  */
+      mp_limb_t tp[PART_SIZE];
+      acyo = 0;
+      scyo = 0;
+      for (off = 0; off < n; off += PART_SIZE)
+	{
+	  this_n = MIN (n - off, PART_SIZE);
+#if HAVE_NATIVE_mpn_add_nc
+	  acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
+#else
+	  acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
+	  acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
+#endif
+#if HAVE_NATIVE_mpn_sub_nc
+	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
+#else
+	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
+	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
+#endif
+	  MPN_COPY (r1p + off, tp, this_n);
+	}
+    }
+
+  return 2 * acyo + scyo;
+}
+
+#ifdef MAIN
+#include <stdlib.h>
+#include <stdio.h>
+#include "timing.h"
+
+long cputime ();
+
+int
+main (int argc, char **argv)
+{
+  mp_ptr r1p, r2p, s1p, s2p;
+  double t;
+  mp_size_t n;
+
+  n = strtol (argv[1], 0, 0);
+
+  r1p = malloc (n * GMP_LIMB_BYTES);
+  r2p = malloc (n * GMP_LIMB_BYTES);
+  s1p = malloc (n * GMP_LIMB_BYTES);
+  s2p = malloc (n * GMP_LIMB_BYTES);
+  TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
+  printf ("              separate add and sub: %.3f\n", t);
+  TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n));
+  printf ("combined addsub separate variables: %.3f\n", t);
+  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
+  printf ("        combined addsub r1 overlap: %.3f\n", t);
+  TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,r2p,n));
+  printf ("        combined addsub r2 overlap: %.3f\n", t);
+  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n));
+  printf ("          combined addsub in-place: %.3f\n", t);
+
+  return 0;
+}
+#endif
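
mpn_add_n_sub_n itself is internal, but its contract can be stated with the
public mpn calls it replaces: one pass produces both s1+s2 and s1-s2, and the
return value packs the two carries as 2*acy + scy.  A sketch using the
documented API (link with -lgmp); values chosen so the addition overflows:

    #include <stdio.h>
    #include <gmp.h>

    int main (void)
    {
      mp_limb_t s1[2] = { 0, ~(mp_limb_t) 0 };
      mp_limb_t s2[2] = { 0, 1 };
      mp_limb_t sum[2], dif[2], acy, scy;

      acy = mpn_add_n (sum, s1, s2, 2);   /* sum = s1 + s2, carry out = 1 */
      scy = mpn_sub_n (dif, s1, s2, 2);   /* dif = s1 - s2, borrow out = 0 */
      /* mpn_add_n_sub_n would compute both in a single pass and return: */
      printf ("2*acy + scy = %lu\n", (unsigned long) (2 * acy + scy));
      return 0;
    }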
diff --git a/third_party/gmp/mpn/generic/addmul_1.c b/third_party/gmp/mpn/generic/addmul_1.c
new file mode 100644
index 0000000..6140e8e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/addmul_1.c
@@ -0,0 +1,145 @@
+/* mpn_addmul_1 -- multiply the N long limb vector pointed to by UP by VL,
+   add the N least significant limbs of the product to the limb vector
+   pointed to by RP.  Return the most significant limb of the product,
+   adjusted for carry-out from the addition.
+
+Copyright 1992-1994, 1996, 2000, 2002, 2004, 2016 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+{
+  mp_limb_t u0, crec, c, p1, p0, r0;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+
+  crec = 0;
+  do
+    {
+      u0 = *up++;
+      umul_ppmm (p1, p0, u0, v0);
+
+      r0 = *rp;
+
+      p0 = r0 + p0;
+      c = r0 > p0;
+
+      p1 = p1 + c;
+
+      r0 = p0 + crec;		/* cycle 0, 3, ... */
+      c = p0 > r0;		/* cycle 1, 4, ... */
+
+      crec = p1 + c;		/* cycle 2, 5, ... */
+
+      *rp++ = r0;
+    }
+  while (--n != 0);
+
+  return crec;
+}
+
+#endif
+
+#if GMP_NAIL_BITS == 1
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+{
+  mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, crec, xl, c1, c2, c3;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT_MPN (rp, n);
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (v0);
+
+  shifted_v0 = v0 << GMP_NAIL_BITS;
+  crec = 0;
+  prev_p1 = 0;
+  do
+    {
+      u0 = *up++;
+      r0 = *rp;
+      umul_ppmm (p1, p0, u0, shifted_v0);
+      p0 >>= GMP_NAIL_BITS;
+      ADDC_LIMB (c1, xl, prev_p1, p0);
+      ADDC_LIMB (c2, xl, xl, r0);
+      ADDC_LIMB (c3, xl, xl, crec);
+      crec = c1 + c2 + c3;
+      *rp++ = xl;
+      prev_p1 = p1;
+    }
+  while (--n != 0);
+
+  return prev_p1 + crec;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 2
+
+mp_limb_t
+mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+{
+  mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, xw, crec, xl;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT_MPN (rp, n);
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (v0);
+
+  shifted_v0 = v0 << GMP_NAIL_BITS;
+  crec = 0;
+  prev_p1 = 0;
+  do
+    {
+      u0 = *up++;
+      r0 = *rp;
+      umul_ppmm (p1, p0, u0, shifted_v0);
+      p0 >>= GMP_NAIL_BITS;
+      xw = prev_p1 + p0 + r0 + crec;
+      crec = xw >> GMP_NUMB_BITS;
+      xl = xw & GMP_NUMB_MASK;
+      *rp++ = xl;
+      prev_p1 = p1;
+    }
+  while (--n != 0);
+
+  return prev_p1 + crec;
+}
+
+#endif
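
The contract shared by all three variants: rp[] receives the low n limbs of
{rp,n} + {up,n} * v0, and the most significant limb of that sum is returned.
A reference model using 32-bit "limbs" held in uint64_t so the double-limb
product stays portable (illustration only, not GMP code):

    #include <stdint.h>

    static uint32_t addmul_1_model (uint32_t *rp, const uint32_t *up,
                                    long n, uint32_t v0)
    {
      uint64_t carry = 0;
      for (long i = 0; i < n; i++)
        {
          /* product + two limbs never exceeds 2^64 - 1, so this is exact */
          uint64_t t = (uint64_t) up[i] * v0 + rp[i] + carry;
          rp[i] = (uint32_t) t;    /* low limb of the running sum */
          carry = t >> 32;         /* high limb carries into the next step */
        }
      return (uint32_t) carry;
    }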
diff --git a/third_party/gmp/mpn/generic/bdiv_dbm1c.c b/third_party/gmp/mpn/generic/bdiv_dbm1c.c
new file mode 100644
index 0000000..543bb6e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/bdiv_dbm1c.c
@@ -0,0 +1,58 @@
+/* mpn_bdiv_dbm1c -- divide an mpn number by a divisor of B-1, where B is the
+   limb base.  The dbm1c moniker means "Divisor of B Minus 1 with Carry".
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+mp_limb_t
+mpn_bdiv_dbm1c (mp_ptr qp, mp_srcptr ap, mp_size_t n, mp_limb_t bd, mp_limb_t h)
+{
+  mp_limb_t a, p0, p1, cy;
+  mp_size_t i;
+
+  for (i = 0; i < n; i++)
+    {
+      a = ap[i];
+      umul_ppmm (p1, p0, a, bd << GMP_NAIL_BITS);
+      p0 >>= GMP_NAIL_BITS;
+      cy = h < p0;
+      h = (h - p0) & GMP_NUMB_MASK;
+      qp[i] = h;
+      h = h - p1 - cy;
+    }
+
+  return h;
+}
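
The trick behind divisors of B-1 is the base-B analogue of casting out nines:
B mod d = 1 for any d dividing B-1, so a multi-limb number is congruent mod d
to the plain sum of its limbs.  A minimal check with B = 2^64 and d = 3
(3 divides 2^64 - 1); illustration only, not GMP code:

    #include <stdio.h>
    #include <stdint.h>

    int main (void)
    {
      /* x = x1 * 2^64 + x0 */
      uint64_t x1 = 123456789, x0 = 987654321;
      unsigned d = 3;                   /* 2^64 mod 3 == 1 */
      /* x mod 3 equals (x1 + x0) mod 3, limb-sum style */
      unsigned r = (unsigned) ((x1 % d + x0 % d) % d);
      printf ("x mod %u = %u\n", d, r);
      return 0;
    }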
diff --git a/third_party/gmp/mpn/generic/bdiv_q.c b/third_party/gmp/mpn/generic/bdiv_q.c
new file mode 100644
index 0000000..52aa473
--- /dev/null
+++ b/third_party/gmp/mpn/generic/bdiv_q.c
@@ -0,0 +1,76 @@
+/* mpn_bdiv_q -- Hensel division with precomputed inverse, returning quotient.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Computes Q = N / D mod B^n. */
+
+void
+mpn_bdiv_q (mp_ptr qp,
+	    mp_srcptr np, mp_size_t nn,
+	    mp_srcptr dp, mp_size_t dn,
+	    mp_ptr tp)
+{
+  mp_limb_t di;
+
+  if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD))
+    {
+      MPN_COPY (tp, np, nn);
+      binvert_limb (di, dp[0]);  di = -di;
+      mpn_sbpi1_bdiv_q (qp, tp, nn, dp, dn, di);
+    }
+  else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
+    {
+      MPN_COPY (tp, np, nn);
+      binvert_limb (di, dp[0]);  di = -di;
+      mpn_dcpi1_bdiv_q (qp, tp, nn, dp, dn, di);
+    }
+  else
+    {
+      mpn_mu_bdiv_q (qp, np, nn, dp, dn, tp);
+    }
+  return;
+}
+
+mp_size_t
+mpn_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
+{
+  if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
+    return nn;
+  else
+    return mpn_mu_bdiv_q_itch (nn, dn);
+}
diff --git a/third_party/gmp/mpn/generic/bdiv_q_1.c b/third_party/gmp/mpn/generic/bdiv_q_1.c
new file mode 100644
index 0000000..6beb9a0
--- /dev/null
+++ b/third_party/gmp/mpn/generic/bdiv_q_1.c
@@ -0,0 +1,121 @@
+/* mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by 1-limb
+   divisor, returning quotient only.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003, 2005, 2009, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_pi1_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d,
+		  mp_limb_t di, int shift)
+{
+  mp_size_t  i;
+  mp_limb_t  c, h, l, u, u_next, dummy;
+
+  ASSERT (n >= 1);
+  ASSERT (d != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (d);
+
+  d <<= GMP_NAIL_BITS;
+
+  if (shift != 0)
+    {
+      c = 0;
+
+      u = up[0];
+      rp--;
+      for (i = 1; i < n; i++)
+	{
+	  u_next = up[i];
+	  u = ((u >> shift) | (u_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
+
+	  SUBC_LIMB (c, l, u, c);
+
+	  l = (l * di) & GMP_NUMB_MASK;
+	  rp[i] = l;
+
+	  umul_ppmm (h, dummy, l, d);
+	  c += h;
+	  u = u_next;
+	}
+
+      u = u >> shift;
+      SUBC_LIMB (c, l, u, c);
+
+      l = (l * di) & GMP_NUMB_MASK;
+      rp[n] = l;
+    }
+  else
+    {
+      u = up[0];
+      l = (u * di) & GMP_NUMB_MASK;
+      rp[0] = l;
+      c = 0;
+
+      for (i = 1; i < n; i++)
+	{
+	  umul_ppmm (h, dummy, l, d);
+	  c += h;
+
+	  u = up[i];
+	  SUBC_LIMB (c, l, u, c);
+
+	  l = (l * di) & GMP_NUMB_MASK;
+	  rp[i] = l;
+	}
+    }
+
+  return c;
+}
+
+mp_limb_t
+mpn_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d)
+{
+  mp_limb_t di;
+  int shift;
+
+  ASSERT (n >= 1);
+  ASSERT (d != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (d);
+
+  count_trailing_zeros (shift, d);
+  d >>= shift;
+
+  binvert_limb (di, d);
+  return mpn_pi1_bdiv_q_1 (rp, up, n, d, di, shift);
+}
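
The core idea of Hensel (exact) division: when d is odd and divides u
exactly, the quotient mod B is u * d^{-1} mod B, so division becomes one
multiply per limb.  A single-limb sketch, with the inverse found by the same
Newton step binvert_limb uses (x' = x(2 - dx), doubling the correct low bits
each pass); illustrative, not GMP code:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t binvert64 (uint64_t d)   /* d must be odd */
    {
      uint64_t x = d;              /* d*d == 1 (mod 8): 3 correct bits */
      for (int i = 0; i < 5; i++)  /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
        x *= 2 - d * x;
      return x;
    }

    int main (void)
    {
      uint64_t d = 12345;                  /* odd divisor */
      uint64_t q0 = 998877;
      uint64_t u = q0 * d;                 /* exactly divisible by d */
      uint64_t q = u * binvert64 (d);      /* u / d mod 2^64 */
      printf ("%llu\n", (unsigned long long) q);   /* prints 998877 */
      return 0;
    }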
diff --git a/third_party/gmp/mpn/generic/bdiv_qr.c b/third_party/gmp/mpn/generic/bdiv_qr.c
new file mode 100644
index 0000000..a4f0f39
--- /dev/null
+++ b/third_party/gmp/mpn/generic/bdiv_qr.c
@@ -0,0 +1,84 @@
+/* mpn_bdiv_qr -- Hensel division with precomputed inverse, returning quotient
+   and remainder.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Computes Q = N / D mod B^n,
+	    R = N - QD.  */
+
+mp_limb_t
+mpn_bdiv_qr (mp_ptr qp, mp_ptr rp,
+	     mp_srcptr np, mp_size_t nn,
+	     mp_srcptr dp, mp_size_t dn,
+	     mp_ptr tp)
+{
+  mp_limb_t di;
+  mp_limb_t rh;
+
+  ASSERT (nn > dn);
+  if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) ||
+      BELOW_THRESHOLD (nn - dn, DC_BDIV_QR_THRESHOLD))
+    {
+      MPN_COPY (tp, np, nn);
+      binvert_limb (di, dp[0]);  di = -di;
+      rh = mpn_sbpi1_bdiv_qr (qp, tp, nn, dp, dn, di);
+      MPN_COPY (rp, tp + nn - dn, dn);
+    }
+  else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD))
+    {
+      MPN_COPY (tp, np, nn);
+      binvert_limb (di, dp[0]);  di = -di;
+      rh = mpn_dcpi1_bdiv_qr (qp, tp, nn, dp, dn, di);
+      MPN_COPY (rp, tp + nn - dn, dn);
+    }
+  else
+    {
+      rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, tp);
+    }
+
+  return rh;
+}
+
+mp_size_t
+mpn_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
+{
+  if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD))
+    return nn;
+  else
+    return  mpn_mu_bdiv_qr_itch (nn, dn);
+}
diff --git a/third_party/gmp/mpn/generic/binvert.c b/third_party/gmp/mpn/generic/binvert.c
new file mode 100644
index 0000000..6c24ab7
--- /dev/null
+++ b/third_party/gmp/mpn/generic/binvert.c
@@ -0,0 +1,103 @@
+/* Compute {up,n}^(-1) mod B^n.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright (C) 2004-2007, 2009, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/*
+  r[k+1] = r[k] - r[k] * (u*r[k] - 1)
+  r[k+1] = r[k] + r[k] - r[k]*(u*r[k])
+*/
+
+#if TUNE_PROGRAM_BUILD
+#define NPOWS \
+ ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
+#else
+#define NPOWS \
+ ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (BINV_NEWTON_THRESHOLD))
+#endif
+
+mp_size_t
+mpn_binvert_itch (mp_size_t n)
+{
+  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (n);
+  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, n, (n + 1) >> 1);
+  return itch_local + itch_out;
+}
+
+void
+mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch)
+{
+  mp_ptr xp;
+  mp_size_t rn, newrn;
+  mp_size_t sizes[NPOWS], *sizp;
+  mp_limb_t di;
+
+  /* Compute the computation precisions from highest to lowest, leaving the
+     base case size in 'rn'.  */
+  sizp = sizes;
+  for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1)
+    *sizp++ = rn;
+
+  xp = scratch;
+
+  /* Compute a base value of rn limbs.  */
+  MPN_ZERO (xp, rn);
+  xp[0] = 1;
+  binvert_limb (di, up[0]);
+  if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD))
+    mpn_sbpi1_bdiv_q (rp, xp, rn, up, rn, -di);
+  else
+    mpn_dcpi1_bdiv_q (rp, xp, rn, up, rn, -di);
+
+  mpn_neg (rp, rp, rn);
+
+  /* Use Newton iterations to get the desired precision.  */
+  for (; rn < n; rn = newrn)
+    {
+      mp_size_t m;
+      newrn = *--sizp;
+
+      /* X <- UR. */
+      m = mpn_mulmod_bnm1_next_size (newrn);
+      mpn_mulmod_bnm1 (xp, m, up, newrn, rp, rn, xp + m);
+      mpn_sub_1 (xp + m, xp, rn - (m - newrn), 1);
+
+      /* R = R(X/B^rn) */
+      mpn_mullo_n (rp + rn, rp, xp + rn, newrn - rn);
+      mpn_neg (rp + rn, rp + rn, newrn - rn);
+    }
+}
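
Why one step of the iteration at the top of the file doubles the precision:
if u*r is 1 with error e at precision B^k, the update squares the error (a
standard Hensel/Newton argument, not taken from the source):

    \begin{aligned}
    u r  &= 1 + e B^k \\
    r'   &= r - r\,(u r - 1) = r\,(2 - u r) \\
    u r' &= u r\,(2 - u r) = (1 + e B^k)(1 - e B^k)
          = 1 - e^2 B^{2k} \equiv 1 \pmod{B^{2k}}
    \end{aligned}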
diff --git a/third_party/gmp/mpn/generic/broot.c b/third_party/gmp/mpn/generic/broot.c
new file mode 100644
index 0000000..02fe75a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/broot.c
@@ -0,0 +1,195 @@
+/* mpn_broot -- Compute hensel sqrt
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Computes a^e (mod B). Uses right-to-left binary algorithm, since
+   typical use will have e small. */
+static mp_limb_t
+powlimb (mp_limb_t a, mp_limb_t e)
+{
+  mp_limb_t r, s;
+
+  for (r = 1, s = a; e > 0; e >>= 1, s *= s)
+    if (e & 1)
+      r *= s;
+
+  return r;
+}
+
+/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd.
+
+   Iterates
+
+     r' <-- r - r * (a^{k-1} r^k - 1) / n
+
+   If
+
+     a^{k-1} r^k = 1 (mod 2^m),
+
+   then
+
+     a^{k-1} r'^k = 1 (mod 2^{2m}),
+
+   Compute the update term as
+
+     r' = r - (a^{k-1} r^{k+1} - r) / k
+
+   where we still have cancellation of low limbs.
+
+ */
+void
+mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
+{
+  mp_size_t sizes[GMP_LIMB_BITS * 2];
+  mp_ptr akm1, tp, rnp, ep;
+  mp_limb_t a0, r0, km1, kp1h, kinv;
+  mp_size_t rn;
+  unsigned i;
+
+  TMP_DECL;
+
+  ASSERT (n > 0);
+  ASSERT (ap[0] & 1);
+  ASSERT (k & 1);
+  ASSERT (k >= 3);
+
+  TMP_MARK;
+
+  akm1 = TMP_ALLOC_LIMBS (4*n);
+  tp = akm1 + n;
+
+  km1 = k-1;
+  /* FIXME: Could arrange the iteration so we don't need to compute
+     this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note
+     that we can use wraparound also for a*r, since the low half is
+     unchanged from the previous iteration. Or possibly mulmid. Also,
+     a r = a^{1/k}, so we get that value too, for free? */
+  mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */
+
+  a0 = ap[0];
+  binvert_limb (kinv, k);
+
+  /* 4 bits: a^{1/k - 1} (mod 16):
+
+	a % 8
+	1 3 5 7
+   k%4 +-------
+     1 |1 1 1 1
+     3 |1 9 9 1
+  */
+  r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8);
+  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */
+  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */
+  r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */
+#if GMP_NUMB_BITS > 32
+  {
+    unsigned prec = 32;
+    do
+      {
+	r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k));
+	prec *= 2;
+      }
+    while (prec < GMP_NUMB_BITS);
+  }
+#endif
+
+  rp[0] = r0;
+  if (n == 1)
+    {
+      TMP_FREE;
+      return;
+    }
+
+  /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */
+  kp1h = k/2 + 1;
+
+  /* FIXME: Special case for two limb iteration. */
+  rnp = TMP_ALLOC_LIMBS (2*n + 1);
+  ep = rnp + n;
+
+  /* FIXME: Possible to do this on the fly with some bit fiddling. */
+  for (i = 0; n > 1; n = (n + 1)/2)
+    sizes[i++] = n;
+
+  rn = 1;
+
+  while (i-- > 0)
+    {
+      /* Compute x^{k+1}. */
+      mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the
+			       final iteration. */
+      mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp);
+
+      /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */
+
+      mpn_mullo_n (ep, rnp, akm1, sizes[i]);
+      ASSERT (mpn_cmp (ep, rp, rn) == 0);
+
+      ASSERT (sizes[i] <= 2*rn);
+      mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0);
+      mpn_neg (rp + rn, rp + rn, sizes[i] - rn);
+      rn = sizes[i];
+    }
+  TMP_FREE;
+}
+
+/* Computes a^{1/k} (mod B^n). Both a and k must be odd. */
+void
+mpn_broot (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k)
+{
+  mp_ptr tp;
+  TMP_DECL;
+
+  ASSERT (n > 0);
+  ASSERT (ap[0] & 1);
+  ASSERT (k & 1);
+
+  if (k == 1)
+    {
+      MPN_COPY (rp, ap, n);
+      return;
+    }
+
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS (n);
+
+  mpn_broot_invm1 (tp, ap, n, k);
+  mpn_mullo_n (rp, tp, ap, n);
+
+  TMP_FREE;
+}
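
A single-limb illustration of the fact the routine relies on: for odd k, the
map x -> x^k is a bijection on odd residues mod 2^64, so a unique odd k-th
root exists, and the inverse map is powering by k^{-1} mod 2^62 (the odd part
of the group order).  Sketch only, assuming 64-bit arithmetic; not GMP code:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t powlimb64 (uint64_t a, uint64_t e)
    {
      uint64_t r = 1, s = a;      /* right-to-left binary, as powlimb above */
      for (; e > 0; e >>= 1, s *= s)
        if (e & 1)
          r *= s;
      return r;
    }

    int main (void)
    {
      uint64_t k = 7, a = 1000003;        /* both odd */
      uint64_t kinv = k;                  /* k^{-1} mod 2^62, Newton style */
      for (int i = 0; i < 5; i++)
        kinv *= 2 - k * kinv;
      kinv &= (1ull << 62) - 1;
      uint64_t r = powlimb64 (a, kinv);   /* candidate k-th root of a */
      printf ("r^k == a: %d\n", powlimb64 (r, k) == a);   /* prints 1 */
      return 0;
    }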
diff --git a/third_party/gmp/mpn/generic/brootinv.c b/third_party/gmp/mpn/generic/brootinv.c
new file mode 100644
index 0000000..e91b597
--- /dev/null
+++ b/third_party/gmp/mpn/generic/brootinv.c
@@ -0,0 +1,159 @@
+/* mpn_brootinv, compute r such that r^k * y = 1 (mod 2^b).
+
+   Contributed to the GNU project by Martin Boij (as part of perfpow.c).
+
+Copyright 2009, 2010, 2012, 2013, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Computes a^(2e) (mod B).  Uses the right-to-left binary algorithm, since
+   typical use will have e small. */
+static mp_limb_t
+powsquaredlimb (mp_limb_t a, mp_limb_t e)
+{
+  mp_limb_t r;
+
+  r = 1;
+  /* if (LIKELY (e != 0)) */
+  do {
+    a *= a;
+    if (e & 1)
+      r *= a;
+    e >>= 1;
+  } while (e != 0);
+
+  return r;
+}
+
+/* Compute r such that r^k * y = 1 (mod B^n).
+
+   Iterates
+     r' <-- k^{-1} ((k+1) r - r^{k+1} y) (mod 2^b)
+   using Hensel lifting, each time doubling the number of known bits in r.
+
+   Works only for odd k; otherwise the Hensel lifting degenerates.
+
+   FIXME:
+
+     (1) Make it work for k == GMP_LIMB_MAX (k+1 below overflows).
+
+     (2) Rewrite iteration as
+	   r' <-- r - k^{-1} r (r^k y - 1)
+	 and take advantage of the zero low part of r^k y - 1.
+
+     (3) Use wrap-around trick.
+
+     (4) Use a small table to get starting value.
+
+   Scratch need: bn + (((bn + 1) >> 1) + 1) + scratch for mpn_powlo
+   Currently mpn_powlo requires 3*bn
+   so that 5*bn is surely enough, where bn = ceil (bnb / GMP_NUMB_BITS).
+*/
+
+void
+mpn_brootinv (mp_ptr rp, mp_srcptr yp, mp_size_t bn, mp_limb_t k, mp_ptr tp)
+{
+  mp_ptr tp2, tp3;
+  mp_limb_t kinv, k2, r0, y0;
+  mp_size_t order[GMP_LIMB_BITS + 1];
+  int d;
+
+  ASSERT (bn > 0);
+  ASSERT ((k & 1) != 0);
+
+  tp2 = tp + bn;
+  tp3 = tp + bn + ((bn + 3) >> 1);
+  k2 = (k >> 1) + 1; /* (k + 1) / 2 , but avoid k+1 overflow */
+
+  binvert_limb (kinv, k);
+
+  /* 4-bit initial approximation:
+
+   y%16 | 1  3  5  7  9 11 13 15,
+    k%4 +-------------------------+k2%2
+     1  | 1 11 13  7  9  3  5 15  |  1
+     3  | 1  3  5  7  9 11 13 15  |  0
+
+  */
+  y0 = yp[0];
+
+  r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 3) & 8);			/* 4 bits */
+  r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3f));	/* 8 bits */
+  r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3fff));	/* 16 bits */
+#if GMP_NUMB_BITS > 16
+  {
+    unsigned prec = 16;
+    do
+      {
+	r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2));
+	prec *= 2;
+      }
+    while (prec < GMP_NUMB_BITS);
+  }
+#endif
+
+  rp[0] = r0;
+  if (bn == 1)
+    return;
+
+  d = 0;
+  for (; bn != 2; bn = (bn + 1) >> 1)
+    order[d++] = bn;
+
+  order[d] = 2;
+  bn = 1;
+
+  do
+    {
+      mpn_sqr (tp, rp, bn); /* Result may overlap tp2 */
+      tp2[bn] = mpn_mul_1 (tp2, rp, bn, k2 << 1);
+
+      bn = order[d];
+
+      mpn_powlo (rp, tp, &k2, 1, bn, tp3);
+      mpn_mullo_n (tp, yp, rp, bn);
+
+      /* mpn_sub (tp, tp2, ((bn + 1) >> 1) + 1, tp, bn); */
+      /* The call above would be invalid: mpn_sub requires its first operand
+	 to be at least as long as its second, but ((bn + 1) >> 1) + 1 <= bn.  */
+      {
+	mp_size_t pbn = (bn + 3) >> 1; /* Size of tp2 */
+	int borrow;
+	borrow = mpn_sub_n (tp, tp2, tp, pbn) != 0;
+	if (bn > pbn) /* 3 < bn */
+	  {
+	    if (borrow)
+	      mpn_com (tp + pbn, tp + pbn, bn - pbn);
+	    else
+	      mpn_neg (tp + pbn, tp + pbn, bn - pbn);
+	  }
+      }
+      mpn_pi1_bdiv_q_1 (rp, tp, bn, k, kinv, 0);
+    }
+  while (--d >= 0);
+}
diff --git a/third_party/gmp/mpn/generic/bsqrt.c b/third_party/gmp/mpn/generic/bsqrt.c
new file mode 100644
index 0000000..27184f0
--- /dev/null
+++ b/third_party/gmp/mpn/generic/bsqrt.c
@@ -0,0 +1,47 @@
+/* mpn_bsqrt, a^{1/2} (mod 2^n).
+
+Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+void
+mpn_bsqrt (mp_ptr rp, mp_srcptr ap, mp_bitcnt_t nb, mp_ptr tp)
+{
+  mp_ptr sp;
+  mp_size_t n;
+
+  ASSERT (nb > 0);
+
+  n = nb / GMP_NUMB_BITS;
+  sp = tp + n;
+
+  mpn_bsqrtinv (tp, ap, nb, sp);
+  mpn_mullo_n (rp, tp, ap, n);
+}
diff --git a/third_party/gmp/mpn/generic/bsqrtinv.c b/third_party/gmp/mpn/generic/bsqrtinv.c
new file mode 100644
index 0000000..c286773
--- /dev/null
+++ b/third_party/gmp/mpn/generic/bsqrtinv.c
@@ -0,0 +1,103 @@
+/* mpn_bsqrtinv, compute r such that r^2 * y = 1 (mod 2^{b+1}).
+
+   Contributed to the GNU project by Martin Boij (as part of perfpow.c).
+
+Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Compute r such that r^2 * y = 1 (mod 2^{b+1}).
+   Return non-zero if such an integer r exists.
+
+   Iterates
+     r' <-- (3r - r^3 y) / 2
+   using Hensel lifting.  Since we divide by two, the Hensel lifting is
+   somewhat degenerate.  Therefore, we lift from 2^b to 2^{b+1}-1.
+
+   FIXME:
+     (1) Simplify to do precision book-keeping in limbs rather than bits.
+
+     (2) Rewrite iteration as
+	   r' <-- r - r (r^2 y - 1) / 2
+	 and take advantage of zero low part of r^2 y - 1.
+
+     (3) Use wrap-around trick.
+
+     (4) Use a small table to get starting value.
+*/
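+/* Illustrative example (an editor's sketch, not from the GMP sources):
+   take y = 17.  Starting from r = 1, which is valid mod 8 since
+   17 = 1 (mod 8), one iteration gives r' = (3*1 - 1*17)/2 = -7, and
+   indeed (-7)^2 * 17 = 833 = 13*64 + 1 = 1 (mod 2^6).  */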
+int
+mpn_bsqrtinv (mp_ptr rp, mp_srcptr yp, mp_bitcnt_t bnb, mp_ptr tp)
+{
+  mp_ptr tp2;
+  mp_size_t bn, order[GMP_LIMB_BITS + 1];
+  int i, d;
+
+  ASSERT (bnb > 0);
+
+  bn = 1 + bnb / GMP_LIMB_BITS;
+
+  tp2 = tp + bn;
+
+  rp[0] = 1;
+  if (bnb == 1)
+    {
+      if ((yp[0] & 3) != 1)
+	return 0;
+    }
+  else
+    {
+      if ((yp[0] & 7) != 1)
+	return 0;
+
+      d = 0;
+      for (; bnb != 2; bnb = (bnb + 2) >> 1)
+	order[d++] = bnb;
+
+      for (i = d - 1; i >= 0; i--)
+	{
+	  bnb = order[i];
+	  bn = 1 + bnb / GMP_LIMB_BITS;
+
+	  mpn_sqrlo (tp, rp, bn);
+	  mpn_mullo_n (tp2, rp, tp, bn); /* tp2 <- rp ^ 3 */
+
+	  mpn_mul_1 (tp, rp, bn, 3);
+
+	  mpn_mullo_n (rp, yp, tp2, bn);
+
+#if HAVE_NATIVE_mpn_rsh1sub_n
+	  mpn_rsh1sub_n (rp, tp, rp, bn);
+#else
+	  mpn_sub_n (tp2, tp, rp, bn);
+	  mpn_rshift (rp, tp2, bn, 1);
+#endif
+	}
+    }
+  return 1;
+}
diff --git a/third_party/gmp/mpn/generic/cmp.c b/third_party/gmp/mpn/generic/cmp.c
new file mode 100644
index 0000000..940314b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/cmp.c
@@ -0,0 +1,33 @@
+/* mpn_cmp -- Compare two low-level natural-number integers.
+
+Copyright 1991, 1993, 1994, 1996, 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_FORCE_mpn_cmp 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/generic/cnd_add_n.c b/third_party/gmp/mpn/generic/cnd_add_n.c
new file mode 100644
index 0000000..e6b1373
--- /dev/null
+++ b/third_party/gmp/mpn/generic/cnd_add_n.c
@@ -0,0 +1,69 @@
+/* mpn_cnd_add_n -- Compute R = U + V if CND != 0 or R = U if CND == 0.
+   Both cases should take the same time and perform the exact same memory
+   accesses, since this function is intended to be used where side-channel
+   attack resilience is relevant.
+
+Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+
+  mask = -(mp_limb_t) (cnd != 0);
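+  /* mask is all ones when cnd is nonzero and all zeros otherwise, so the
+     masked vl below is either V's limb or 0; every limb of V is read in
+     both cases, keeping the memory access pattern independent of cnd.  */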
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++ & mask;
+#if GMP_NAIL_BITS == 0
+      sl = ul + vl;
+      cy1 = sl < ul;
+      rl = sl + cy;
+      cy2 = rl < sl;
+      cy = cy1 | cy2;
+      *rp++ = rl;
+#else
+      rl = ul + vl;
+      rl += cy;
+      cy = rl >> GMP_NUMB_BITS;
+      *rp++ = rl & GMP_NUMB_MASK;
+#endif
+    }
+  while (--n != 0);
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/cnd_sub_n.c b/third_party/gmp/mpn/generic/cnd_sub_n.c
new file mode 100644
index 0000000..d04ad8a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/cnd_sub_n.c
@@ -0,0 +1,69 @@
+/* mpn_cnd_sub_n -- Compute R = U - V if CND != 0 or R = U if CND == 0.
+   Both cases should take the same time and perform the exact same memory
+   accesses, since this function is intended to be used where side-channel
+   attack resilience is relevant.
+
+Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_cnd_sub_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+
+  mask = -(mp_limb_t) (cnd != 0);
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++ & mask;
+#if GMP_NAIL_BITS == 0
+      sl = ul - vl;
+      cy1 = sl > ul;
+      rl = sl - cy;
+      cy2 = rl > sl;
+      cy = cy1 | cy2;
+      *rp++ = rl;
+#else
+      rl = ul - vl;
+      rl -= cy;
+      cy = rl >> (GMP_LIMB_BITS - 1);
+      *rp++ = rl & GMP_NUMB_MASK;
+#endif
+    }
+  while (--n != 0);
+
+  return cy;
+}
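+
+/* Illustrative usage sketch (an editor's note, not from the GMP sources):
+   constant-time modular addition r = (a + b) mod m for n-limb a, b < m,
+   with tp an n-limb scratch area used only for its borrow:
+
+     cy = mpn_add_n (rp, ap, bp, n);               (r = a + b, carry out)
+     bw = mpn_sub_n (tp, rp, mp, n);               (probe r - m for borrow)
+     mpn_cnd_sub_n (cy | (bw ^ 1), rp, rp, mp, n); (subtract m iff r >= m)
+
+   The same instructions execute whether or not the subtraction takes
+   effect.  */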
diff --git a/third_party/gmp/mpn/generic/cnd_swap.c b/third_party/gmp/mpn/generic/cnd_swap.c
new file mode 100644
index 0000000..83d856d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/cnd_swap.c
@@ -0,0 +1,50 @@
+/* mpn_cnd_swap
+
+   Contributed to the GNU project by Niels Möller
+
+Copyright 2013, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_cnd_swap (mp_limb_t cnd, volatile mp_limb_t *ap, volatile mp_limb_t *bp,
+	      mp_size_t n)
+{
+  volatile mp_limb_t mask = - (mp_limb_t) (cnd != 0);
+  mp_size_t i;
+  for (i = 0; i < n; i++)
+    {
+      mp_limb_t a, b, t;
+      a = ap[i];
+      b = bp[i];
+      t = (a ^ b) & mask;
+      ap[i] = a ^ t;
+      bp[i] = b ^ t;
+    }
+}
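+
+/* Editor's note (not from the GMP sources): a typical use is the
+   conditional point swap in a Montgomery-ladder style loop where the
+   condition is a secret bit; the XOR-mask form reads and writes both
+   operands identically whether or not the swap takes effect.  */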
diff --git a/third_party/gmp/mpn/generic/com.c b/third_party/gmp/mpn/generic/com.c
new file mode 100644
index 0000000..4de5824
--- /dev/null
+++ b/third_party/gmp/mpn/generic/com.c
@@ -0,0 +1,44 @@
+/* mpn_com - complement an mpn.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef mpn_com
+#define mpn_com __MPN(com)
+
+void
+mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_limb_t ul;
+  do {
+      ul = *up++;
+      *rp++ = ~ul & GMP_NUMB_MASK;
+  } while (--n != 0);
+}
diff --git a/third_party/gmp/mpn/generic/comb_tables.c b/third_party/gmp/mpn/generic/comb_tables.c
new file mode 100644
index 0000000..dedb77b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/comb_tables.c
@@ -0,0 +1,47 @@
+/* Const tables shared among combinatoric functions.
+
+   THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO
+   BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Entry i contains (i!/2^t) where t is chosen such that the parenthesis
+   is an odd integer. */
+const mp_limb_t __gmp_oddfac_table[] = { ONE_LIMB_ODD_FACTORIAL_TABLE, ONE_LIMB_ODD_FACTORIAL_EXTTABLE };
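+/* Illustrative example (an editor's note, not from the GMP sources):
+   entry 8 is 8!/2^7 = 40320/128 = 315, since 2^7 is the largest power
+   of two dividing 8!.  */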
+
+/* Entry i contains ((2i+1)!!/2^t) where t is chosen such that the parenthesis
+   is an odd integer. */
+const mp_limb_t __gmp_odd2fac_table[] = { ONE_LIMB_ODD_DOUBLEFACTORIAL_TABLE };
+
+/* Entry i contains 2i-popc(2i). */
+const unsigned char __gmp_fac2cnt_table[] = { TABLE_2N_MINUS_POPC_2N };
+
+const mp_limb_t __gmp_limbroots_table[] = { NTH_ROOT_NUMB_MASK_TABLE };
diff --git a/third_party/gmp/mpn/generic/compute_powtab.c b/third_party/gmp/mpn/generic/compute_powtab.c
new file mode 100644
index 0000000..f4fbc64
--- /dev/null
+++ b/third_party/gmp/mpn/generic/compute_powtab.c
@@ -0,0 +1,373 @@
+/* mpn_compute_powtab.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+  CAVEATS:
+  * The exptab and powtab vectors are in opposite orders.  Probably OK.
+  * Consider getting rid of exptab, doing bit ops on the un argument instead.
+  * Consider rounding greatest power slightly upwards to save adjustments.
+  * In powtab_decide, consider computing cost from just the 2-3 largest
+    operands, since smaller operand contribute little.  This makes most sense
+    if exptab is suppressed.
+*/
+
+#include "gmp-impl.h"
+
+#ifndef DIV_1_VS_MUL_1_PERCENT
+#define DIV_1_VS_MUL_1_PERCENT 150
+#endif
+
+#define SET_powers_t(dest, ptr, size, dib, b, sh)	\
+  do {							\
+    dest.p = ptr;					\
+    dest.n = size;					\
+    dest.digits_in_base = dib;				\
+    dest.base = b;					\
+    dest.shift = sh;					\
+  } while (0)
+
+#if DIV_1_VS_MUL_1_PERCENT > 120
+#define HAVE_mpn_compute_powtab_mul 1
+static void
+mpn_compute_powtab_mul (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un,
+			int base, const size_t *exptab, size_t n_pows)
+{
+  mp_size_t n;
+  mp_ptr p, t;
+  mp_limb_t cy;
+  long start_idx;
+  int c;
+
+  mp_limb_t big_base = mp_bases[base].big_base;
+  int chars_per_limb = mp_bases[base].chars_per_limb;
+
+  mp_ptr powtab_mem_ptr = powtab_mem;
+
+  size_t digits_in_base = chars_per_limb;
+
+  powers_t *pt = powtab;
+
+  p = powtab_mem_ptr;
+  powtab_mem_ptr += 1;
+  p[0] = big_base;
+
+  SET_powers_t (pt[0], p, 1, digits_in_base, base, 0);
+  pt++;
+
+  t = powtab_mem_ptr;
+  powtab_mem_ptr += 2;
+  t[1] = mpn_mul_1 (t, p, 1, big_base);
+  n = 2;
+
+  digits_in_base *= 2;
+
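+  /* A zero low limb is stripped and recorded in shift; the represented
+     power is then {t,n} * B^shift.  */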
+  c = t[0] == 0;
+  t += c;
+  n -= c;
+  mp_size_t shift = c;
+
+  SET_powers_t (pt[0], t, n, digits_in_base, base, shift);
+  p = t;
+  pt++;
+
+  if (exptab[0] == ((size_t) chars_per_limb << n_pows))
+    {
+      start_idx = n_pows - 2;
+    }
+  else
+    {
+      if (((digits_in_base + chars_per_limb) << (n_pows-2)) <= exptab[0])
+	{
+	  /* 3, sometimes adjusted to 4.  */
+	  t = powtab_mem_ptr;
+	  powtab_mem_ptr += 4;
+	  t[n] = cy = mpn_mul_1 (t, p, n, big_base);
+	  n += cy != 0;
+
+	  digits_in_base += chars_per_limb;
+
+	  c  = t[0] == 0;
+	  t += c;
+	  n -= c;
+	  shift += c;
+	}
+      else
+	{
+	  /* 2 copy, will always become 3 with back-multiplication.  */
+	  t = powtab_mem_ptr;
+	  powtab_mem_ptr += 3;
+	  t[0] = p[0];
+	  t[1] = p[1];
+	}
+
+      SET_powers_t (pt[0], t, n, digits_in_base, base, shift);
+      p = t;
+      pt++;
+      start_idx = n_pows - 3;
+    }
+
+  for (long pi = start_idx; pi >= 0; pi--)
+    {
+      t = powtab_mem_ptr;
+      powtab_mem_ptr += 2 * n + 2;
+
+      ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un));
+
+      mpn_sqr (t, p, n);
+
+      digits_in_base *= 2;
+      n *= 2;
+      n -= t[n - 1] == 0;
+      shift *= 2;
+
+      c = t[0] == 0;
+      t += c;
+      n -= c;
+      shift += c;
+
+      /* Adjust new value if it is too small as input to the next squaring.  */
+      if (((digits_in_base + chars_per_limb) << pi) <= exptab[0])
+	{
+	  t[n] = cy = mpn_mul_1 (t, t, n, big_base);
+	  n += cy != 0;
+
+	  digits_in_base += chars_per_limb;
+
+	  c  = t[0] == 0;
+	  t += c;
+	  n -= c;
+	  shift += c;
+	}
+
+      SET_powers_t (pt[0], t, n, digits_in_base, base, shift);
+
+      /* Adjust previous value if it is not at its target power.  */
+      if (pt[-1].digits_in_base < exptab[pi + 1])
+	{
+	  mp_size_t n = pt[-1].n;
+	  mp_ptr p = pt[-1].p;
+	  p[n] = cy = mpn_mul_1 (p, p, n, big_base);
+	  n += cy != 0;
+
+	  ASSERT (pt[-1].digits_in_base + chars_per_limb == exptab[pi + 1]);
+	  pt[-1].digits_in_base = exptab[pi + 1];
+
+	  c = p[0] == 0;
+	  pt[-1].p = p + c;
+	  pt[-1].n = n - c;
+	  pt[-1].shift += c;
+	}
+
+      p = t;
+      pt++;
+    }
+}
+#endif
+
+#if DIV_1_VS_MUL_1_PERCENT < 275
+#define HAVE_mpn_compute_powtab_div 1
+static void
+mpn_compute_powtab_div (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un,
+			int base, const size_t *exptab, size_t n_pows)
+{
+  mp_ptr p, t;
+
+  mp_limb_t big_base = mp_bases[base].big_base;
+  int chars_per_limb = mp_bases[base].chars_per_limb;
+
+  mp_ptr powtab_mem_ptr = powtab_mem;
+
+  size_t digits_in_base = chars_per_limb;
+
+  powers_t *pt = powtab;
+
+  p = powtab_mem_ptr;
+  powtab_mem_ptr += 1;
+  p[0] = big_base;
+
+  SET_powers_t (pt[0], p, 1, digits_in_base, base, 0);
+  pt++;
+
+  mp_size_t n = 1;
+  mp_size_t shift = 0;
+  for (long pi = n_pows - 1; pi >= 0; pi--)
+    {
+      t = powtab_mem_ptr;
+      powtab_mem_ptr += 2 * n;
+
+      ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un));
+
+      mpn_sqr (t, p, n);
+      n = 2 * n - 1; n += t[n] != 0;
+      digits_in_base *= 2;
+
+      if (digits_in_base != exptab[pi])	/* if ((((un - 1) >> pi) & 2) == 0) */
+	{
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1
+	  if (__GMP_LIKELY (base == 10))
+	    mpn_pi1_bdiv_q_1 (t, t, n, big_base >> MP_BASES_BIG_BASE_CTZ_10,
+			      MP_BASES_BIG_BASE_BINVERTED_10,
+			      MP_BASES_BIG_BASE_CTZ_10);
+	  else
+#endif
+	    /* FIXME: We could use _pi1 here if we add big_base_binverted and
+	       big_base_ctz fields to struct bases.  That would add about 2 KiB
+	       to mp_bases.c.
+	       FIXME: Use mpn_bdiv_q_1 here when mpn_divexact_1 is converted to
+	       mpn_bdiv_q_1 for more machines. */
+	    mpn_divexact_1 (t, t, n, big_base);
+
+	  n -= t[n - 1] == 0;
+	  digits_in_base -= chars_per_limb;
+	}
+
+      shift *= 2;
+      /* Strip low zero limbs, but be careful to keep the result divisible by
+	 big_base.  */
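+      /* (big_base & -big_base) - 1 masks exactly the bits below big_base's
+	 lowest set bit, so a limb is stripped only while the 2-adic
+	 valuation of what remains still covers that of big_base.  */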
+      while (t[0] == 0 && (t[1] & ((big_base & -big_base) - 1)) == 0)
+	{
+	  t++;
+	  n--;
+	  shift++;
+	}
+      p = t;
+
+      SET_powers_t (pt[0], p, n, digits_in_base, base, shift);
+      pt++;
+    }
+
+  /* Strip any remaining low zero limbs.  */
+  pt -= n_pows + 1;
+  for (long pi = n_pows; pi >= 0; pi--)
+    {
+      mp_ptr t = pt[pi].p;
+      mp_size_t shift = pt[pi].shift;
+      mp_size_t n = pt[pi].n;
+      int c;
+      c = t[0] == 0;
+      t += c;
+      n -= c;
+      shift += c;
+      pt[pi].p = t;
+      pt[pi].shift = shift;
+      pt[pi].n = n;
+    }
+}
+#endif
+
+static long
+powtab_decide (size_t *exptab, size_t un, int base)
+{
+  int chars_per_limb = mp_bases[base].chars_per_limb;
+  long n_pows = 0;
+  for (size_t pn = (un + 1) >> 1; pn != 1; pn = (pn + 1) >> 1)
+    {
+      exptab[n_pows] = pn * chars_per_limb;
+      n_pows++;
+    }
+  exptab[n_pows] = chars_per_limb;
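+
+  /* Illustrative example (an editor's note, not from the GMP sources):
+     with un = 10 and chars_per_limb = 19 (base 10, 64-bit limbs), the pn
+     sequence is 5, 3, 2, giving exptab = {95, 57, 38, 19} and n_pows = 3.  */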
+
+#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div
+  size_t pn = un - 1;
+  size_t xn = (un + 1) >> 1;
+  unsigned mcost = 1;
+  unsigned dcost = 1;
+  for (long i = n_pows - 2; i >= 0; i--)
+    {
+      size_t pow = (pn >> (i + 1)) + 1;
+
+      if (pow & 1)
+	dcost += pow;
+
+      if (xn != (pow << i))
+	{
+	  if (pow > 2 && (pow & 1) == 0)
+	    mcost += 2 * pow;
+	  else
+	    mcost += pow;
+	}
+      else
+	{
+	  if (pow & 1)
+	    mcost += pow;
+	}
+    }
+
+  dcost = dcost * DIV_1_VS_MUL_1_PERCENT / 100;
+
+  if (mcost <= dcost)
+    return n_pows;
+  else
+    return -n_pows;
+#elif HAVE_mpn_compute_powtab_mul
+  return n_pows;
+#elif HAVE_mpn_compute_powtab_div
+  return -n_pows;
+#else
+#error "no powtab function available"
+#endif
+}
+
+size_t
+mpn_compute_powtab (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, int base)
+{
+  size_t exptab[GMP_LIMB_BITS];
+
+  long n_pows = powtab_decide (exptab, un, base);
+
+#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div
+  if (n_pows >= 0)
+    {
+      mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows);
+      return n_pows;
+    }
+  else
+    {
+      mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows);
+      return -n_pows;
+    }
+#elif HAVE_mpn_compute_powtab_mul
+  ASSERT (n_pows > 0);
+  mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows);
+  return n_pows;
+#elif HAVE_mpn_compute_powtab_div
+  ASSERT (n_pows < 0);
+  mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows);
+  return -n_pows;
+#else
+#error "no powtab function available"
+#endif
+}
diff --git a/third_party/gmp/mpn/generic/copyd.c b/third_party/gmp/mpn/generic/copyd.c
new file mode 100644
index 0000000..7def007
--- /dev/null
+++ b/third_party/gmp/mpn/generic/copyd.c
@@ -0,0 +1,40 @@
+/* mpn_copyd
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_size_t i;
+
+  for (i = n - 1; i >= 0; i--)
+    rp[i] = up[i];
+}
diff --git a/third_party/gmp/mpn/generic/copyi.c b/third_party/gmp/mpn/generic/copyi.c
new file mode 100644
index 0000000..736e0b5
--- /dev/null
+++ b/third_party/gmp/mpn/generic/copyi.c
@@ -0,0 +1,42 @@
+/* mpn_copyi
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_size_t i;
+
+  up += n;
+  rp += n;
+  for (i = -n; i != 0; i++)
+    rp[i] = up[i];
+}
diff --git a/third_party/gmp/mpn/generic/dcpi1_bdiv_q.c b/third_party/gmp/mpn/generic/dcpi1_bdiv_q.c
new file mode 100644
index 0000000..1a4bd2a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dcpi1_bdiv_q.c
@@ -0,0 +1,159 @@
+/* mpn_dcpi1_bdiv_q -- divide-and-conquer Hensel division with precomputed
+   inverse, returning quotient.
+
+   Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009-2011, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+static mp_size_t
+mpn_dcpi1_bdiv_q_n_itch (mp_size_t n)
+{
+  /* NOTE: Depends on mullo_n and mpn_dcpi1_bdiv_qr_n interface */
+  return n;
+}
+
+/* Computes Q = - N / D mod B^n, destroys N.
+
+   N = {np,n}
+   D = {dp,n}
+*/
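+
+/* Illustrative example (an editor's sketch, not from the GMP sources):
+   with a toy B = 16, N = 3 and D = 5 we have (-5)^{-1} = 3 (mod 16)
+   since 5*3 = 15 = -1, so Q = -N/D = 3*3 = 9 (mod 16); check:
+   9*5 = 45 = -3 (mod 16).  */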
+
+static void
+mpn_dcpi1_bdiv_q_n (mp_ptr qp,
+		    mp_ptr np, mp_srcptr dp, mp_size_t n,
+		    mp_limb_t dinv, mp_ptr tp)
+{
+  while (ABOVE_THRESHOLD (n, DC_BDIV_Q_THRESHOLD))
+    {
+      mp_size_t lo, hi;
+      mp_limb_t cy;
+
+      lo = n >> 1;			/* floor(n/2) */
+      hi = n - lo;			/* ceil(n/2) */
+
+      cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp);
+
+      mpn_mullo_n (tp, qp, dp + hi, lo);
+      mpn_add_n (np + hi, np + hi, tp, lo);
+
+      if (lo < hi)
+	{
+	  cy += mpn_addmul_1 (np + lo, qp, lo, dp[lo]);
+	  np[n - 1] += cy;
+	}
+      qp += lo;
+      np += lo;
+      n -= lo;
+    }
+  mpn_sbpi1_bdiv_q (qp, np, n, dp, n, dinv);
+}
+
+/* Computes Q = - N / D mod B^nn, destroys N.
+
+   N = {np,nn}
+   D = {dp,dn}
+*/
+
+void
+mpn_dcpi1_bdiv_q (mp_ptr qp,
+		  mp_ptr np, mp_size_t nn,
+		  mp_srcptr dp, mp_size_t dn,
+		  mp_limb_t dinv)
+{
+  mp_size_t qn;
+  mp_limb_t cy;
+  mp_ptr tp;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  ASSERT (dn >= 2);
+  ASSERT (nn - dn >= 0);
+  ASSERT (dp[0] & 1);
+
+  tp = TMP_SALLOC_LIMBS (dn);
+
+  qn = nn;
+
+  if (qn > dn)
+    {
+      /* Reduce qn mod dn without division, optimizing small operations.  */
+      do
+	qn -= dn;
+      while (qn > dn);
+
+      /* Perform the typically smaller block first.  */
+      if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
+	cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
+      else
+	cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
+
+      if (qn != dn)
+	{
+	  if (qn > dn - qn)
+	    mpn_mul (tp, qp, qn, dp + qn, dn - qn);
+	  else
+	    mpn_mul (tp, dp + qn, dn - qn, qp, qn);
+	  mpn_incr_u (tp + qn, cy);
+
+	  mpn_add (np + qn, np + qn, nn - qn, tp, dn);
+	  cy = 0;
+	}
+
+      np += qn;
+      qp += qn;
+
+      qn = nn - qn;
+      while (qn > dn)
+	{
+	  mpn_add_1 (np + dn, np + dn, qn - dn, cy);
+	  cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
+	  qp += dn;
+	  np += dn;
+	  qn -= dn;
+	}
+      mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp);
+    }
+  else
+    {
+      if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD))
+	mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv);
+      else
+	mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp);
+    }
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/dcpi1_bdiv_qr.c b/third_party/gmp/mpn/generic/dcpi1_bdiv_qr.c
new file mode 100644
index 0000000..11da44f
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dcpi1_bdiv_qr.c
@@ -0,0 +1,176 @@
+/* mpn_dcpi1_bdiv_qr -- divide-and-conquer Hensel division with precomputed
+   inverse, returning quotient and remainder.
+
+   Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009, 2010, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Computes Hensel binary division of {np, 2*n} by {dp, n}.
+
+   Output:
+
+      q = -n * d^{-1} mod 2^{qn * GMP_NUMB_BITS},
+
+      r = (n + q * d) * 2^{-qn * GMP_NUMB_BITS}
+
+   Stores q at qp. Stores the n least significant limbs of r at the high half
+   of np, and returns the carry from the addition n + q*d.
+
+   d must be odd. dinv is (-d)^-1 mod 2^GMP_NUMB_BITS. */
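+/* Illustrative example (an editor's sketch, not from the GMP sources):
+   with a toy B = 16, n = 1, {np,2} = {7, 0} and d = 3, dinv = (-3)^{-1}
+   = 5 (mod 16) since 3*5 = 15 = -1.  Then q = 7*5 mod 16 = 3 and
+   n + q*d = 7 + 9 = 16, so the remainder limb is 16/16 = 1.  */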
+
+mp_size_t
+mpn_dcpi1_bdiv_qr_n_itch (mp_size_t n)
+{
+  return n;
+}
+
+mp_limb_t
+mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+		     mp_limb_t dinv, mp_ptr tp)
+{
+  mp_size_t lo, hi;
+  mp_limb_t cy;
+  mp_limb_t rh;
+
+  lo = n >> 1;			/* floor(n/2) */
+  hi = n - lo;			/* ceil(n/2) */
+
+  if (BELOW_THRESHOLD (lo, DC_BDIV_QR_THRESHOLD))
+    cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * lo, dp, lo, dinv);
+  else
+    cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp);
+
+  mpn_mul (tp, dp + lo, hi, qp, lo);
+
+  mpn_incr_u (tp + lo, cy);
+  rh = mpn_add (np + lo, np + lo, n + hi, tp, n);
+
+  if (BELOW_THRESHOLD (hi, DC_BDIV_QR_THRESHOLD))
+    cy = mpn_sbpi1_bdiv_qr (qp + lo, np + lo, 2 * hi, dp, hi, dinv);
+  else
+    cy = mpn_dcpi1_bdiv_qr_n (qp + lo, np + lo, dp, hi, dinv, tp);
+
+  mpn_mul (tp, qp + lo, hi, dp + hi, lo);
+
+  mpn_incr_u (tp + hi, cy);
+  rh += mpn_add_n (np + n, np + n, tp, n);
+
+  return rh;
+}
+
+mp_limb_t
+mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn,
+		   mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+{
+  mp_size_t qn;
+  mp_limb_t rr, cy;
+  mp_ptr tp;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  ASSERT (dn >= 2);		/* to adhere to mpn_sbpi1_div_qr's limits */
+  ASSERT (nn - dn >= 1);	/* to adhere to mpn_sbpi1_div_qr's limits */
+  ASSERT (dp[0] & 1);
+
+  tp = TMP_SALLOC_LIMBS (dn);
+
+  qn = nn - dn;
+
+  if (qn > dn)
+    {
+      /* Reduce qn mod dn without division, optimizing small operations.  */
+      do
+	qn -= dn;
+      while (qn > dn);
+
+      /* Perform the typically smaller block first.  */
+      if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
+	cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
+      else
+	cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
+
+      rr = 0;
+      if (qn != dn)
+	{
+	  if (qn > dn - qn)
+	    mpn_mul (tp, qp, qn, dp + qn, dn - qn);
+	  else
+	    mpn_mul (tp, dp + qn, dn - qn, qp, qn);
+	  mpn_incr_u (tp + qn, cy);
+
+	  rr = mpn_add (np + qn, np + qn, nn - qn, tp, dn);
+	  cy = 0;
+	}
+
+      np += qn;
+      qp += qn;
+
+      qn = nn - dn - qn;
+      do
+	{
+	  rr += mpn_add_1 (np + dn, np + dn, qn, cy);
+	  cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp);
+	  qp += dn;
+	  np += dn;
+	  qn -= dn;
+	}
+      while (qn > 0);
+      TMP_FREE;
+      return rr + cy;
+    }
+
+  if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD))
+    cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv);
+  else
+    cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp);
+
+  rr = 0;
+  if (qn != dn)
+    {
+      if (qn > dn - qn)
+	mpn_mul (tp, qp, qn, dp + qn, dn - qn);
+      else
+	mpn_mul (tp, dp + qn, dn - qn, qp, qn);
+      mpn_incr_u (tp + qn, cy);
+
+      rr = mpn_add (np + qn, np + qn, nn - qn, tp, dn);
+      cy = 0;
+    }
+
+  TMP_FREE;
+  return rr + cy;
+}
diff --git a/third_party/gmp/mpn/generic/dcpi1_div_q.c b/third_party/gmp/mpn/generic/dcpi1_div_q.c
new file mode 100644
index 0000000..1905c98
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dcpi1_div_q.c
@@ -0,0 +1,86 @@
+/* mpn_dcpi1_div_q -- divide-and-conquer division, returning exact quotient
+   only.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+mp_limb_t
+mpn_dcpi1_div_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
+		 mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
+{
+  mp_ptr tp, wp;
+  mp_limb_t qh;
+  mp_size_t qn;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  ASSERT (dn >= 6);
+  ASSERT (nn - dn >= 3);
+  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
+
+  tp = TMP_ALLOC_LIMBS (nn + 1);
+  MPN_COPY (tp + 1, np, nn);
+  tp[0] = 0;
+
+  qn = nn - dn;
+  wp = TMP_ALLOC_LIMBS (qn + 1);
+
+  qh = mpn_dcpi1_divappr_q (wp, tp, nn + 1, dp, dn, dinv);
+
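+  /* wp[0] is a guard limb below the true quotient.  divappr may return a
+     result at most one too large; if the guard limb is nonzero, removing
+     that unit would not borrow into wp[1..qn], so those limbs are already
+     exact and the multiply-back check below can be skipped.  */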
+  if (wp[0] == 0)
+    {
+      mp_limb_t cy;
+
+      if (qn > dn)
+	mpn_mul (tp, wp + 1, qn, dp, dn);
+      else
+	mpn_mul (tp, dp, dn, wp + 1, qn);
+
+      cy = (qh != 0) ? mpn_add_n (tp + qn, tp + qn, dp, dn) : 0;
+
+      /* The approximate quotient is at most one too large, so a single
+	 decrement suffices; no adjustment loop is needed.  */
+      if (cy || mpn_cmp (tp, np, nn) > 0)
+	qh -= mpn_sub_1 (qp, wp + 1, qn, 1);
+      else /* Approximation was exact; plain copy, as in the branch below. */
+	MPN_COPY (qp, wp + 1, qn);
+    }
+  else
+    MPN_COPY (qp, wp + 1, qn);
+
+  TMP_FREE;
+  return qh;
+}
diff --git a/third_party/gmp/mpn/generic/dcpi1_div_qr.c b/third_party/gmp/mpn/generic/dcpi1_div_qr.c
new file mode 100644
index 0000000..d7a65f8
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dcpi1_div_qr.c
@@ -0,0 +1,248 @@
+/* mpn_dcpi1_div_qr_n -- recursive divide-and-conquer division for arbitrary
+   size operands.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+mp_limb_t
+mpn_dcpi1_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+		    gmp_pi1_t *dinv, mp_ptr tp)
+{
+  mp_size_t lo, hi;
+  mp_limb_t cy, qh, ql;
+
+  lo = n >> 1;			/* floor(n/2) */
+  hi = n - lo;			/* ceil(n/2) */
+
+  if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
+    qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32);
+  else
+    qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp);
+
+  mpn_mul (tp, qp + lo, hi, dp, lo);
+
+  cy = mpn_sub_n (np + lo, np + lo, tp, n);
+  if (qh != 0)
+    cy += mpn_sub_n (np + n, np + n, dp, lo);
+
+  while (cy != 0)
+    {
+      qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
+      cy -= mpn_add_n (np + lo, np + lo, dp, n);
+    }
+
+  if (BELOW_THRESHOLD (lo, DC_DIV_QR_THRESHOLD))
+    ql = mpn_sbpi1_div_qr (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32);
+  else
+    ql = mpn_dcpi1_div_qr_n (qp, np + hi, dp + hi, lo, dinv, tp);
+
+  mpn_mul (tp, dp, hi, qp, lo);
+
+  cy = mpn_sub_n (np, np, tp, n);
+  if (ql != 0)
+    cy += mpn_sub_n (np + lo, np + lo, dp, hi);
+
+  while (cy != 0)
+    {
+      mpn_sub_1 (qp, qp, lo, 1);
+      cy -= mpn_add_n (np, np, dp, n);
+    }
+
+  return qh;
+}
+
+mp_limb_t
+mpn_dcpi1_div_qr (mp_ptr qp,
+		  mp_ptr np, mp_size_t nn,
+		  mp_srcptr dp, mp_size_t dn,
+		  gmp_pi1_t *dinv)
+{
+  mp_size_t qn;
+  mp_limb_t qh, cy;
+  mp_ptr tp;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  ASSERT (dn >= 6);		/* to adhere to mpn_sbpi1_div_qr's limits */
+  ASSERT (nn - dn >= 3);	/* to adhere to mpn_sbpi1_div_qr's limits */
+  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
+
+  tp = TMP_ALLOC_LIMBS (dn);
+
+  qn = nn - dn;
+  qp += qn;
+  np += nn;
+  dp += dn;
+
+  if (qn > dn)
+    {
+      /* Reduce qn mod dn without division, optimizing small operations.  */
+      do
+	qn -= dn;
+      while (qn > dn);
+
+      qp -= qn;			/* point at low limb of next quotient block */
+      np -= qn;			/* point in the middle of partial remainder */
+
+      /* Perform the typically smaller block first.  */
+      if (qn == 1)
+	{
+	  mp_limb_t q, n2, n1, n0, d1, d0;
+
+	  /* Handle qh up front, for simplicity. */
+	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
+	  if (qh)
+	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));
+
+	  /* A single iteration of schoolbook: One 3/2 division,
+	     followed by the bignum update and adjustment. */
+	  n2 = np[0];
+	  n1 = np[-1];
+	  n0 = np[-2];
+	  d1 = dp[-1];
+	  d0 = dp[-2];
+
+	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));
+
+	  if (UNLIKELY (n2 == d1) && n1 == d0)
+	    {
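+	      /* <n2,n1> equals <d1,d0>, so the 3/2 division's strict
+		 precondition fails; the quotient digit here is
+		 B-1 = GMP_NUMB_MASK.  */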
+	      q = GMP_NUMB_MASK;
+	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
+	      ASSERT (cy == n2);
+	    }
+	  else
+	    {
+	      udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);
+
+	      if (dn > 2)
+		{
+		  mp_limb_t cy, cy1;
+		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);
+
+		  cy1 = n0 < cy;
+		  n0 = (n0 - cy) & GMP_NUMB_MASK;
+		  cy = n1 < cy1;
+		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
+		  np[-2] = n0;
+
+		  if (UNLIKELY (cy != 0))
+		    {
+		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
+		      qh -= (q == 0);
+		      q = (q - 1) & GMP_NUMB_MASK;
+		    }
+		}
+	      else
+		np[-2] = n0;
+
+	      np[-1] = n1;
+	    }
+	  qp[0] = q;
+	}
+      else
+	{
+	  /* Do a 2qn / qn division */
+	  if (qn == 2)
+	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? */
+	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+	    qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
+	  else
+	    qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);
+
+	  if (qn != dn)
+	    {
+	      if (qn > dn - qn)
+		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
+	      else
+		mpn_mul (tp, dp - dn, dn - qn, qp, qn);
+
+	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
+	      if (qh != 0)
+		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
+
+	      while (cy != 0)
+		{
+		  qh -= mpn_sub_1 (qp, qp, qn, 1);
+		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
+		}
+	    }
+	}
+
+      qn = nn - dn - qn;
+      do
+	{
+	  qp -= dn;
+	  np -= dn;
+	  mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
+	  qn -= dn;
+	}
+      while (qn > 0);
+    }
+  else
+    {
+      qp -= qn;			/* point at low limb of next quotient block */
+      np -= qn;			/* point in the middle of partial remainder */
+
+      if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+	qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
+      else
+	qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);
+
+      if (qn != dn)
+	{
+	  if (qn > dn - qn)
+	    mpn_mul (tp, qp, qn, dp - dn, dn - qn);
+	  else
+	    mpn_mul (tp, dp - dn, dn - qn, qp, qn);
+
+	  cy = mpn_sub_n (np - dn, np - dn, tp, dn);
+	  if (qh != 0)
+	    cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
+
+	  while (cy != 0)
+	    {
+	      qh -= mpn_sub_1 (qp, qp, qn, 1);
+	      cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
+	    }
+	}
+    }
+
+  TMP_FREE;
+  return qh;
+}
diff --git a/third_party/gmp/mpn/generic/dcpi1_divappr_q.c b/third_party/gmp/mpn/generic/dcpi1_divappr_q.c
new file mode 100644
index 0000000..0abe04e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dcpi1_divappr_q.c
@@ -0,0 +1,256 @@
+/* mpn_dcpi1_divappr_q -- divide-and-conquer division, returning approximate
+   quotient.  The quotient returned is either correct, or one too large.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+static mp_limb_t
+mpn_dcpi1_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n,
+		       gmp_pi1_t *dinv, mp_ptr tp)
+{
+  mp_size_t lo, hi;
+  mp_limb_t cy, qh, ql;
+
+  lo = n >> 1;			/* floor(n/2) */
+  hi = n - lo;			/* ceil(n/2) */
+
+  if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD))
+    qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32);
+  else
+    qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp);
+
+  mpn_mul (tp, qp + lo, hi, dp, lo);
+
+  cy = mpn_sub_n (np + lo, np + lo, tp, n);
+  if (qh != 0)
+    cy += mpn_sub_n (np + n, np + n, dp, lo);
+
+  while (cy != 0)
+    {
+      qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1);
+      cy -= mpn_add_n (np + lo, np + lo, dp, n);
+    }
+
+  if (BELOW_THRESHOLD (lo, DC_DIVAPPR_Q_THRESHOLD))
+    ql = mpn_sbpi1_divappr_q (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32);
+  else
+    ql = mpn_dcpi1_divappr_q_n (qp, np + hi, dp + hi, lo, dinv, tp);
+
+  if (UNLIKELY (ql != 0))
+    {
+      mp_size_t i;
+      for (i = 0; i < lo; i++)
+	qp[i] = GMP_NUMB_MASK;
+    }
+
+  return qh;
+}
+
+mp_limb_t
+mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn,
+		     mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv)
+{
+  mp_size_t qn;
+  mp_limb_t qh, cy, qsave;
+  mp_ptr tp;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  ASSERT (dn >= 6);
+  ASSERT (nn > dn);
+  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
+
+  qn = nn - dn;
+  qp += qn;
+  np += nn;
+  dp += dn;
+
+  if (qn >= dn)
+    {
+      qn++;			/* pretend we'll need an extra limb */
+      /* Reduce qn mod dn without division, optimizing small operations.  */
+      do
+	qn -= dn;
+      while (qn > dn);
+
+      qp -= qn;			/* point at low limb of next quotient block */
+      np -= qn;			/* point in the middle of partial remainder */
+
+      tp = TMP_SALLOC_LIMBS (dn);
+
+      /* Perform the typically smaller block first.  */
+      if (qn == 1)
+	{
+	  mp_limb_t q, n2, n1, n0, d1, d0;
+
+	  /* Handle qh up front, for simplicity. */
+	  qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0;
+	  if (qh)
+	    ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn));
+
+	  /* A single iteration of schoolbook: One 3/2 division,
+	     followed by the bignum update and adjustment. */
+	  n2 = np[0];
+	  n1 = np[-1];
+	  n0 = np[-2];
+	  d1 = dp[-1];
+	  d0 = dp[-2];
+
+	  ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0));
+
+	  if (UNLIKELY (n2 == d1) && n1 == d0)
+	    {
+	      q = GMP_NUMB_MASK;
+	      cy = mpn_submul_1 (np - dn, dp - dn, dn, q);
+	      ASSERT (cy == n2);
+	    }
+	  else
+	    {
+	      udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32);
+
+	      if (dn > 2)
+		{
+		  mp_limb_t cy, cy1;
+		  cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q);
+
+		  cy1 = n0 < cy;
+		  n0 = (n0 - cy) & GMP_NUMB_MASK;
+		  cy = n1 < cy1;
+		  n1 = (n1 - cy1) & GMP_NUMB_MASK;
+		  np[-2] = n0;
+
+		  if (UNLIKELY (cy != 0))
+		    {
+		      n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1);
+		      qh -= (q == 0);
+		      q = (q - 1) & GMP_NUMB_MASK;
+		    }
+		}
+	      else
+		np[-2] = n0;
+
+	      np[-1] = n1;
+	    }
+	  qp[0] = q;
+	}
+      else
+	{
+	  if (qn == 2)
+	    qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2);
+	  else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+	    qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32);
+	  else
+	    qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp);
+
+	  if (qn != dn)
+	    {
+	      if (qn > dn - qn)
+		mpn_mul (tp, qp, qn, dp - dn, dn - qn);
+	      else
+		mpn_mul (tp, dp - dn, dn - qn, qp, qn);
+
+	      cy = mpn_sub_n (np - dn, np - dn, tp, dn);
+	      if (qh != 0)
+		cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn);
+
+	      while (cy != 0)
+		{
+		  qh -= mpn_sub_1 (qp, qp, qn, 1);
+		  cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn);
+		}
+	    }
+	}
+      qn = nn - dn - qn + 1;
+      while (qn > dn)
+	{
+	  qp -= dn;
+	  np -= dn;
+	  mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp);
+	  qn -= dn;
+	}
+
+      /* Since we pretended we'd need an extra quotient limb before, we have
+	 now made sure the code above left just dn-1=qn quotient limbs to
+	 develop.  Develop that plus a guard limb. */
+      qn--;
+      qp -= qn;
+      np -= dn;
+      qsave = qp[qn];
+      mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp);
+      MPN_COPY_INCR (qp, qp + 1, qn);
+      qp[qn] = qsave;
+    }
+  else    /* (qn < dn) */
+    {
+      mp_ptr q2p;
+#if 0				/* not possible since we demand nn > dn */
+      if (qn == 0)
+	{
+	  qh = mpn_cmp (np - dn, dp - dn, dn) >= 0;
+	  if (qh)
+	    mpn_sub_n (np - dn, np - dn, dp - dn, dn);
+	  TMP_FREE;
+	  return qh;
+	}
+#endif
+
+      qp -= qn;			/* point at low limb of next quotient block */
+      np -= qn;			/* point in the middle of partial remainder */
+
+      q2p = TMP_SALLOC_LIMBS (qn + 1);
+      /* Should we check DC_DIVAPPR_Q_THRESHOLD here at all, or rely on
+	 callers not to be silly?  */
+      if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD))
+	{
+	  qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1),
+				    dp - (qn + 1), qn + 1, dinv->inv32);
+	}
+      else
+	{
+	  /* It is tempting to use qp for recursive scratch and put quotient in
+	     tp, but the recursive scratch needs one limb too many.  */
+	  tp = TMP_SALLOC_LIMBS (qn + 1);
+	  qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1), qn + 1, dinv, tp);
+	}
+      MPN_COPY (qp, q2p + 1, qn);
+    }
+
+  TMP_FREE;
+  return qh;
+}
diff --git a/third_party/gmp/mpn/generic/div_q.c b/third_party/gmp/mpn/generic/div_q.c
new file mode 100644
index 0000000..18c4ecf
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_q.c
@@ -0,0 +1,313 @@
+/* mpn_div_q -- division for arbitrary size operands.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2009, 2010, 2015, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Compute Q = N/D with truncation.
+     N = {np,nn}
+     D = {dp,dn}
+     Q = {qp,nn-dn+1}
+     T = {scratch,nn+1} is scratch space
+   N and D are both untouched by the computation.
+   N and T may overlap; pass the same space if N is irrelevant after the call,
+   but note that tp needs an extra limb.
+
+   Operand requirements:
+     N >= D > 0
+     dp[dn-1] != 0
+     No overlap between the N, D, and Q areas.
+
+   This division function does not clobber its input operands, since it is
+   intended to support average-O(qn) division, and for that to be effective, it
+   cannot put requirements on callers to copy an O(nn) operand.
+
+   If a caller does not care about the value of {np,nn+1} after calling this
+   function, it should pass np also for the scratch argument.  This function
+   will then save some time and space by avoiding allocation and copying.
+   (FIXME: Is this a good design?  We only really save any copying for
+   already-normalised divisors, which should be rare.  It also prevents us from
+   reasonably asking for all scratch space we need.)
+
+   We write nn-dn+1 limbs for the quotient, but return void.  Why not return
+   the most significant quotient limb?  Look at the 4 main code blocks below
+   (consisting of an outer if-else where each arm contains an if-else). It is
+   tricky for the first code block, since the mpn_*_div_q calls will typically
+   generate all nn-dn+1 quotient limbs and return 0 or 1.  I don't see how to fix that unless
+   we generate the most significant quotient limb here, before calling
+   mpn_*_div_q, or put the quotient in a temporary area.  Since this is a
+   critical division case (the SB sub-case in particular) copying is not a good
+   idea.
+
+   It might make sense to split the if-else parts of the (qn + FUDGE
+   >= dn) blocks into separate functions, since we could promise quite
+   different things to callers in these two cases.  The 'then' case
+   benefits from np=scratch, and it could perhaps even tolerate qp=np,
+   saving some headache for many callers.
+
+   FIXME: Scratch allocation leaves a lot to be desired.  E.g., for the MU size
+   operands, we do not reuse the huge scratch for adjustments.  This can be a
+   serious waste of memory for the largest operands.
+*/
+
+/* FUDGE determines when to try getting an approximate quotient from the upper
+   parts of the dividend and divisor, then adjust.  N.B. FUDGE must be >= 2
+   for the code to be correct.  */
+#define FUDGE 5			/* FIXME: tune this */
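+
+/* Illustrative example (an editor's note, not from the GMP sources): with
+   nn = 100 and dn = 60, qn = 41 and qn + FUDGE < dn, so the second main
+   branch below computes an approximate quotient from the top 2*qn+1
+   dividend limbs and the top qn+1 divisor limbs, then adjusts.  */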
+
+#define DC_DIV_Q_THRESHOLD      DC_DIVAPPR_Q_THRESHOLD
+#define MU_DIV_Q_THRESHOLD      MU_DIVAPPR_Q_THRESHOLD
+#define MUPI_DIV_Q_THRESHOLD  MUPI_DIVAPPR_Q_THRESHOLD
+#ifndef MUPI_DIVAPPR_Q_THRESHOLD
+#define MUPI_DIVAPPR_Q_THRESHOLD  MUPI_DIV_QR_THRESHOLD
+#endif
+
+void
+mpn_div_q (mp_ptr qp,
+	   mp_srcptr np, mp_size_t nn,
+	   mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
+{
+  mp_ptr new_dp, new_np, tp, rp;
+  mp_limb_t cy, dh, qh;
+  mp_size_t new_nn, qn;
+  gmp_pi1_t dinv;
+  int cnt;
+  TMP_DECL;
+  TMP_MARK;
+
+  ASSERT (nn >= dn);
+  ASSERT (dn > 0);
+  ASSERT (dp[dn - 1] != 0);
+  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn));
+  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn));
+
+  ASSERT_ALWAYS (FUDGE >= 2);
+
+  dh = dp[dn - 1];
+  if (dn == 1)
+    {
+      mpn_divrem_1 (qp, 0L, np, nn, dh);
+      return;
+    }
+
+  qn = nn - dn + 1;		/* Quotient size, high limb might be zero */
+
+  if (qn + FUDGE >= dn)
+    {
+      /* |________________________|
+                          |_______|  */
+      new_np = scratch;
+
+      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
+	{
+	  count_leading_zeros (cnt, dh);
+
+	  cy = mpn_lshift (new_np, np, nn, cnt);
+	  new_np[nn] = cy;
+	  new_nn = nn + (cy != 0);
+
+	  new_dp = TMP_ALLOC_LIMBS (dn);
+	  mpn_lshift (new_dp, dp, dn, cnt);
+
+	  if (dn == 2)
+	    {
+	      qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp);
+	    }
+	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
+		   BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD))
+	    {
+	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
+	      qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32);
+	    }
+	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
+		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
+		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
+		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
+	    {
+	      invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]);
+	      qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv);
+	    }
+	  else
+	    {
+	      mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0);
+	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
+	      qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch);
+	    }
+	  if (cy == 0)
+	    qp[qn - 1] = qh;
+	  else
+	    ASSERT (qh == 0);
+	}
+      else  /* divisor is already normalised */
+	{
+	  if (new_np != np)
+	    MPN_COPY (new_np, np, nn);
+
+	  if (dn == 2)
+	    {
+	      qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp);
+	    }
+	  else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) ||
+		   BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD))
+	    {
+	      invert_pi1 (dinv, dh, dp[dn - 2]);
+	      qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32);
+	    }
+	  else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) ||   /* fast condition */
+		   BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */
+		   (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */
+		   + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn)   /* ...condition */
+	    {
+	      invert_pi1 (dinv, dh, dp[dn - 2]);
+	      qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv);
+	    }
+	  else
+	    {
+	      mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0);
+	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
+	      qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch);
+	    }
+	  qp[nn - dn] = qh;
+	}
+    }
+  else
+    {
+      /* |________________________|
+                |_________________|  */
+      tp = TMP_ALLOC_LIMBS (qn + 1);
+
+      new_np = scratch;
+      new_nn = 2 * qn + 1;
+      if (new_np == np)
+	/* We need {np,nn} to remain untouched until the final adjustment, so
+	   we need to allocate separate space for new_np.  */
+	new_np = TMP_ALLOC_LIMBS (new_nn + 1);
+
+
+      if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0))
+	{
+	  count_leading_zeros (cnt, dh);
+
+	  cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt);
+	  new_np[new_nn] = cy;
+
+	  new_nn += (cy != 0);
+
+	  new_dp = TMP_ALLOC_LIMBS (qn + 1);
+	  mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt);
+	  new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt);
+
+	  if (qn + 1 == 2)
+	    {
+	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
+	    }
+	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
+	    {
+	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
+	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
+	    }
+	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
+	    {
+	      invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]);
+	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
+	    }
+	  else
+	    {
+	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
+	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
+	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
+	    }
+	  if (cy == 0)
+	    tp[qn] = qh;
+	  else if (UNLIKELY (qh != 0))
+	    {
+	      /* This happens only when the quotient is close to B^n and
+		 mpn_*_divappr_q returned B^n.  */
+	      mp_size_t i, n;
+	      n = new_nn - (qn + 1);
+	      for (i = 0; i < n; i++)
+		tp[i] = GMP_NUMB_MAX;
+	      qh = 0;		/* currently ignored */
+	    }
+	}
+      else  /* divisor is already normalised */
+	{
+	  MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless if MU will be used */
+
+	  new_dp = (mp_ptr) dp + dn - (qn + 1);
+
+	  if (qn == 2 - 1)
+	    {
+	      qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp);
+	    }
+	  else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1))
+	    {
+	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
+	      qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32);
+	    }
+	  else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1))
+	    {
+	      invert_pi1 (dinv, dh, new_dp[qn - 1]);
+	      qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv);
+	    }
+	  else
+	    {
+	      mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0);
+	      mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
+	      qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch);
+	    }
+	  tp[qn] = qh;
+	}
+
+      MPN_COPY (qp, tp + 1, qn);
+      if (tp[0] <= 4)
+        {
+	  mp_size_t rn;
+
+          rp = TMP_ALLOC_LIMBS (dn + qn);
+          mpn_mul (rp, dp, dn, tp + 1, qn);
+	  rn = dn + qn;
+	  rn -= rp[rn - 1] == 0;
+
+          if (rn > nn || mpn_cmp (np, rp, nn) < 0)
+            MPN_DECR_U (qp, qn, 1);
+        }
+    }
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/div_qr_1.c b/third_party/gmp/mpn/generic/div_qr_1.c
new file mode 100644
index 0000000..8f80d37
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_1.c
@@ -0,0 +1,125 @@
+/* mpn_div_qr_1 -- mpn by limb division.
+
+   Contributed to the GNU project by Niels Möller and Torbjörn Granlund
+
+Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003, 2013 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef DIV_QR_1_NORM_THRESHOLD
+#define DIV_QR_1_NORM_THRESHOLD 3
+#endif
+#ifndef DIV_QR_1_UNNORM_THRESHOLD
+#define DIV_QR_1_UNNORM_THRESHOLD 3
+#endif
+
+#if GMP_NAIL_BITS > 0
+#error Nail bits not supported
+#endif
+
+/* Divides {up, n} by d. Writes the n-1 low quotient limbs at {qp,
+ * n-1}, and the high quotient limb at *qh. Returns remainder. */
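+
+/* Illustrative only, not part of GMP: a caller sketch for the interface
+   described above; n == 4 and d == 7 are hypothetical values, and B is
+   the limb base.
+
+     mp_limb_t up[4], qp[3], qh, r;
+     // ... fill up ...
+     r = mpn_div_qr_1 (qp, &qh, up, 4, CNST_LIMB (7));
+     // quotient == qh * B^3 + {qp,3}, remainder == r
+*/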
+mp_limb_t
+mpn_div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t n,
+	      mp_limb_t d)
+{
+  unsigned cnt;
+  mp_limb_t uh;
+
+  ASSERT (n > 0);
+  ASSERT (d > 0);
+
+  if (d & GMP_NUMB_HIGHBIT)
+    {
+      /* Normalized case */
+      mp_limb_t dinv, q;
+
+      uh = up[--n];
+
+      q = (uh >= d);
+      *qh = q;
+      uh -= (-q) & d;
+
+      if (BELOW_THRESHOLD (n, DIV_QR_1_NORM_THRESHOLD))
+	{
+	  cnt = 0;
+	plain:
+	  while (n > 0)
+	    {
+	      mp_limb_t ul = up[--n];
+	      udiv_qrnnd (qp[n], uh, uh, ul, d);
+	    }
+	  return uh >> cnt;
+	}
+      invert_limb (dinv, d);
+      return mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv);
+    }
+  else
+    {
+      /* Unnormalized case */
+      mp_limb_t dinv, ul;
+
+      if (! UDIV_NEEDS_NORMALIZATION
+	  && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD))
+	{
+	  uh = up[--n];
+	  udiv_qrnnd (*qh, uh, CNST_LIMB(0), uh, d);
+	  cnt = 0;
+	  goto plain;
+	}
+
+      count_leading_zeros (cnt, d);
+      d <<= cnt;
+
+#if HAVE_NATIVE_mpn_div_qr_1u_pi1
+      /* FIXME: Call loop doing on-the-fly normalization */
+#endif
+
+      /* Shift up front, using the qp area for the shifted copy.  A bit
+	 messy, since only n-1 limbs are available there, so the high
+	 limb is shifted manually. */
+      uh = up[--n];
+      ul = (uh << cnt) | mpn_lshift (qp, up, n, cnt);
+      uh >>= (GMP_LIMB_BITS - cnt);
+
+      if (UDIV_NEEDS_NORMALIZATION
+	  && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD))
+	{
+	  udiv_qrnnd (*qh, uh, uh, ul, d);
+	  up = qp;
+	  goto plain;
+	}
+      invert_limb (dinv, d);
+
+      udiv_qrnnd_preinv (*qh, uh, uh, ul, d, dinv);
+      return mpn_div_qr_1n_pi1 (qp, qp, n, uh, d, dinv) >> cnt;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/div_qr_1n_pi1.c b/third_party/gmp/mpn/generic/div_qr_1n_pi1.c
new file mode 100644
index 0000000..5c32810
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_1n_pi1.c
@@ -0,0 +1,277 @@
+/* mpn_div_qr_1n_pi1
+
+   Contributed to the GNU project by Niels Möller
+
+   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#if GMP_NAIL_BITS > 0
+#error Nail bits not supported
+#endif
+
+#ifndef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 2
+#endif
+
+/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */
+
+#if defined (__GNUC__) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "add	%6, %k2\n\t"					\
+	     "adc	%4, %k1\n\t"					\
+	     "sbb	%k0, %k0"					\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
+	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "add	%6, %q2\n\t"					\
+	     "adc	%4, %q1\n\t"					\
+	     "sbb	%q0, %q0"					\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
+	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 32
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
+	     "addxcc	%r3, %4, %1\n\t"				\
+	     "subx	%%g0, %%g0, %0"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
+	 __CLOBBER_CC)
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 64
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
+	     "addccc	%r7, %8, %%g0\n\t"				\
+	     "addccc	%r3, %4, %1\n\t"				\
+	     "clr	%0\n\t"						\
+	     "movcs	%%xcc, -1, %0"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
+	     "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
+	 __CLOBBER_CC)
+#if __VIS__ >= 0x300
+#undef add_mssaaaa
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
+	     "addxccc	%r3, %4, %1\n\t"				\
+	     "clr	%0\n\t"						\
+	     "movcs	%%xcc, -1, %0"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
+	 __CLOBBER_CC)
+#endif
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "add%I6c	%2, %5, %6\n\t"					\
+	     "adde	%1, %3, %4\n\t"					\
+	     "subfe	%0, %0, %0\n\t"					\
+	     "nor	%0, %0, %0"					\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)			\
+	   __CLOBBER_CC)
+#endif
+
+#if defined (__s390x__) && W_TYPE_SIZE == 64
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "algr	%2, %6\n\t"					\
+	     "alcgr	%1, %4\n\t"					\
+	     "lghi	%0, 0\n\t"					\
+	     "alcgr	%0, %0\n\t"					\
+	     "lcgr	%0, %0"						\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),			\
+	     "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
+#endif
+
+#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "adds	%2, %5, %6\n\t"					\
+	     "adcs	%1, %3, %4\n\t"					\
+	     "movcc	%0, #0\n\t"					\
+	     "movcs	%0, #-1"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
+#endif
+#endif /* defined (__GNUC__) */
+
+#ifndef add_mssaaaa
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  do {									\
+    UWtype __s0, __s1, __c0, __c1;					\
+    __s0 = (a0) + (b0);							\
+    __s1 = (a1) + (b1);							\
+    __c0 = __s0 < (a0);							\
+    __c1 = __s1 < (a1);							\
+    (s0) = __s0;							\
+    __s1 = __s1 + __c0;							\
+    (s1) = __s1;							\
+    (m) = - (__c1 + (__s1 < __c0));					\
+  } while (0)
+#endif
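+
+/* In all variants above, add_mssaaaa computes the two-limb sum
+   (s1,s0) = (a1,a0) + (b1,b0) and sets m to the negated carry-out, i.e.
+   m is 0 when the sum does not carry and the all-ones mask when it does,
+   ready for direct use as a conditional mask.  */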
+
+#if DIV_QR_1N_METHOD == 1
+
+/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
+   Requires that uh < d. */
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  ASSERT (n > 0);
+  ASSERT (uh < d);
+  ASSERT (d & GMP_NUMB_HIGHBIT);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));
+
+  do
+    {
+      mp_limb_t q, ul;
+
+      ul = up[--n];
+      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
+      qp[n] = q;
+    }
+  while (n > 0);
+
+  return uh;
+}
+
+#elif DIV_QR_1N_METHOD == 2
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t u0, u2;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
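+  /* With dinv = floor((B^2-1)/d) - B (invert_limb), we have
+     B^2 = (B + dinv) * d + B2 for some 0 < B2 <= d, so B2 = -d * dinv
+     (mod B) is congruent to B^2 mod d; multiplying a third remainder
+     limb by B2 folds it back into two limbs.  */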
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+  u0 = up[n-1];	/* Early read, to allow qp == up. */
+  qp[n-1] = q1;
+
+  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+
+      /* Additions for the q update:
+       *	+-------+
+       *        |u1 * v |
+       *        +---+---+
+       *        | u1|
+       *    +---+---+
+       *    | 1 | v |  (conditional on u2)
+       *    +---+---+
+       *        | 1 |  (conditional on u0 + u2 B2 carry)
+       *        +---+
+       * +      | q0|
+       *   -+---+---+---+
+       *    | q2| q1| q0|
+       *    +---+---+---+
+      */
+      umul_ppmm (p1, t, u1, dinv);
+      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
+      q0 = t;
+
+      umul_ppmm (p1, p0, u1, B2);
+      ADDC_LIMB (cy, u0, u0, u2 & B2);
+      u0 -= (-cy) & d;
+
+      /* Final q update */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+
+      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
+    }
+
+  q1 = (u2 > 0);
+  u1 -= (-q1) & d;
+
+  t = (u1 >= d);
+  q1 += t;
+  u1 -= (-t) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+
+#else
+#error Unknown DIV_QR_1N_METHOD
+#endif
diff --git a/third_party/gmp/mpn/generic/div_qr_1n_pi2.c b/third_party/gmp/mpn/generic/div_qr_1n_pi2.c
new file mode 100644
index 0000000..d8834ea
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_1n_pi2.c
@@ -0,0 +1,203 @@
+/* mpn_div_qr_1n_pi2.
+
+   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
+   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2013, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* ISSUES:
+
+   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?
+
+   * Are there any problems with generating n quotient limbs in the q area?  It
+     surely simplifies things.
+
+   * Not yet adequately tested.
+*/
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Define some longlong.h-style macros, but for wider operations.
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+*/
+#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "0"  ((USItype)(s2)),					\
+	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
+	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "0"  ((UDItype)(s2)),					\
+	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
+	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
+	     __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
+	     __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  do {									\
+    UWtype __s0, __s1, __c0, __c1;					\
+    __s0 = (a0) + (b0);							\
+    __s1 = (a1) + (b1);							\
+    __c0 = __s0 < (a0);							\
+    __c1 = __s1 < (a1);							\
+    (s0) = __s0;							\
+    __s1 = __s1 + __c0;							\
+    (s1) = __s1;							\
+    (s2) += __c1 + (__s1 < __c0);					\
+  } while (0)
+#endif
+
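+/* dip[] evidently holds the precomputed two-limb ("pi2") inverse used
+   below, with the high limb in dip[1]; norm_cnt is unused in this
+   normalized variant.  The struct is duplicated in div_qr_1u_pi2.c and
+   the two definitions must stay in sync.  */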
+struct precomp_div_1_pi2
+{
+  mp_limb_t dip[2];
+  mp_limb_t d;
+  int norm_cnt;
+};
+
+mp_limb_t
+mpn_div_qr_1n_pi2 (mp_ptr qp,
+		   mp_srcptr up, mp_size_t un,
+		   struct precomp_div_1_pi2 *pd)
+{
+  mp_limb_t most_significant_q_limb;
+  mp_size_t i;
+  mp_limb_t r, u2, u1, u0;
+  mp_limb_t d0, di1, di0;
+  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
+  mp_limb_t cnd;
+
+  ASSERT (un >= 2);
+  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
+  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
+  ASSERT_MPN (up, un);
+
+#define q3 q3a
+#define q2 q2b
+#define q1 q1b
+
+  up += un - 3;
+  r = up[2];
+  d0 = pd->d;
+
+  most_significant_q_limb = (r >= d0);
+  r -= d0 & -most_significant_q_limb;
+
+  qp += un - 3;
+  qp[2] = most_significant_q_limb;
+
+  di1 = pd->dip[1];
+  di0 = pd->dip[0];
+
+  for (i = un - 3; i >= 0; i -= 2)
+    {
+      u2 = r;
+      u1 = up[1];
+      u0 = up[0];
+
+      /* Dividend in {r,u1,u0} */
+
+      umul_ppmm (q1d,q0d, u1, di0);
+      umul_ppmm (q2b,q1b, u1, di1);
+      q2b++;				/* cannot spill */
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
+
+      umul_ppmm (q2c,q1c, u2,  di0);
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
+      umul_ppmm (q3a,q2a, u2, di1);
+
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
+
+      q3 += r;
+
+      r = u0 - q2 * d0;
+
+      cnd = (r >= q1);
+      r += d0 & -cnd;
+      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);
+
+      if (UNLIKELY (r >= d0))
+	{
+	  r -= d0;
+	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
+	}
+
+      qp[0] = q2;
+      qp[1] = q3;
+
+      up -= 2;
+      qp -= 2;
+    }
+
+  if ((un & 1) == 0)
+    {
+      u2 = r;
+      u1 = up[1];
+
+      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
+      qp[1] = q3;
+    }
+
+  return r;
+
+#undef q3
+#undef q2
+#undef q1
+}
diff --git a/third_party/gmp/mpn/generic/div_qr_1u_pi2.c b/third_party/gmp/mpn/generic/div_qr_1u_pi2.c
new file mode 100644
index 0000000..047662f
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_1u_pi2.c
@@ -0,0 +1,235 @@
+/* mpn_div_qr_1u_pi2.
+
+   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
+   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2013, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* ISSUES:
+
+   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?
+
+   * Are there any problems with generating n quotient limbs in the q area?  It
+     surely simplifies things.
+
+   * Not yet adequately tested.
+*/
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Define some longlong.h-style macros, but for wider operations.
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+*/
+#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "0"  ((USItype)(s2)),					\
+	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
+	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "0"  ((UDItype)(s2)),					\
+	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
+	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
+	     __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  do {									\
+    UWtype __s0, __s1, __c0, __c1;					\
+    __s0 = (a0) + (b0);							\
+    __s1 = (a1) + (b1);							\
+    __c0 = __s0 < (a0);							\
+    __c1 = __s1 < (a1);							\
+    (s0) = __s0;							\
+    __s1 = __s1 + __c0;							\
+    (s1) = __s1;							\
+    (s2) += __c1 + (__s1 < __c0);					\
+  } while (0)
+#endif
+
+struct precomp_div_1_pi2
+{
+  mp_limb_t dip[2];
+  mp_limb_t d;
+  int norm_cnt;
+};
+
+mp_limb_t
+mpn_div_qr_1u_pi2 (mp_ptr qp,
+		   mp_srcptr up, mp_size_t un,
+		   struct precomp_div_1_pi2 *pd)
+{
+  mp_size_t i;
+  mp_limb_t r, u2, u1, u0;
+  mp_limb_t d0, di1, di0;
+  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
+  mp_limb_t cnd;
+  int cnt;
+
+  ASSERT (un >= 2);
+  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
+  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
+  ASSERT_MPN (up, un);
+
+#define q3 q3a
+#define q2 q2b
+#define q1 q1b
+
+  up += un - 3;
+  cnt = pd->norm_cnt;
+  r = up[2] >> (GMP_NUMB_BITS - cnt);
+  d0 = pd->d << cnt;
+
+  qp += un - 2;
+
+  di1 = pd->dip[1];
+  di0 = pd->dip[0];
+
+  for (i = un - 3; i >= 0; i -= 2)
+    {
+      u2 = r;
+      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
+      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));
+
+      /* Dividend in {r,u1,u0} */
+
+      umul_ppmm (q1d,q0d, u1, di0);
+      umul_ppmm (q2b,q1b, u1, di1);
+      q2b++;				/* cannot spill */
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
+
+      umul_ppmm (q2c,q1c, u2,  di0);
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
+      umul_ppmm (q3a,q2a, u2, di1);
+
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
+
+      q3 += r;
+
+      r = u0 - q2 * d0;
+
+      cnd = (r >= q1);
+      r += d0 & -cnd;
+      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);
+
+      if (UNLIKELY (r >= d0))
+	{
+	  r -= d0;
+	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
+	}
+
+      qp[0] = q2;
+      qp[1] = q3;
+
+      up -= 2;
+      qp -= 2;
+    }
+
+  if ((un & 1) != 0)
+    {
+      u2 = r;
+      u1 = (up[2] << cnt);
+
+      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
+      qp[1] = q3;
+    }
+  else
+    {
+      u2 = r;
+      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
+      u0 = (up[1] << cnt);
+
+      /* Dividend in {r,u1,u0} */
+
+      umul_ppmm (q1d,q0d, u1, di0);
+      umul_ppmm (q2b,q1b, u1, di1);
+      q2b++;				/* cannot spill */
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);
+
+      umul_ppmm (q2c,q1c, u2,  di0);
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
+      umul_ppmm (q3a,q2a, u2, di1);
+
+      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);
+
+      q3 += r;
+
+      r = u0 - q2 * d0;
+
+      cnd = (r >= q1);
+      r += d0 & -cnd;
+      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);
+
+      if (UNLIKELY (r >= d0))
+	{
+	  r -= d0;
+	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
+	}
+
+      qp[0] = q2;
+      qp[1] = q3;
+    }
+
+  return r >> cnt;
+
+#undef q3
+#undef q2
+#undef q1
+}
diff --git a/third_party/gmp/mpn/generic/div_qr_2.c b/third_party/gmp/mpn/generic/div_qr_2.c
new file mode 100644
index 0000000..f7add44
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_2.c
@@ -0,0 +1,314 @@
+/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
+   quotient.  The divisor is two limbs.
+
+   Contributed to the GNU project by Torbjorn Granlund and Niels Möller
+
+   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef DIV_QR_2_PI2_THRESHOLD
+/* Disabled unless explicitly tuned. */
+#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
+#endif
+
+#ifndef SANITY_CHECK
+#define SANITY_CHECK 0
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
+     an additional sum operand.
+   * add_csaac accepts two addends and a carry in, and generates a sum and a
+     carry out.  A little like a "full adder".
+*/
+#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "0"  ((USItype)(s2)),					\
+	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
+	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if defined (__amd64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "0"  ((UDItype)(s2)),					\
+	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
+	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__aarch64__) && W_TYPE_SIZE == 64
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %3, xzr"\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
+	     __CLOBBER_CC)
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
+	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
+	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
+	     __CLOBBER_CC)
+#endif
+
+#endif /* __GNUC__ */
+
+#ifndef add_sssaaaa
+#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
+  do {									\
+    UWtype __s0, __s1, __c0, __c1;					\
+    __s0 = (a0) + (b0);							\
+    __s1 = (a1) + (b1);							\
+    __c0 = __s0 < (a0);							\
+    __c1 = __s1 < (a1);							\
+    (s0) = __s0;							\
+    __s1 = __s1 + __c0;							\
+    (s1) = __s1;							\
+    (s2) += __c1 + (__s1 < __c0);					\
+  } while (0)
+#endif
+
+/* Typically used with r1, r0 same as n3, n2. Other types of overlap
+   between inputs and outputs are not supported. */
+#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
+  do {									\
+    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
+    mp_limb_t _t1, _t0;							\
+    mp_limb_t _mask;							\
+									\
+    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */	\
+    umul_ppmm (_q2,_q1, n2, di1);					\
+    umul_ppmm (_q3,_q2a, n3, di1);					\
+    ++_q2;	/* _q2 cannot overflow */				\
+    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);				\
+    umul_ppmm (_q2c,_q1c, n3, di0);					\
+    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);			\
+    umul_ppmm (_q1d,_q0, n2, di0);					\
+    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
+    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);			\
+									\
+    umul_ppmm (_t1,_t0, _q2, d0);					\
+    _t1 += _q2 * d1 + _q3 * d0;						\
+									\
+    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
+									\
+    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
+    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
+    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
+									\
+    if (UNLIKELY (r1 >= d1))						\
+      {									\
+	if (r1 > d1 || r0 >= d0)					\
+	  {								\
+	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
+	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
+	  }								\
+      }									\
+    (q1) = _q3;								\
+    (q0) = _q2;								\
+  } while (0)
+
+static void
+invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
+{
+  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
+  invert_limb (v1, d1);
+  p1 = d1 * v1;
+  /* <1, v1> * d1 = <B-1, p1> */
+  p1 += d0;
+  if (p1 < d0)
+    {
+      v1--;
+      mask = -(mp_limb_t) (p1 >= d1);
+      p1 -= d1;
+      v1 += mask;
+      p1 -= mask & d1;
+    }
+  /* <1, v1> * d1 + d0 = <B-1, p1> */
+  umul_ppmm (t1, p0, d0, v1);
+  p1 += t1;
+  if (p1 < t1)
+    {
+      if (UNLIKELY (p1 >= d1))
+	{
+	  if (p1 > d1 || p0 >= d0)
+	    {
+	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
+	      v1--;
+	    }
+	}
+      sub_ddmmss (p1, p0, p1, p0, d1, d0);
+      v1--;
+    }
+  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
+   * with <p1, p0> + <d1, d0> >= B^2.
+   *
+   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
+   * partial remainder after <1, v1> is
+   *
+   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
+   *                              = <~p1, ~p0, B-1>
+   */
+  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
+  di[0] = v0;
+  di[1] = v1;
+
+#if SANITY_CHECK
+  {
+    mp_limb_t tp[4];
+    mp_limb_t dp[2];
+    dp[0] = d0;
+    dp[1] = d1;
+    mpn_mul_n (tp, dp, di, 2);
+    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
+    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
+    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
+    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
+  }
+#endif
+}
+
+static mp_limb_t
+mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
+{
+  mp_limb_t qh;
+  mp_size_t i;
+  mp_limb_t r1, r0;
+
+  ASSERT (nn >= 2);
+  ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+  r1 = np[nn-1];
+  r0 = np[nn-2];
+
+  qh = 0;
+  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+    {
+#if GMP_NAIL_BITS == 0
+      sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+      r0 = r0 - d0;
+      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+      r0 &= GMP_NUMB_MASK;
+#endif
+      qh = 1;
+    }
+
+  for (i = nn - 2; i >= 2; i -= 2)
+    {
+      mp_limb_t n1, n0, q1, q0;
+      n1 = np[i-1];
+      n0 = np[i-2];
+      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
+      qp[i-1] = q1;
+      qp[i-2] = q0;
+    }
+
+  if (i > 0)
+    {
+      mp_limb_t q;
+      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
+      qp[0] = q;
+    }
+  rp[1] = r1;
+  rp[0] = r0;
+
+  return qh;
+}
+
+
+/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
+   significant quotient limbs at qp and the 2-limb remainder at rp.
+   Return the most significant limb of the quotient.
+
+   Preconditions:
+   1. qp must either not overlap with the other operands at all, or
+      qp >= np + 2 must hold true.  (This means that it's possible to put
+      the quotient in the high part of {np,nn}, right above the remainder.)
+   2. nn >= 2.  */
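+
+/* Illustrative only, not part of GMP: a caller sketch honoring the
+   preconditions above; nn == 5 is hypothetical and B is the limb base.
+
+     mp_limb_t np[5], dp[2], qp[3], rp[2], qh;
+     // ... fill np and dp, with dp[1] != 0 ...
+     qh = mpn_div_qr_2 (qp, rp, np, 5, dp);
+     // quotient == qh * B^3 + {qp,3}, remainder == {rp,2}
+*/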
+
+mp_limb_t
+mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+	      mp_srcptr dp)
+{
+  mp_limb_t d1;
+  mp_limb_t d0;
+  gmp_pi1_t dinv;
+
+  ASSERT (nn >= 2);
+  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
+  ASSERT_MPN (np, nn);
+  ASSERT_MPN (dp, 2);
+
+  d1 = dp[1]; d0 = dp[0];
+
+  ASSERT (d1 > 0);
+
+  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
+    {
+      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
+	{
+	  gmp_pi1_t dinv;
+	  invert_pi1 (dinv, d1, d0);
+	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
+	}
+      else
+	{
+	  mp_limb_t di[2];
+	  invert_4by2 (di, d1, d0);
+	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
+	}
+    }
+  else
+    {
+      int shift;
+      count_leading_zeros (shift, d1);
+      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
+      d0 <<= shift;
+      invert_pi1 (dinv, d1, d0);
+      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
+    }
+}
diff --git a/third_party/gmp/mpn/generic/div_qr_2n_pi1.c b/third_party/gmp/mpn/generic/div_qr_2n_pi1.c
new file mode 100644
index 0000000..131a811
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_2n_pi1.c
@@ -0,0 +1,84 @@
+/* mpn_div_qr_2n_pi1
+
+   Contributed to the GNU project by Torbjorn Granlund and Niels Möller
+
+   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* 3/2 loop, for normalized divisor */
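+/* Each udiv_qr_3by2 step below divides the three-limb value <r1,r0,n0>
+   by <d1,d0>, using the precomputed inverse di, yielding one quotient
+   limb and the next two-limb remainder <r1,r0>.  */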
+mp_limb_t
+mpn_div_qr_2n_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di)
+{
+  mp_limb_t qh;
+  mp_size_t i;
+  mp_limb_t r1, r0;
+
+  ASSERT (nn >= 2);
+  ASSERT (d1 & GMP_NUMB_HIGHBIT);
+
+  np += nn - 2;
+  r1 = np[1];
+  r0 = np[0];
+
+  qh = 0;
+  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+    {
+#if GMP_NAIL_BITS == 0
+      sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+      r0 = r0 - d0;
+      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
+      r0 &= GMP_NUMB_MASK;
+#endif
+      qh = 1;
+    }
+
+  for (i = nn - 2 - 1; i >= 0; i--)
+    {
+      mp_limb_t n0, q;
+      n0 = np[-1];
+      udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di);
+      np--;
+      qp[i] = q;
+    }
+
+  rp[1] = r1;
+  rp[0] = r0;
+
+  return qh;
+}
diff --git a/third_party/gmp/mpn/generic/div_qr_2u_pi1.c b/third_party/gmp/mpn/generic/div_qr_2u_pi1.c
new file mode 100644
index 0000000..70e617b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/div_qr_2u_pi1.c
@@ -0,0 +1,76 @@
+/* mpn_div_qr_2u_pi1
+
+   Contributed to the GNU project by Niels Möller
+
+   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* 3/2 loop, for unnormalized divisor. Caller must pass shifted d1 and
+   d0, while {np,nn} is shifted on the fly. */
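+/* The dividend is shifted on the fly: each iteration ORs the high bits
+   of np[i] into r1 and then uses np[i] << shift as the new low window
+   limb r0.  */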
+mp_limb_t
+mpn_div_qr_2u_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+		   mp_limb_t d1, mp_limb_t d0, int shift, mp_limb_t di)
+{
+  mp_limb_t qh;
+  mp_limb_t r2, r1, r0;
+  mp_size_t i;
+
+  ASSERT (nn >= 2);
+  ASSERT (d1 & GMP_NUMB_HIGHBIT);
+  ASSERT (shift > 0);
+
+  r2 = np[nn-1] >> (GMP_LIMB_BITS - shift);
+  r1 = (np[nn-1] << shift) | (np[nn-2] >> (GMP_LIMB_BITS - shift));
+  r0 = np[nn-2] << shift;
+
+  udiv_qr_3by2 (qh, r2, r1, r2, r1, r0, d1, d0, di);
+
+  for (i = nn - 2 - 1; i >= 0; i--)
+    {
+      mp_limb_t q;
+      r0 = np[i];
+      r1 |= r0 >> (GMP_LIMB_BITS - shift);
+      r0 <<= shift;
+      udiv_qr_3by2 (q, r2, r1, r2, r1, r0, d1, d0, di);
+      qp[i] = q;
+    }
+
+  rp[0] = (r1 >> shift) | (r2 << (GMP_LIMB_BITS - shift));
+  rp[1] = r2 >> shift;
+
+  return qh;
+}
diff --git a/third_party/gmp/mpn/generic/dive_1.c b/third_party/gmp/mpn/generic/dive_1.c
new file mode 100644
index 0000000..056f5b9
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dive_1.c
@@ -0,0 +1,146 @@
+/* mpn_divexact_1 -- mpn by limb exact division.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003, 2005, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+
+/* Divide a={src,size} by d=divisor and store the quotient in q={dst,size}.
+   q will only be correct if d divides a exactly.
+
+   A separate loop is used for shift==0 because n<<GMP_LIMB_BITS doesn't
+   give zero on all CPUs (for instance it doesn't on the x86s).  This
+   separate loop might run faster too, helping odd divisors.
+
+   Possibilities:
+
+   mpn_divexact_1c could be created, accepting and returning c.  This would
+   let a long calculation be done piece by piece.  Currently there's no
+   particular need for that, and not returning c means that a final umul can
+   be skipped.
+
+   Another use for returning c would be letting the caller know whether the
+   division was in fact exact.  It would work just to return the carry bit
+   "c=(l>s)" and let the caller do a final umul if interested.
+
+   When the divisor is even, the factors of two could be handled with a
+   separate mpn_rshift, instead of shifting on the fly.  That might be
+   faster on some CPUs and would mean just the shift==0 style loop would be
+   needed.
+
+   If n<<GMP_LIMB_BITS gives zero on a particular CPU then the separate
+   shift==0 loop is unnecessary, and could be eliminated if there's no great
+   speed difference.
+
+   It's not clear whether "/" is the best way to handle size==1.  Alpha gcc
+   2.95 for instance has a poor "/" and might prefer the modular method.
+   Perhaps a tuned parameter should control this.
+
+   If src[size-1] < divisor then dst[size-1] will be zero, and one divide
+   step could be skipped.  A test at last step for s<divisor (or ls in the
+   even case) might be a good way to do that.  But if this code is often
+   used with small divisors then it might not be worth bothering.  */
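+
+/* Illustrative only, not part of GMP: the per-limb recurrence of the
+   shift==0 loop below, paraphrased.  With inverse * divisor == 1 (mod B),
+
+     l = s - c  (mod B), with borrow b      // SUBC_LIMB
+     dst[i] = l * inverse  (mod B)          // exact quotient limb
+     c = b + high (dst[i] * divisor)        // carry into the next limb
+
+   For example, with 64-bit limbs the inverse of 3 is 0xAAAAAAAAAAAAAAAB,
+   since 3 * 0xAAAAAAAAAAAAAAAB == 2^65 + 1 == 1 (mod 2^64).  */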
+
+void
+mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+  mp_size_t  i;
+  mp_limb_t  c, h, l, ls, s, s_next, inverse, dummy;
+  unsigned   shift;
+
+  ASSERT (size >= 1);
+  ASSERT (divisor != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (divisor);
+
+  if ((divisor & 1) == 0)
+    {
+      count_trailing_zeros (shift, divisor);
+      divisor >>= shift;
+    }
+  else
+    shift = 0;
+
+  binvert_limb (inverse, divisor);
+  divisor <<= GMP_NAIL_BITS;
+
+  if (shift != 0)
+    {
+      c = 0;
+
+      s = src[0];
+
+      for (i = 1; i < size; i++)
+	{
+	  s_next = src[i];
+	  ls = ((s >> shift) | (s_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK;
+	  s = s_next;
+
+	  SUBC_LIMB (c, l, ls, c);
+
+	  l = (l * inverse) & GMP_NUMB_MASK;
+	  dst[i - 1] = l;
+
+	  umul_ppmm (h, dummy, l, divisor);
+	  c += h;
+	}
+
+      ls = s >> shift;
+      l = ls - c;
+      l = (l * inverse) & GMP_NUMB_MASK;
+      dst[size - 1] = l;
+    }
+  else
+    {
+      s = src[0];
+
+      l = (s * inverse) & GMP_NUMB_MASK;
+      dst[0] = l;
+      c = 0;
+
+      for (i = 1; i < size; i++)
+	{
+	  umul_ppmm (h, dummy, l, divisor);
+	  c += h;
+
+	  s = src[i];
+	  SUBC_LIMB (c, l, s, c);
+
+	  l = (l * inverse) & GMP_NUMB_MASK;
+	  dst[i] = l;
+	}
+    }
+}
diff --git a/third_party/gmp/mpn/generic/diveby3.c b/third_party/gmp/mpn/generic/diveby3.c
new file mode 100644
index 0000000..7dee0bc
--- /dev/null
+++ b/third_party/gmp/mpn/generic/diveby3.c
@@ -0,0 +1,173 @@
+/* mpn_divexact_by3c -- mpn exact division by 3.
+
+Copyright 2000-2003, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if DIVEXACT_BY3_METHOD == 0
+
+mp_limb_t
+mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
+{
+  mp_limb_t r;
+  r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
+
+  /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
+     We want to return C.  We compute the remainder mod 4 and notice that the
+     inverse of (2^(2k)-1)/3 mod 4 is 1.  */
+  return r & 3;
+}
+
+#endif
+
+#if DIVEXACT_BY3_METHOD == 1
+
+/* The algorithm here is basically the same as mpn_divexact_1, as described
+   in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
+   borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
+   high(divisor*q) can be determined with two comparisons instead of a
+   multiply.
+
+   The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
+   1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
+   2.95.2 at least).
+
+   It will be noted that the new c is formed by adding three values each 0
+   or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
+   causes a borrow, that leaves a limb value of either 0xFF...FF or
+   0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
+   0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
+   respectively, hence a total of no more than 2.
+
+   Alternatives:
+
+   This implementation has each multiply on the dependent chain, due to
+   "l=s-c".  See below for alternative code which avoids that.  */
+
+mp_limb_t
+mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
+{
+  mp_limb_t  l, q, s;
+  mp_size_t  i;
+
+  ASSERT (un >= 1);
+  ASSERT (c == 0 || c == 1 || c == 2);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
+
+  i = 0;
+  do
+    {
+      s = up[i];
+      SUBC_LIMB (c, l, s, c);
+
+      q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
+      rp[i] = q;
+
+      c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
+      c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
+    }
+  while (++i < un);
+
+  ASSERT (c == 0 || c == 1 || c == 2);
+  return c;
+}
+
+
+#endif
+
+#if DIVEXACT_BY3_METHOD == 2
+
+/* The following alternative code re-arranges the quotient calculation from
+   (src[i]-c)*inverse to instead
+
+       q = src[i]*inverse - c*inverse
+
+   thereby allowing src[i]*inverse to be scheduled back as far as desired,
+   making full use of multiplier throughput and leaving just some carry
+   handling on the dependent chain.
+
+   The carry handling consists of determining the c for the next iteration.
+   This is the same as described above, namely look for any borrow from
+   src[i]-c, and at the high limb of 3*q.
+
+   high(3*q) is done with two comparisons as above (in c2 and c3).  The
+   borrow from src[i]-c is incorporated into those by noting that if there's
+   a borrow then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
+   giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
+   enough to make high(3*q) come out 1 bigger, as required.
+
+   l = -c*inverse is calculated at the same time as c, since for most chips
+   it can be more conveniently derived from separate c1/c2/c3 values than
+   from a combined c equal to 0, 1 or 2.
+
+   The net effect is that with good pipelining this loop should be able to
+   run at perhaps 4 cycles/limb, depending on available execute resources
+   etc.
+
+   Usage:
+
+   This code is not used by default, since we really can't rely on the
+   compiler generating a good software pipeline, nor on such an approach
+   even being worthwhile on all CPUs.
+
+   Itanium is one chip where this algorithm helps though, see
+   mpn/ia64/diveby3.asm.  */
+
+mp_limb_t
+mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
+{
+  mp_limb_t  s, sm, cl, q, qx, c2, c3;
+  mp_size_t  i;
+
+  ASSERT (un >= 1);
+  ASSERT (cy == 0 || cy == 1 || cy == 2);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
+
+  cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
+
+  for (i = 0; i < un; i++)
+    {
+      s = up[i];
+      sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
+
+      q = (cl + sm) & GMP_NUMB_MASK;
+      rp[i] = q;
+      qx = q + (s < cy);
+
+      c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
+      c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3;
+
+      cy = c2 + c3;
+      cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
+    }
+
+  return cy;
+}
+
+#endif
diff --git a/third_party/gmp/mpn/generic/divexact.c b/third_party/gmp/mpn/generic/divexact.c
new file mode 100644
index 0000000..ec417df
--- /dev/null
+++ b/third_party/gmp/mpn/generic/divexact.c
@@ -0,0 +1,296 @@
+/* mpn_divexact(qp,np,nn,dp,dn,tp) -- Divide N = {np,nn} by D = {dp,dn} storing
+   the result in Q = {qp,nn-dn+1} expecting no remainder.  Overlap allowed
+   between Q and N; all other overlap disallowed.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2007, 2009, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#if 1
+void
+mpn_divexact (mp_ptr qp,
+	      mp_srcptr np, mp_size_t nn,
+	      mp_srcptr dp, mp_size_t dn)
+{
+  unsigned shift;
+  mp_size_t qn;
+  mp_ptr tp;
+  TMP_DECL;
+
+  ASSERT (dn > 0);
+  ASSERT (nn >= dn);
+  ASSERT (dp[dn-1] > 0);
+
+  while (dp[0] == 0)
+    {
+      ASSERT (np[0] == 0);
+      dp++;
+      np++;
+      dn--;
+      nn--;
+    }
+
+  if (dn == 1)
+    {
+      MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]);
+      return;
+    }
+
+  TMP_MARK;
+
+  qn = nn + 1 - dn;
+  count_trailing_zeros (shift, dp[0]);
+
+  if (shift > 0)
+    {
+      mp_ptr wp;
+      mp_size_t ss;
+      ss = (dn > qn) ? qn + 1 : dn;
+
+      tp = TMP_ALLOC_LIMBS (ss);
+      mpn_rshift (tp, dp, ss, shift);
+      dp = tp;
+
+      /* Since we have excluded dn == 1, we have nn > qn, and we need
+	 to shift one limb beyond qn. */
+      wp = TMP_ALLOC_LIMBS (qn + 1);
+      mpn_rshift (wp, np, qn + 1, shift);
+      np = wp;
+    }
+
+  if (dn > qn)
+    dn = qn;
+
+  tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn));
+  mpn_bdiv_q (qp, np, qn, dp, dn, tp);
+  TMP_FREE;
+
+  /* Since bdiv_q computes -N/D (mod B^{qn}), we must negate now. */
+  mpn_neg (qp, qp, qn);
+}
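+
+/* For illustration only (not part of GMP): the limb-level primitive the
+   bdiv path above is built on.  For odd d, an inverse of d mod 2^64 can be
+   found by Newton iteration, each step doubling the number of correct low
+   bits (the same scheme binvert_limb uses); an exact division then becomes
+   a single multiply.  A standalone C99 sketch assuming a 64-bit limb:
+
+     #include <assert.h>
+     #include <stdint.h>
+
+     static uint64_t binvert64 (uint64_t d)   // d must be odd
+     {
+       uint64_t x = d;        // correct to 3 bits, since d*d == 1 mod 8
+       x *= 2 - d * x;        //  6 bits
+       x *= 2 - d * x;        // 12 bits
+       x *= 2 - d * x;        // 24 bits
+       x *= 2 - d * x;        // 48 bits
+       x *= 2 - d * x;        // 96 >= 64 bits
+       return x;
+     }
+
+     int main (void)
+     {
+       uint64_t d = 12345, n = 987 * d;       // d odd, d divides n
+       assert (d * binvert64 (d) == 1);
+       assert (n * binvert64 (d) == 987);     // exact quotient, no division
+       return 0;
+     }
+*/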
+
+#else
+
+/* We use the Jebelean's bidirectional exact division algorithm.  This is
+   somewhat naively implemented, with equal quotient parts done by 2-adic
+   division and truncating division.  Since 2-adic division is faster, it
+   should be used for a larger chunk.
+
+   This code is horrendously ugly, in all sorts of ways.
+
+   * It was hacked without much care or thought, but with a testing program.
+   * It handles scratch space frivolously, and furthermore the itch function
+     is broken.
+   * Doesn't provide any measures to deal with mu_divappr_q's +3 error.  We
+     have yet to provoke an error due to this, though.
+   * Algorithm selection leaves a lot to be desired.  In particular, the choice
+     between DC and MU isn't a single cutoff point, but we treat it like one.
+   * It makes the msb part 1 or 2 limbs larger than the lsb part, in spite of
+     that the latter is faster.  We should at least reverse this, but perhaps
+     we should make the lsb part considerably larger.  (How do we tune this?)
+*/
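+
+/* For illustration only (not part of GMP): a toy version of the
+   bidirectional split on single words.  The low half of the quotient is
+   recovered 2-adically (mod 2^32) and the high half by truncating
+   division, and neither half depends on the other, which is the point of
+   Jebelean's scheme.  A sketch assuming an odd divisor dividing n exactly,
+   with the quotient fitting in 64 bits; the truncating side lazily reuses
+   a full division here, where the real algorithm only needs the high part
+   of N:
+
+     #include <stdint.h>
+
+     typedef unsigned __int128 u128;
+
+     static uint64_t divexact_bidir (u128 n, uint64_t d)  // d odd, d | n
+     {
+       uint32_t dinv = (uint32_t) d;          // Newton: 3 -> 6 -> 12 -> 24 -> 48 bits
+       for (int i = 0; i < 4; i++)
+         dinv *= 2 - (uint32_t) d * dinv;
+       uint32_t qlo = (uint32_t) n * dinv;    // q mod 2^32, the 2-adic side
+       uint32_t qhi = (uint32_t) ((uint64_t) (n / d) >> 32);  // truncating side
+       return ((uint64_t) qhi << 32) | qlo;
+     }
+*/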
+
+mp_size_t
+mpn_divexact_itch (mp_size_t nn, mp_size_t dn)
+{
+  return nn + dn;		/* FIXME this is not right */
+}
+
+void
+mpn_divexact (mp_ptr qp,
+	      mp_srcptr np, mp_size_t nn,
+	      mp_srcptr dp, mp_size_t dn,
+	      mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_size_t nn0, qn0;
+  mp_size_t nn1, qn1;
+  mp_ptr tp;
+  mp_limb_t qml;
+  mp_limb_t qh;
+  int cnt;
+  mp_ptr xdp;
+  mp_limb_t di;
+  mp_limb_t cy;
+  gmp_pi1_t dinv;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  qn = nn - dn + 1;
+
+  /* For small divisors, and small quotients, don't use Jebelean's algorithm. */
+  if (dn < DIVEXACT_JEB_THRESHOLD || qn < DIVEXACT_JEB_THRESHOLD)
+    {
+      tp = scratch;
+      MPN_COPY (tp, np, qn);
+      binvert_limb (di, dp[0]);  di = -di;
+      dn = MIN (dn, qn);
+      mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
+      TMP_FREE;
+      return;
+    }
+
+  qn0 = ((nn - dn) >> 1) + 1;	/* low quotient size */
+
+  /* If quotient is much larger than the divisor, the bidirectional algorithm
+     does not work as currently implemented.  Fall back to plain bdiv.  */
+  if (qn0 > dn)
+    {
+      if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD))
+	{
+	  tp = scratch;
+	  MPN_COPY (tp, np, qn);
+	  binvert_limb (di, dp[0]);  di = -di;
+	  dn = MIN (dn, qn);
+	  mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
+	}
+      else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
+	{
+	  tp = scratch;
+	  MPN_COPY (tp, np, qn);
+	  binvert_limb (di, dp[0]);  di = -di;
+	  mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di);
+	}
+      else
+	{
+	  mpn_mu_bdiv_q (qp, np, qn, dp, dn, scratch);
+	}
+      TMP_FREE;
+      return;
+    }
+
+  nn0 = qn0 + qn0;
+
+  nn1 = nn0 - 1 + ((nn-dn) & 1);
+  qn1 = qn0;
+  if (LIKELY (qn0 != dn))
+    {
+      nn1 = nn1 + 1;
+      qn1 = qn1 + 1;
+      if (UNLIKELY (dp[dn - 1] == 1 && qn1 != dn))
+	{
+	  /* If the leading divisor limb == 1, i.e. has just one bit, we have
+	     to include an extra limb in order to get the needed overlap.  */
+	  /* FIXME: Now with the mu_divappr_q function, we should really need
+	     more overlap. That indicates one of two things: (1) The test code
+	     is not good. (2) We actually overlap too much by default.  */
+	  nn1 = nn1 + 1;
+	  qn1 = qn1 + 1;
+	}
+    }
+
+  tp = TMP_ALLOC_LIMBS (nn1 + 1);
+
+  count_leading_zeros (cnt, dp[dn - 1]);
+
+  /* Normalize divisor, store into tmp area.  */
+  if (cnt != 0)
+    {
+      xdp = TMP_ALLOC_LIMBS (qn1);
+      mpn_lshift (xdp, dp + dn - qn1, qn1, cnt);
+    }
+  else
+    {
+      xdp = (mp_ptr) dp + dn - qn1;
+    }
+
+  /* Shift dividend according to the divisor normalization.  */
+  /* FIXME: We compute too much here for XX_divappr_q, but these functions'
+     interfaces want a pointer to the imaginary least significant limb, not
+     to the least significant *used* limb.  Of course, we could leave nn1-qn1
+     rubbish limbs in the low part, to save some time.  */
+  if (cnt != 0)
+    {
+      cy = mpn_lshift (tp, np + nn - nn1, nn1, cnt);
+      if (cy != 0)
+	{
+	  tp[nn1] = cy;
+	  nn1++;
+	}
+    }
+  else
+    {
+      /* FIXME: This copy is not needed for mpn_mu_divappr_q, except when the
+	 mpn_sub_n right before is executed.  */
+      MPN_COPY (tp, np + nn - nn1, nn1);
+    }
+
+  invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]);
+  if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD))
+    {
+      qp[qn0 - 1 + nn1 - qn1] = mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32);
+    }
+  else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD))
+    {
+      qp[qn0 - 1 + nn1 - qn1] = mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv);
+    }
+  else
+    {
+      /* FIXME: mpn_mu_divappr_q doesn't handle qh != 0.  Work around it with a
+	 conditional subtraction here.  */
+      qh = mpn_cmp (tp + nn1 - qn1, xdp, qn1) >= 0;
+      if (qh)
+	mpn_sub_n (tp + nn1 - qn1, tp + nn1 - qn1, xdp, qn1);
+      mpn_mu_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, scratch);
+      qp[qn0 - 1 + nn1 - qn1] = qh;
+    }
+  qml = qp[qn0 - 1];
+
+  binvert_limb (di, dp[0]);  di = -di;
+
+  if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD))
+    {
+      MPN_COPY (tp, np, qn0);
+      mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
+    }
+  else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD))
+    {
+      MPN_COPY (tp, np, qn0);
+      mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
+    }
+  else
+    {
+      mpn_mu_bdiv_q (qp, np, qn0, dp, qn0, scratch);
+    }
+
+  if (qml < qp[qn0 - 1])
+    mpn_decr_u (qp + qn0, 1);
+
+  TMP_FREE;
+}
+#endif
diff --git a/third_party/gmp/mpn/generic/divis.c b/third_party/gmp/mpn/generic/divis.c
new file mode 100644
index 0000000..f989ddb
--- /dev/null
+++ b/third_party/gmp/mpn/generic/divis.c
@@ -0,0 +1,194 @@
+/* mpn_divisible_p -- mpn by mpn divisibility test
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2001, 2002, 2005, 2009, 2014, 2017, 2018 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Determine whether A={ap,an} is divisible by D={dp,dn}.  Must have both
+   operands normalized, meaning high limbs non-zero, except that an==0 is
+   allowed.
+
+   There usually won't be many low zero bits on D, but the checks for this
+   are fast and might pick up a few operand combinations, in particular they
+   might reduce D to fit the single-limb mod_1/modexact_1 code.
+
+   Future:
+
+   Getting the remainder limb by limb would make an early exit possible on
+   finding a non-zero.  This would probably have to be bdivmod style so
+   there's no addback, but it would need a multi-precision inverse and so
+   might be slower than the plain method (on small sizes at least).
+
+   When D must be normalized (shifted to low bit set), it's possible to
+   suppress the bit-shifting of A down, as long as it's already been checked
+   that A has at least as many trailing zero bits as D.  */
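+
+/* For illustration only (not part of GMP): the low-zero-bits quick reject
+   used below, on single words.  If d has more trailing zero bits than a,
+   then d cannot divide a.  A standalone C99 sketch (the name
+   low_zeros_reject is illustrative):
+
+     #include <stdint.h>
+
+     static int low_zeros_reject (uint64_t a, uint64_t d)  // d != 0
+     {
+       uint64_t dmask = (d & -d) - 1;   // mask of the low zero bits of d
+       return (a & dmask) != 0;         // nonzero => a not divisible by d
+     }
+*/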
+
+int
+mpn_divisible_p (mp_srcptr ap, mp_size_t an,
+		 mp_srcptr dp, mp_size_t dn)
+{
+  mp_limb_t  alow, dlow, dmask;
+  mp_ptr     qp, rp, tp;
+  mp_limb_t di;
+  unsigned  twos;
+  int c;
+  TMP_DECL;
+
+  ASSERT (an >= 0);
+  ASSERT (an == 0 || ap[an-1] != 0);
+  ASSERT (dn >= 1);
+  ASSERT (dp[dn-1] != 0);
+  ASSERT_MPN (ap, an);
+  ASSERT_MPN (dp, dn);
+
+  /* When a<d only a==0 is divisible.
+     Notice this test covers all cases of an==0. */
+  if (an < dn)
+    return (an == 0);
+
+  /* Strip low zero limbs from d, requiring a==0 on those. */
+  for (;;)
+    {
+      alow = *ap;
+      dlow = *dp;
+
+      if (dlow != 0)
+	break;
+
+      if (alow != 0)
+	return 0;  /* a has fewer low zero limbs than d, so not divisible */
+
+      /* a!=0 and d!=0 so won't get to n==0 */
+      an--; ASSERT (an >= 1);
+      dn--; ASSERT (dn >= 1);
+      ap++;
+      dp++;
+    }
+
+  /* a must have at least as many low zero bits as d */
+  dmask = LOW_ZEROS_MASK (dlow);
+  if ((alow & dmask) != 0)
+    return 0;
+
+  if (dn == 1)
+    {
+      if (ABOVE_THRESHOLD (an, BMOD_1_TO_MOD_1_THRESHOLD))
+	return mpn_mod_1 (ap, an, dlow) == 0;
+
+      count_trailing_zeros (twos, dlow);
+      dlow >>= twos;
+      return mpn_modexact_1_odd (ap, an, dlow) == 0;
+    }
+
+  count_trailing_zeros (twos, dlow);
+  if (dn == 2)
+    {
+      mp_limb_t  dsecond = dp[1];
+      if (dsecond <= dmask)
+	{
+	  dlow = (dlow >> twos) | (dsecond << (GMP_NUMB_BITS-twos));
+	  ASSERT_LIMB (dlow);
+	  return MPN_MOD_OR_MODEXACT_1_ODD (ap, an, dlow) == 0;
+	}
+    }
+
+  /* Should we compute Q = A * D^(-1) mod B^k,
+                       R = A - Q * D  mod B^k
+     here, for some small values of k?  Then check if R = 0 (mod B^k).  */
+
+  /* We could also compute A' = A mod T and D' = D mod P, for some
+     P = 3 * 5 * 7 * 11 ..., and then check if any prime factor from P
+     dividing D' also divides A'.  */
+
+  TMP_MARK;
+
+  TMP_ALLOC_LIMBS_2 (rp, an + 1,
+		     qp, an - dn + 1); /* FIXME: Could we avoid this? */
+
+  if (twos != 0)
+    {
+      tp = TMP_ALLOC_LIMBS (dn);
+      ASSERT_NOCARRY (mpn_rshift (tp, dp, dn, twos));
+      dp = tp;
+
+      ASSERT_NOCARRY (mpn_rshift (rp, ap, an, twos));
+    }
+  else
+    {
+      MPN_COPY (rp, ap, an);
+    }
+  if (rp[an - 1] >= dp[dn - 1])
+    {
+      rp[an] = 0;
+      an++;
+    }
+  else if (an == dn)
+    {
+      TMP_FREE;
+      return 0;
+    }
+
+  ASSERT (an > dn);		/* requirement of functions below */
+
+  if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) ||
+      BELOW_THRESHOLD (an - dn, DC_BDIV_QR_THRESHOLD))
+    {
+      binvert_limb (di, dp[0]);
+      mpn_sbpi1_bdiv_qr (qp, rp, an, dp, dn, -di);
+      rp += an - dn;
+    }
+  else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD))
+    {
+      binvert_limb (di, dp[0]);
+      mpn_dcpi1_bdiv_qr (qp, rp, an, dp, dn, -di);
+      rp += an - dn;
+    }
+  else
+    {
+      tp = TMP_ALLOC_LIMBS (mpn_mu_bdiv_qr_itch (an, dn));
+      mpn_mu_bdiv_qr (qp, rp, rp, an, dp, dn, tp);
+    }
+
+  /* In general, bdiv may return either R = 0 or R = D when D divides
+     A. But R = 0 can happen only when A = 0, which we already have
+     excluded. Furthermore, R == D (mod B^{dn}) implies no carry, so
+     we don't need to check the carry returned from bdiv. */
+
+  MPN_CMP (c, rp, dp, dn);
+
+  TMP_FREE;
+  return c == 0;
+}
diff --git a/third_party/gmp/mpn/generic/divrem.c b/third_party/gmp/mpn/generic/divrem.c
new file mode 100644
index 0000000..1da84a8
--- /dev/null
+++ b/third_party/gmp/mpn/generic/divrem.c
@@ -0,0 +1,103 @@
+/* mpn_divrem -- Divide natural numbers, producing both remainder and
+   quotient.  This is now just a middle layer calling mpn_tdiv_qr.
+
+Copyright 1993-1997, 1999-2002, 2005, 2016 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_divrem (mp_ptr qp, mp_size_t qxn,
+	    mp_ptr np, mp_size_t nn,
+	    mp_srcptr dp, mp_size_t dn)
+{
+  ASSERT (qxn >= 0);
+  ASSERT (nn >= dn);
+  ASSERT (dn >= 1);
+  ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn));
+  ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, np, nn) || qp==np+dn+qxn);
+  ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, dp, dn));
+  ASSERT_MPN (np, nn);
+  ASSERT_MPN (dp, dn);
+
+  if (dn == 1)
+    {
+      mp_limb_t ret;
+      mp_ptr q2p;
+      mp_size_t qn;
+      TMP_DECL;
+
+      TMP_MARK;
+      q2p = TMP_ALLOC_LIMBS (nn + qxn);
+
+      np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]);
+      qn = nn + qxn - 1;
+      MPN_COPY (qp, q2p, qn);
+      ret = q2p[qn];
+
+      TMP_FREE;
+      return ret;
+    }
+  else if (dn == 2)
+    {
+      return mpn_divrem_2 (qp, qxn, np, nn, dp);
+    }
+  else
+    {
+      mp_ptr q2p;
+      mp_limb_t qhl;
+      mp_size_t qn;
+      TMP_DECL;
+
+      TMP_MARK;
+      if (UNLIKELY (qxn != 0))
+	{
+	  mp_ptr n2p;
+	  TMP_ALLOC_LIMBS_2 (n2p, nn + qxn,
+			     q2p, nn - dn + qxn + 1);
+	  MPN_ZERO (n2p, qxn);
+	  MPN_COPY (n2p + qxn, np, nn);
+	  mpn_tdiv_qr (q2p, np, 0L, n2p, nn + qxn, dp, dn);
+	  qn = nn - dn + qxn;
+	  MPN_COPY (qp, q2p, qn);
+	  qhl = q2p[qn];
+	}
+      else
+	{
+	  q2p = TMP_ALLOC_LIMBS (nn - dn + 1);
+	  mpn_tdiv_qr (q2p, np, 0L, np, nn, dp, dn);
+	  qn = nn - dn;
+	  MPN_COPY (qp, q2p, qn);
+	  qhl = q2p[qn];
+	}
+      TMP_FREE;
+      return qhl;
+    }
+}
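+
+/* For illustration only (not part of GMP): the effect of qxn is to extend
+   the dividend with qxn zero limbs at the bottom, producing that many
+   "fraction" limbs of the quotient.  A one-limb sketch in C99 using the
+   compiler's 128-bit extension (the name fraction_limb is illustrative):
+
+     #include <stdint.h>
+
+     // The single fraction limb of n/d for n < d: floor(n * 2^64 / d).
+     static uint64_t fraction_limb (uint64_t n, uint64_t d)
+     {
+       return (uint64_t) (((unsigned __int128) n << 64) / d);
+     }
+
+   For example fraction_limb (1, 3) is 0x5555555555555555, the first limb
+   of the base-2^64 expansion of 1/3.  */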
diff --git a/third_party/gmp/mpn/generic/divrem_1.c b/third_party/gmp/mpn/generic/divrem_1.c
new file mode 100644
index 0000000..c13aa79
--- /dev/null
+++ b/third_party/gmp/mpn/generic/divrem_1.c
@@ -0,0 +1,254 @@
+/* mpn_divrem_1 -- mpn by limb division.
+
+Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd,
+   meaning the quotient size where that should happen, the quotient size
+   being how many udiv divisions will be done.
+
+   The default is to use preinv always, CPUs where this doesn't suit have
+   tuned thresholds.  Note in particular that preinv should certainly be
+   used if that's the only division available (USE_PREINV_ALWAYS).  */
+
+#ifndef DIVREM_1_NORM_THRESHOLD
+#define DIVREM_1_NORM_THRESHOLD  0
+#endif
+#ifndef DIVREM_1_UNNORM_THRESHOLD
+#define DIVREM_1_UNNORM_THRESHOLD  0
+#endif
+
+
+
+/* If the cpu only has multiply-by-inverse division (eg. alpha), then NORM
+   and UNNORM thresholds are 0 and only the inversion code is included.
+
+   If multiply-by-inverse is never viable, then NORM and UNNORM thresholds
+   will be MP_SIZE_T_MAX and only the plain division code is included.
+
+   Otherwise mul-by-inverse is better than plain division above some
+   threshold, and best results are obtained by having code for both present.
+
+   The main reason for separating the norm and unnorm cases is that not all
+   CPUs give zero for "n0 >> GMP_LIMB_BITS" which would arise in the unnorm
+   code used on an already normalized divisor.
+
+   If UDIV_NEEDS_NORMALIZATION is false then plain division uses the same
+   non-shifting code for both the norm and unnorm cases, though with
+   different criteria for skipping a division, and with different thresholds
+   of course.  And in fact if inversion is never viable, then that simple
+   non-shifting division would be all that's left.
+
+   The NORM and UNNORM thresholds might not differ much, but if there's
+   going to be separate code for norm and unnorm then it makes sense to have
+   separate thresholds.  One thing that's possible is that the
+   mul-by-inverse might be better only for normalized divisors, due to that
+   case not needing variable bit shifts.
+
+   Notice that the thresholds are tested after the decision to possibly skip
+   one divide step, so they're based on the actual number of divisions done.
+
+   For the unnorm case, it would be possible to call mpn_lshift to adjust
+   the dividend all in one go (into the quotient space say), rather than
+   limb-by-limb in the loop.  This might help if mpn_lshift is a lot faster
+   than what the compiler can generate for EXTRACT.  But this is left to CPU
+   specific implementations to consider, especially since EXTRACT isn't on
+   the dependent chain.  */
+
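+/* For illustration only (not part of GMP): the shape of one
+   multiply-by-inverse step, following Moller and Granlund, "Improved
+   division by invariant integers" (2011).  A sketch assuming a 64-bit limb
+   and a compiler with __int128; the names reciprocal_64 and div_2by1 are
+   illustrative, not the actual invert_limb/udiv_qrnnd_preinv definitions
+   from longlong.h:
+
+     #include <stdint.h>
+
+     typedef unsigned __int128 u128;
+
+     // v = floor((2^128 - 1)/d) - 2^64 for normalized d (high bit set);
+     // the truncating cast performs the subtraction of 2^64.
+     static uint64_t reciprocal_64 (uint64_t d)
+     {
+       return (uint64_t) (~(u128) 0 / d);
+     }
+
+     // Divide u1:u0 by d (requires u1 < d), reusing the reciprocal v.
+     static uint64_t div_2by1 (uint64_t *r, uint64_t u1, uint64_t u0,
+                               uint64_t d, uint64_t v)
+     {
+       u128 q = (u128) v * u1 + (((u128) u1 << 64) | u0);
+       uint64_t q1 = (uint64_t) (q >> 64) + 1;
+       uint64_t q0 = (uint64_t) q;
+       uint64_t r0 = u0 - q1 * d;          // computed mod 2^64
+       if (r0 > q0)                        // at most one step down
+         { q1--; r0 += d; }
+       if (r0 >= d)                        // unlikely, one step up
+         { q1++; r0 -= d; }
+       *r = r0;
+       return q1;
+     }
+*/
+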
+mp_limb_t
+mpn_divrem_1 (mp_ptr qp, mp_size_t qxn,
+	      mp_srcptr up, mp_size_t un, mp_limb_t d)
+{
+  mp_size_t  n;
+  mp_size_t  i;
+  mp_limb_t  n1, n0;
+  mp_limb_t  r = 0;
+
+  ASSERT (qxn >= 0);
+  ASSERT (un >= 0);
+  ASSERT (d != 0);
+  /* FIXME: What's the correct overlap rule when qxn!=0? */
+  ASSERT (MPN_SAME_OR_SEPARATE_P (qp+qxn, up, un));
+
+  n = un + qxn;
+  if (n == 0)
+    return 0;
+
+  d <<= GMP_NAIL_BITS;
+
+  qp += (n - 1);   /* Make qp point at most significant quotient limb */
+
+  if ((d & GMP_LIMB_HIGHBIT) != 0)
+    {
+      if (un != 0)
+	{
+	  /* High quotient limb is 0 or 1, skip a divide step. */
+	  mp_limb_t q;
+	  r = up[un - 1] << GMP_NAIL_BITS;
+	  q = (r >= d);
+	  *qp-- = q;
+	  r -= (d & -q);
+	  r >>= GMP_NAIL_BITS;
+	  n--;
+	  un--;
+	}
+
+      if (BELOW_THRESHOLD (n, DIVREM_1_NORM_THRESHOLD))
+	{
+	plain:
+	  for (i = un - 1; i >= 0; i--)
+	    {
+	      n0 = up[i] << GMP_NAIL_BITS;
+	      udiv_qrnnd (*qp, r, r, n0, d);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  for (i = qxn - 1; i >= 0; i--)
+	    {
+	      udiv_qrnnd (*qp, r, r, CNST_LIMB(0), d);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  return r;
+	}
+      else
+	{
+	  /* Multiply-by-inverse, divisor already normalized. */
+	  mp_limb_t dinv;
+	  invert_limb (dinv, d);
+
+	  for (i = un - 1; i >= 0; i--)
+	    {
+	      n0 = up[i] << GMP_NAIL_BITS;
+	      udiv_qrnnd_preinv (*qp, r, r, n0, d, dinv);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  for (i = qxn - 1; i >= 0; i--)
+	    {
+	      udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  return r;
+	}
+    }
+  else
+    {
+      /* Most significant bit of divisor == 0.  */
+      int cnt;
+
+      /* Skip a division if high < divisor (high quotient 0).  Testing here
+	 before normalizing will still skip as often as possible.  */
+      if (un != 0)
+	{
+	  n1 = up[un - 1] << GMP_NAIL_BITS;
+	  if (n1 < d)
+	    {
+	      r = n1 >> GMP_NAIL_BITS;
+	      *qp-- = 0;
+	      n--;
+	      if (n == 0)
+		return r;
+	      un--;
+	    }
+	}
+
+      if (! UDIV_NEEDS_NORMALIZATION
+	  && BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD))
+	goto plain;
+
+      count_leading_zeros (cnt, d);
+      d <<= cnt;
+      r <<= cnt;
+
+      if (UDIV_NEEDS_NORMALIZATION
+	  && BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD))
+	{
+	  mp_limb_t nshift;
+	  if (un != 0)
+	    {
+	      n1 = up[un - 1] << GMP_NAIL_BITS;
+	      r |= (n1 >> (GMP_LIMB_BITS - cnt));
+	      for (i = un - 2; i >= 0; i--)
+		{
+		  n0 = up[i] << GMP_NAIL_BITS;
+		  nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
+		  udiv_qrnnd (*qp, r, r, nshift, d);
+		  r >>= GMP_NAIL_BITS;
+		  qp--;
+		  n1 = n0;
+		}
+	      udiv_qrnnd (*qp, r, r, n1 << cnt, d);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  for (i = qxn - 1; i >= 0; i--)
+	    {
+	      udiv_qrnnd (*qp, r, r, CNST_LIMB(0), d);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  return r >> cnt;
+	}
+      else
+	{
+	  mp_limb_t  dinv, nshift;
+	  invert_limb (dinv, d);
+	  if (un != 0)
+	    {
+	      n1 = up[un - 1] << GMP_NAIL_BITS;
+	      r |= (n1 >> (GMP_LIMB_BITS - cnt));
+	      for (i = un - 2; i >= 0; i--)
+		{
+		  n0 = up[i] << GMP_NAIL_BITS;
+		  nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
+		  udiv_qrnnd_preinv (*qp, r, r, nshift, d, dinv);
+		  r >>= GMP_NAIL_BITS;
+		  qp--;
+		  n1 = n0;
+		}
+	      udiv_qrnnd_preinv (*qp, r, r, n1 << cnt, d, dinv);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  for (i = qxn - 1; i >= 0; i--)
+	    {
+	      udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv);
+	      r >>= GMP_NAIL_BITS;
+	      qp--;
+	    }
+	  return r >> cnt;
+	}
+    }
+}
diff --git a/third_party/gmp/mpn/generic/divrem_2.c b/third_party/gmp/mpn/generic/divrem_2.c
new file mode 100644
index 0000000..217f2f6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/divrem_2.c
@@ -0,0 +1,118 @@
+/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and
+   quotient.  The divisor is two limbs.
+
+   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1993-1996, 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Divide num {np,nn} by den {dp,2} and write the nn-2 least significant
+   quotient limbs at qp and the 2 long remainder at np.  If qxn is non-zero,
+   generate that many fraction bits and append them after the other quotient
+   limbs.  Return the most significant limb of the quotient, this is always 0
+   or 1.
+
+   Preconditions:
+   1. The most significant bit of the divisor must be set.
+   2. qp must either not overlap with the input operands at all, or
+      qp >= np + 2 must hold true.  (This means that it's possible to put
+      the quotient in the high part of {np,nn}, right above the remainder.
+   3. nn >= 2, even if qxn is non-zero.  */
+
+mp_limb_t
+mpn_divrem_2 (mp_ptr qp, mp_size_t qxn,
+	      mp_ptr np, mp_size_t nn,
+	      mp_srcptr dp)
+{
+  mp_limb_t most_significant_q_limb;
+  mp_size_t i;
+  mp_limb_t r1, r0, d1, d0;
+  gmp_pi1_t di;
+
+  ASSERT (nn >= 2);
+  ASSERT (qxn >= 0);
+  ASSERT (dp[1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! MPN_OVERLAP_P (qp, nn-2+qxn, np, nn) || qp >= np+2);
+  ASSERT_MPN (np, nn);
+  ASSERT_MPN (dp, 2);
+
+  np += nn - 2;
+  d1 = dp[1];
+  d0 = dp[0];
+  r1 = np[1];
+  r0 = np[0];
+
+  most_significant_q_limb = 0;
+  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
+    {
+#if GMP_NAIL_BITS == 0
+      sub_ddmmss (r1, r0, r1, r0, d1, d0);
+#else
+      r0 = r0 - d0;
+      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
+      r0 &= GMP_NUMB_MASK;
+#endif
+      most_significant_q_limb = 1;
+    }
+
+  invert_pi1 (di, d1, d0);
+
+  qp += qxn;
+
+  for (i = nn - 2 - 1; i >= 0; i--)
+    {
+      mp_limb_t n0, q;
+      n0 = np[-1];
+      udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di.inv32);
+      np--;
+      qp[i] = q;
+    }
+
+  if (UNLIKELY (qxn != 0))
+    {
+      qp -= qxn;
+      for (i = qxn - 1; i >= 0; i--)
+	{
+	  mp_limb_t q;
+	  udiv_qr_3by2 (q, r1, r0, r1, r0, CNST_LIMB(0), d1, d0, di.inv32);
+	  qp[i] = q;
+	}
+    }
+
+  np[1] = r1;
+  np[0] = r0;
+
+  return most_significant_q_limb;
+}
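+
+/* For illustration only (not part of GMP): the same quotient/remainder
+   layout on 32-bit limbs, where the 3-by-2 step can lean on the compiler's
+   64- and 128-bit arithmetic instead of udiv_qr_3by2.  A sketch that
+   ignores qxn and nails (the name divrem_2_ref is illustrative):
+
+     #include <stdint.h>
+
+     static uint32_t divrem_2_ref (uint32_t *qp, uint32_t *np, int nn,
+                                   const uint32_t dp[2])  // dp[1] high bit set
+     {
+       uint64_t d = ((uint64_t) dp[1] << 32) | dp[0];
+       uint64_t r = ((uint64_t) np[nn-1] << 32) | np[nn-2];
+       uint32_t qh = 0;
+       if (r >= d) { r -= d; qh = 1; }      // most significant quotient limb
+       for (int i = nn - 3; i >= 0; i--)
+         {
+           unsigned __int128 t = ((unsigned __int128) r << 32) | np[i];
+           qp[i] = (uint32_t) (t / d);      // fits: t < d * 2^32 since r < d
+           r = (uint64_t) (t % d);
+         }
+       np[0] = (uint32_t) r;                // 2-limb remainder left in place
+       np[1] = (uint32_t) (r >> 32);
+       return qh;
+     }
+*/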
diff --git a/third_party/gmp/mpn/generic/dump.c b/third_party/gmp/mpn/generic/dump.c
new file mode 100644
index 0000000..9a4ddf4
--- /dev/null
+++ b/third_party/gmp/mpn/generic/dump.c
@@ -0,0 +1,99 @@
+/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS NOT SAFE TO
+   CALL THIS FUNCTION DIRECTLY.  IN FACT, IT IS ALMOST GUARANTEED THAT THIS
+   FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+
+Copyright 1996, 2000-2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include "gmp-impl.h"
+
+#if GMP_NUMB_BITS % 4 == 0
+void
+mpn_dump (mp_srcptr ptr, mp_size_t n)
+{
+  MPN_NORMALIZE (ptr, n);
+
+  if (n == 0)
+    printf ("0\n");
+  else
+    {
+      n--;
+#if _LONG_LONG_LIMB
+      if ((ptr[n] >> GMP_LIMB_BITS / 2) != 0)
+	{
+	  printf ("%lX", (unsigned long) (ptr[n] >> GMP_LIMB_BITS / 2));
+	  printf ("%0*lX", (GMP_LIMB_BITS / 2 / 4), (unsigned long) ptr[n]);
+	}
+      else
+#endif
+	printf ("%lX", (unsigned long) ptr[n]);
+
+      while (n)
+	{
+	  n--;
+#if _LONG_LONG_LIMB
+	  printf ("%0*lX", (GMP_NUMB_BITS - GMP_LIMB_BITS / 2) / 4,
+		  (unsigned long) (ptr[n] >> GMP_LIMB_BITS / 2));
+	  printf ("%0*lX", GMP_LIMB_BITS / 2 / 4, (unsigned long) ptr[n]);
+#else
+	  printf ("%0*lX", GMP_NUMB_BITS / 4, (unsigned long) ptr[n]);
+#endif
+	}
+      printf ("\n");
+    }
+}
+
+#else
+
+static void
+mpn_recdump (mp_ptr p, mp_size_t n)
+{
+  mp_limb_t lo;
+  if (n != 0)
+    {
+      lo = p[0] & 0xf;
+      mpn_rshift (p, p, n, 4);
+      mpn_recdump (p, n);
+      printf ("%lX", (unsigned long) lo);
+    }
+}
+
+void
+mpn_dump (mp_srcptr p, mp_size_t n)
+{
+  mp_ptr tp;
+  TMP_DECL;
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS (n);
+  MPN_COPY (tp, p, n);   /* mpn_recdump shifts its argument down to zero */
+  mpn_recdump (tp, n);
+  printf ("\n");
+  TMP_FREE;
+}
+
+#endif
diff --git a/third_party/gmp/mpn/generic/fib2_ui.c b/third_party/gmp/mpn/generic/fib2_ui.c
new file mode 100644
index 0000000..0b81571
--- /dev/null
+++ b/third_party/gmp/mpn/generic/fib2_ui.c
@@ -0,0 +1,174 @@
+/* mpn_fib2_ui -- calculate Fibonacci numbers.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include "gmp-impl.h"
+
+/* change this to "#define TRACE(x) x" for diagnostics */
+#define TRACE(x)
+
+
+/* Store F[n] at fp and F[n-1] at f1p.  fp and f1p should have room for
+   MPN_FIB2_SIZE(n) limbs.
+
+   The return value is the actual number of limbs stored, this will be at
+   least 1.  fp[size-1] will be non-zero, except when n==0, in which case
+   fp[0] is 0 and f1p[0] is 1.  f1p[size-1] can be zero, since F[n-1]<F[n]
+   (for n>0).
+
+   Notes: F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k.
+
+   In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the
+   low limb.
+
+   In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the
+   low limb.
+*/
+
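+/* For illustration only (not part of GMP): the binary doubling scheme on
+   single words, using the equivalent identities
+   F[2k] = F[k]*(2*F[k+1] - F[k]) and F[2k+1] = F[k]^2 + F[k+1]^2.
+   A standalone sketch, valid while the results fit in 64 bits (n <= 93);
+   the name fib_pair is illustrative:
+
+     #include <stdint.h>
+
+     static void fib_pair (uint64_t n, uint64_t *fn, uint64_t *fn1)
+     {
+       uint64_t a = 0, b = 1;                    // a = F[0], b = F[1]
+       for (int i = 63; i >= 0; i--)             // bits of n, high to low
+         {
+           uint64_t c = a * (2*b - a);           // F[2k]
+           uint64_t d = a*a + b*b;               // F[2k+1]
+           if ((n >> i) & 1) { a = d; b = c + d; }
+           else              { a = c; b = d; }
+         }
+       *fn = a;                                  // F[n]
+       *fn1 = b - a;                             // F[n-1] = F[n+1] - F[n]
+     }
+*/
+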
+mp_size_t
+mpn_fib2_ui (mp_ptr fp, mp_ptr f1p, unsigned long int n)
+{
+  mp_size_t      size;
+  unsigned long  nfirst, mask;
+
+  TRACE (printf ("mpn_fib2_ui n=%lu\n", n));
+
+  ASSERT (! MPN_OVERLAP_P (fp, MPN_FIB2_SIZE(n), f1p, MPN_FIB2_SIZE(n)));
+
+  /* Take a starting pair from the table. */
+  mask = 1;
+  for (nfirst = n; nfirst > FIB_TABLE_LIMIT; nfirst /= 2)
+    mask <<= 1;
+  TRACE (printf ("nfirst=%lu mask=0x%lX\n", nfirst, mask));
+
+  f1p[0] = FIB_TABLE ((int) nfirst - 1);
+  fp[0]  = FIB_TABLE (nfirst);
+  size = 1;
+
+  /* Skip to the end if the table lookup gives the final answer. */
+  if (mask != 1)
+    {
+      mp_size_t  alloc;
+      mp_ptr        xp;
+      TMP_DECL;
+
+      TMP_MARK;
+      alloc = MPN_FIB2_SIZE (n);
+      xp = TMP_ALLOC_LIMBS (alloc);
+
+      do
+	{
+	  /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from
+	     n&mask upwards.
+
+	     The next bit of n is n&(mask>>1) and we'll double to the pair
+	     fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as
+	     that bit is 0 or 1 respectively.  */
+
+	  TRACE (printf ("k=%lu mask=0x%lX size=%ld alloc=%ld\n",
+			 n >> refmpn_count_trailing_zeros(mask),
+			 mask, size, alloc);
+		 mpn_trace ("fp ", fp, size);
+		 mpn_trace ("f1p", f1p, size));
+
+	  /* fp normalized, f1p at most one high zero */
+	  ASSERT (fp[size-1] != 0);
+	  ASSERT (f1p[size-1] != 0 || f1p[size-2] != 0);
+
+	  /* f1p[size-1] might be zero, but this occurs rarely, so it's not
+	     worth bothering checking for it */
+	  ASSERT (alloc >= 2*size);
+	  mpn_sqr (xp, fp,  size);
+	  mpn_sqr (fp, f1p, size);
+	  size *= 2;
+
+	  /* Shrink if possible.  Since fp was normalized there'll be at
+	     most one high zero on xp (and if there is then there's one on
+	     fp too).  */
+	  ASSERT (xp[size-1] != 0 || fp[size-1] == 0);
+	  size -= (xp[size-1] == 0);
+	  ASSERT (xp[size-1] != 0);  /* only one xp high zero */
+
+	  /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */
+	  f1p[size] = mpn_add_n (f1p, xp, fp, size);
+
+	  /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k.
+	     n&mask is the low bit of our implied k.  */
+
+	  ASSERT ((fp[0] & 2) == 0);
+	  /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */
+	  fp[0] |= (n & mask ? 2 : 0);			/* possible -2 */
+#if HAVE_NATIVE_mpn_rsblsh2_n
+	  fp[size] = mpn_rsblsh2_n (fp, fp, xp, size);
+	  MPN_INCR_U(fp, size + 1, (n & mask ? 0 : 2));	/* possible +2 */
+#else
+	  {
+	    mp_limb_t  c;
+
+	    c = mpn_lshift (xp, xp, size, 2);
+	    xp[0] |= (n & mask ? 0 : 2);	/* possible +2 */
+	    c -= mpn_sub_n (fp, xp, fp, size);
+	    fp[size] = c;
+	  }
+#endif
+	  ASSERT (alloc >= size+1);
+	  size += (fp[size] != 0);
+
+	  /* now n&mask is the new bit of n being considered */
+	  mask >>= 1;
+
+	  /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of
+	     F[2k+1] and F[2k-1].  */
+	  if (n & mask)
+	    ASSERT_NOCARRY (mpn_sub_n (f1p, fp, f1p, size));
+	  else {
+	    ASSERT_NOCARRY (mpn_sub_n ( fp, fp, f1p, size));
+
+	    /* Can have a high zero after replacing F[2k+1] with F[2k].
+	       f1p will have a high zero if fp does. */
+	    ASSERT (fp[size-1] != 0 || f1p[size-1] == 0);
+	    size -= (fp[size-1] == 0);
+	  }
+	}
+      while (mask != 1);
+
+      TMP_FREE;
+    }
+
+  TRACE (printf ("done size=%ld\n", size);
+	 mpn_trace ("fp ", fp, size);
+	 mpn_trace ("f1p", f1p, size));
+
+  return size;
+}
diff --git a/third_party/gmp/mpn/generic/fib2m.c b/third_party/gmp/mpn/generic/fib2m.c
new file mode 100644
index 0000000..89d2b86
--- /dev/null
+++ b/third_party/gmp/mpn/generic/fib2m.c
@@ -0,0 +1,252 @@
+/* mpn_fib2m -- calculate Fibonacci numbers, modulo m.
+
+Contributed to the GNU project by Marco Bodrato, based on the previous
+fib2_ui.c file.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Stores |{ap,n}-{bp,n}| in {rp,n},
+   returns the sign of {ap,n}-{bp,n}. */
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  mp_limb_t  x, y;
+  while (--n >= 0)
+    {
+      x = ap[n];
+      y = bp[n];
+      if (x != y)
+        {
+          ++n;
+          if (x > y)
+            {
+              ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n));
+              return 1;
+            }
+          else
+            {
+              ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n));
+              return -1;
+            }
+        }
+      rp[n] = 0;
+    }
+  return 0;
+}
+
+/* Store F[n] at fp and F[n-1] at f1p.  Both are computed modulo m.
+   fp and f1p should have room for mn*2+1 limbs.
+
+   The sign of one or both the values may be flipped (n-F, instead of F),
+   the return value is 0 (zero) if the signs are coherent (both positive
+   or both negative) and 1 (one) otherwise.
+
+   Notes:
+
+   In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the
+   low limb.
+
+   In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the
+   low limb.
+
+   TODO: Should {tp, 2 * mn} be passed as a scratch pointer?
+   Should the call to mpn_fib2_ui() obtain (up to) 2*mn limbs?
+*/
+
+int
+mpn_fib2m (mp_ptr fp, mp_ptr f1p, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn)
+{
+  unsigned long	nfirst;
+  mp_limb_t	nh;
+  mp_bitcnt_t	nbi;
+  mp_size_t	sn, fn;
+  int		fcnt, ncnt;
+
+  ASSERT (! MPN_OVERLAP_P (fp, MAX(2*mn+1,5), f1p, MAX(2*mn+1,5)));
+  ASSERT (nn > 0 && np[nn - 1] != 0);
+
+  /* Estimate the maximal n such that fibonacci(n) fits in mn limbs. */
+#if GMP_NUMB_BITS % 16 == 0
+  if (UNLIKELY (ULONG_MAX / (23 * (GMP_NUMB_BITS / 16)) <= mn))
+    nfirst = ULONG_MAX;
+  else
+    nfirst = mn * (23 * (GMP_NUMB_BITS / 16));
+#else
+  {
+    mp_bitcnt_t	mbi;
+    mbi = (mp_bitcnt_t) mn * GMP_NUMB_BITS;
+
+    if (UNLIKELY (ULONG_MAX / 23 < mbi))
+      {
+	if (UNLIKELY (ULONG_MAX / 23 * 16 <= mbi))
+	  nfirst = ULONG_MAX;
+	else
+	  nfirst = mbi / 16 * 23;
+      }
+    else
+      nfirst = mbi * 23 / 16;
+  }
+#endif
+
+  sn = nn - 1;
+  nh = np[sn];
+  count_leading_zeros (ncnt, nh);
+  count_leading_zeros (fcnt, nfirst);
+
+  if (fcnt >= ncnt)
+    {
+      ncnt = fcnt - ncnt;
+      nh >>= ncnt;
+    }
+  else if (sn > 0)
+    {
+      ncnt -= fcnt;
+      nh <<= ncnt;
+      ncnt = GMP_NUMB_BITS - ncnt;
+      --sn;
+      nh |= np[sn] >> ncnt;
+    }
+  else
+    ncnt = 0;
+
+  nbi = sn * GMP_NUMB_BITS + ncnt;
+  if (nh > nfirst)
+    {
+      nh >>= 1;
+      ++nbi;
+    }
+
+  ASSERT (nh <= nfirst);
+  /* Take a starting pair from mpn_fib2_ui. */
+  fn = mpn_fib2_ui (fp, f1p, nh);
+  MPN_ZERO (fp + fn, mn - fn);
+  MPN_ZERO (f1p + fn, mn - fn);
+
+  if (nbi == 0)
+    {
+      if (fn == mn)
+	{
+	  mp_limb_t qp[2];
+	  mpn_tdiv_qr (qp, fp, 0, fp, fn, mp, mn);
+	  mpn_tdiv_qr (qp, f1p, 0, f1p, fn, mp, mn);
+	}
+
+      return 0;
+    }
+  else
+    {
+      mp_ptr	tp;
+      unsigned	pb = nh & 1;
+      int	neg;
+      TMP_DECL;
+
+      TMP_MARK;
+
+      tp = TMP_ALLOC_LIMBS (2 * mn + (mn < 2));
+
+      do
+	{
+	  mp_ptr	rp;
+	  /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from
+	     nbi upwards.
+
+	     Based on the next bit of n, we'll double to the pair
+	     fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as
+	     that bit is 0 or 1 respectively.  */
+
+	  mpn_sqr (tp, fp,  mn);
+	  mpn_sqr (fp, f1p, mn);
+
+	  /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */
+	  f1p[2 * mn] = mpn_add_n (f1p, tp, fp, 2 * mn);
+
+	  /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k.
+	     pb is the low bit of our implied k.  */
+
+	  /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */
+	  ASSERT ((fp[0] & 2) == 0);
+	  ASSERT (pb == (pb & 1));
+	  ASSERT ((fp[0] + (pb ? 2 : 0)) == (fp[0] | (pb << 1)));
+	  fp[0] |= pb << 1;		/* possible -2 */
+#if HAVE_NATIVE_mpn_rsblsh2_n
+	  fp[2 * mn] = 1 + mpn_rsblsh2_n (fp, fp, tp, 2 * mn);
+	  MPN_INCR_U(fp, 2 * mn + 1, (1 ^ pb) << 1);	/* possible +2 */
+	  fp[2 * mn] = (fp[2 * mn] - 1) & GMP_NUMB_MAX;
+#else
+	  {
+	    mp_limb_t  c;
+
+	    c = mpn_lshift (tp, tp, 2 * mn, 2);
+	    tp[0] |= (1 ^ pb) << 1;	/* possible +2 */
+	    c -= mpn_sub_n (fp, tp, fp, 2 * mn);
+	    fp[2 * mn] = c & GMP_NUMB_MAX;
+	  }
+#endif
+	  neg = fp[2 * mn] == GMP_NUMB_MAX;
+
+	  /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2 */
+	  /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k */
+
+	  /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of
+	     F[2k+1] and F[2k-1].  */
+	  --nbi;
+	  pb = (np [nbi / GMP_NUMB_BITS] >> (nbi % GMP_NUMB_BITS)) & 1;
+	  rp = pb ? f1p : fp;
+	  if (neg)
+	    {
+	      /* Calculate -(F[2k+1] - F[2k-1]) */
+	      rp[2 * mn] = f1p[2 * mn] + 1 - mpn_sub_n (rp, f1p, fp, 2 * mn);
+	      neg = ! pb;
+	      if (pb) /* fp not overwritten, negate it. */
+		fp [2 * mn] = 1 ^ mpn_neg (fp, fp, 2 * mn);
+	    }
+	  else
+	    {
+	      neg = abs_sub_n (rp, fp, f1p, 2 * mn + 1) < 0;
+	    }
+
+	  mpn_tdiv_qr (tp, fp, 0, fp, 2 * mn + 1, mp, mn);
+	  mpn_tdiv_qr (tp, f1p, 0, f1p, 2 * mn + 1, mp, mn);
+	}
+      while (nbi != 0);
+
+      TMP_FREE;
+
+      return neg;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/gcd.c b/third_party/gmp/mpn/generic/gcd.c
new file mode 100644
index 0000000..3f92cbf
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcd.c
@@ -0,0 +1,266 @@
+/* mpn/gcd.c: mpn_gcd for the gcd of two operands, at least one of them odd.
+
+Copyright 1991, 1993-1998, 2000-2005, 2008, 2010, 2012, 2019 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Uses the HGCD operation described in
+
+     N. Möller, On Schönhage's algorithm and subquadratic integer gcd
+     computation, Math. Comp. 77 (2008), 589-607.
+
+  to reduce inputs until they are of size below GCD_DC_THRESHOLD, and
+  then uses Lehmer's algorithm.
+*/
+
+/* Some reasonable choices are n / 2 (same as in hgcd), and p = (n +
+ * 2)/3, which gives a balanced multiplication in
+ * mpn_hgcd_matrix_adjust. However, p = 2 n/3 gives slightly better
+ * performance. The matrix-vector multiplication is then
+ * 4:1-unbalanced, with matrix elements of size n/6, and vector
+ * elements of size p = 2n/3. */
+
+/* From analysis of the theoretical running time, it appears that when
+ * multiplication takes time O(n^alpha), p should be chosen so that
+ * the ratio of the time for the mpn_hgcd call, and the time for the
+ * multiplication in mpn_hgcd_matrix_adjust, is roughly 1/(alpha -
+ * 1). */
+#ifdef TUNE_GCD_P
+#define P_TABLE_SIZE 10000
+mp_size_t p_table[P_TABLE_SIZE];
+#define CHOOSE_P(n) ( (n) < P_TABLE_SIZE ? p_table[n] : 2*(n)/3)
+#else
+#define CHOOSE_P(n) (2*(n) / 3)
+#endif
+
+struct gcd_ctx
+{
+  mp_ptr gp;
+  mp_size_t gn;
+};
+
+static void
+gcd_hook (void *p, mp_srcptr gp, mp_size_t gn,
+	  mp_srcptr qp, mp_size_t qn, int d)
+{
+  struct gcd_ctx *ctx = (struct gcd_ctx *) p;
+  MPN_COPY (ctx->gp, gp, gn);
+  ctx->gn = gn;
+}
+
+mp_size_t
+mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n)
+{
+  mp_size_t talloc;
+  mp_size_t scratch;
+  mp_size_t matrix_scratch;
+
+  struct gcd_ctx ctx;
+  mp_ptr tp;
+  TMP_DECL;
+
+  ASSERT (usize >= n);
+  ASSERT (n > 0);
+  ASSERT (vp[n-1] > 0);
+
+  /* FIXME: Check for small sizes first, before setting up temporary
+     storage etc. */
+  talloc = MPN_GCD_SUBDIV_STEP_ITCH(n);
+
+  /* For initial division */
+  scratch = usize - n + 1;
+  if (scratch > talloc)
+    talloc = scratch;
+
+#if TUNE_GCD_P
+  if (CHOOSE_P (n) > 0)
+#else
+  if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD))
+#endif
+    {
+      mp_size_t hgcd_scratch;
+      mp_size_t update_scratch;
+      mp_size_t p = CHOOSE_P (n);
+      mp_size_t scratch;
+#if TUNE_GCD_P
+      /* Worst case, since we don't guarantee that n - CHOOSE_P(n)
+	 is increasing */
+      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n);
+      hgcd_scratch = mpn_hgcd_itch (n);
+      update_scratch = 2*(n - 1);
+#else
+      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+      hgcd_scratch = mpn_hgcd_itch (n - p);
+      update_scratch = p + n - 1;
+#endif
+      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
+      if (scratch > talloc)
+	talloc = scratch;
+    }
+
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS(talloc);
+
+  if (usize > n)
+    {
+      mpn_tdiv_qr (tp, up, 0, up, usize, vp, n);
+
+      if (mpn_zero_p (up, n))
+	{
+	  MPN_COPY (gp, vp, n);
+	  ctx.gn = n;
+	  goto done;
+	}
+    }
+
+  ctx.gp = gp;
+
+#if TUNE_GCD_P
+  while (CHOOSE_P (n) > 0)
+#else
+  while (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD))
+#endif
+    {
+      struct hgcd_matrix M;
+      mp_size_t p = CHOOSE_P (n);
+      mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+      mp_size_t nn;
+      mpn_hgcd_matrix_init (&M, n - p, tp);
+      nn = mpn_hgcd (up + p, vp + p, n - p, &M, tp + matrix_scratch);
+      if (nn > 0)
+	{
+	  ASSERT (M.n <= (n - p - 1)/2);
+	  ASSERT (M.n + p <= (p + n - 1) / 2);
+	  /* Temporary storage 2 (p + M->n) <= p + n - 1. */
+	  n = mpn_hgcd_matrix_adjust (&M, p + nn, up, vp, p, tp + matrix_scratch);
+	}
+      else
+	{
+	  /* Temporary storage n */
+	  n = mpn_gcd_subdiv_step (up, vp, n, 0, gcd_hook, &ctx, tp);
+	  if (n == 0)
+	    goto done;
+	}
+    }
+
+  while (n > 2)
+    {
+      struct hgcd_matrix1 M;
+      mp_limb_t uh, ul, vh, vl;
+      mp_limb_t mask;
+
+      mask = up[n-1] | vp[n-1];
+      ASSERT (mask > 0);
+
+      if (mask & GMP_NUMB_HIGHBIT)
+	{
+	  uh = up[n-1]; ul = up[n-2];
+	  vh = vp[n-1]; vl = vp[n-2];
+	}
+      else
+	{
+	  int shift;
+
+	  count_leading_zeros (shift, mask);
+	  uh = MPN_EXTRACT_NUMB (shift, up[n-1], up[n-2]);
+	  ul = MPN_EXTRACT_NUMB (shift, up[n-2], up[n-3]);
+	  vh = MPN_EXTRACT_NUMB (shift, vp[n-1], vp[n-2]);
+	  vl = MPN_EXTRACT_NUMB (shift, vp[n-2], vp[n-3]);
+	}
+
+      /* Try an mpn_hgcd2 step */
+      if (mpn_hgcd2 (uh, ul, vh, vl, &M))
+	{
+	  n = mpn_matrix22_mul1_inverse_vector (&M, tp, up, vp, n);
+	  MP_PTR_SWAP (up, tp);
+	}
+      else
+	{
+	  /* mpn_hgcd2 has failed. Then either one of a or b is very
+	     small, or the difference is very small. Perform one
+	     subtraction followed by one division. */
+
+	  /* Temporary storage n */
+	  n = mpn_gcd_subdiv_step (up, vp, n, 0, &gcd_hook, &ctx, tp);
+	  if (n == 0)
+	    goto done;
+	}
+    }
+
+  ASSERT(up[n-1] | vp[n-1]);
+
+  /* Due to the calling convention for mpn_gcd, at most one can be even. */
+  if ((up[0] & 1) == 0)
+    MP_PTR_SWAP (up, vp);
+  ASSERT ((up[0] & 1) != 0);
+
+  {
+    mp_limb_t u0, u1, v0, v1;
+    mp_double_limb_t g;
+
+    u0 = up[0];
+    v0 = vp[0];
+
+    if (n == 1)
+      {
+	int cnt;
+	count_trailing_zeros (cnt, v0);
+	*gp = mpn_gcd_11 (u0, v0 >> cnt);
+	ctx.gn = 1;
+	goto done;
+      }
+
+    v1 = vp[1];
+    if (UNLIKELY (v0 == 0))
+      {
+	v0 = v1;
+	v1 = 0;
+	/* FIXME: We could invoke a mpn_gcd_21 here, just like mpn_gcd_22 could
+	   when this situation occurs internally.  */
+      }
+    if ((v0 & 1) == 0)
+      {
+	int cnt;
+	count_trailing_zeros (cnt, v0);
+	v0 = ((v1 << (GMP_NUMB_BITS - cnt)) & GMP_NUMB_MASK) | (v0 >> cnt);
+	v1 >>= cnt;
+      }
+
+    u1 = up[1];
+    g = mpn_gcd_22 (u1, u0, v1, v0);
+    gp[0] = g.d0;
+    gp[1] = g.d1;
+    ctx.gn = 1 + (g.d1 > 0);
+  }
+done:
+  TMP_FREE;
+  return ctx.gn;
+}
diff --git a/third_party/gmp/mpn/generic/gcd_1.c b/third_party/gmp/mpn/generic/gcd_1.c
new file mode 100644
index 0000000..22b1422
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcd_1.c
@@ -0,0 +1,103 @@
+/* mpn_gcd_1 -- mpn and limb greatest common divisor.
+
+Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Does not work for U == 0 or V == 0.  It would be tough to make it work for
+   V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t.
+
+   The threshold for doing u%v when size==1 will vary by CPU according to
+   the speed of a division and the code generated for the main loop.  Any
+   tuning for this is left to a CPU specific implementation.  */
+
+mp_limb_t
+mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb)
+{
+  mp_limb_t      ulimb;
+  unsigned long  zero_bits, u_low_zero_bits;
+  int c;
+
+  ASSERT (size >= 1);
+  ASSERT (vlimb != 0);
+  ASSERT_MPN_NONZERO_P (up, size);
+
+  ulimb = up[0];
+
+  /* Need vlimb odd for modexact, want it odd to get common zeros. */
+  count_trailing_zeros (zero_bits, vlimb);
+  vlimb >>= zero_bits;
+
+  if (size > 1)
+    {
+      /* Must get common zeros before the mod reduction.  If ulimb==0 then
+	 vlimb already gives the common zeros.  */
+      if (ulimb != 0)
+	{
+	  count_trailing_zeros (u_low_zero_bits, ulimb);
+	  zero_bits = MIN (zero_bits, u_low_zero_bits);
+	}
+
+      ulimb = MPN_MOD_OR_MODEXACT_1_ODD (up, size, vlimb);
+      if (ulimb == 0)
+	goto done;
+
+      count_trailing_zeros (c, ulimb);
+      ulimb >>= c;
+    }
+  else
+    {
+      /* size==1, so up[0]!=0 */
+      count_trailing_zeros (u_low_zero_bits, ulimb);
+      ulimb >>= u_low_zero_bits;
+      zero_bits = MIN (zero_bits, u_low_zero_bits);
+
+      /* make u bigger */
+      if (vlimb > ulimb)
+	MP_LIMB_T_SWAP (ulimb, vlimb);
+
+      /* if u is much bigger than v, reduce using a division rather than
+	 chipping away at it bit-by-bit */
+      if ((ulimb >> 16) > vlimb)
+	{
+	  ulimb %= vlimb;
+	  if (ulimb == 0)
+	    goto done;
+
+	  count_trailing_zeros (c, ulimb);
+	  ulimb >>= c;
+	}
+    }
+
+  vlimb = mpn_gcd_11 (ulimb, vlimb);
+
+ done:
+  return vlimb << zero_bits;
+}
diff --git a/third_party/gmp/mpn/generic/gcd_11.c b/third_party/gmp/mpn/generic/gcd_11.c
new file mode 100644
index 0000000..214e45c
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcd_11.c
@@ -0,0 +1,74 @@
+/* mpn_gcd_11 -- limb greatest common divisor.
+
+Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_gcd_11 (mp_limb_t u, mp_limb_t v)
+{
+  ASSERT (u & v & 1);
+
+  /* In this loop, we represent the odd numbers u and v
+     without the redundant least significant one bit. This reduction
+     in size by one bit ensures that the high bit of t, below, is set
+     if and only if v > u. */
+
+  u >>= 1;
+  v >>= 1;
+
+  while (u != v)
+    {
+      mp_limb_t t;
+      mp_limb_t vgtu;
+      int c;
+
+      t = u - v;
+      vgtu = LIMB_HIGHBIT_TO_MASK (t);
+
+      /* v <-- min (u, v) */
+      v += (vgtu & t);
+
+      /* u <-- |u - v| */
+      u = (t ^ vgtu) - vgtu;
+
+      count_trailing_zeros (c, t);
+      /* We have c <= GMP_LIMB_BITS - 2 here, so that
+
+	   u >>= (c + 1);
+
+	 would be safe. But unlike the addition c + 1, a separate
+	 shift by 1 is independent of c, and can be executed in
+	 parallel with count_trailing_zeros. */
+      u = (u >> 1) >> c;
+    }
+  return (u << 1) + 1;
+}
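+
+/* For illustration only (not part of GMP): the same reduction without the
+   shifted representation, i.e. a plain binary gcd for odd operands,
+   sketched in standalone C99:
+
+     #include <stdint.h>
+
+     static uint64_t gcd_odd (uint64_t u, uint64_t v)   // u, v odd
+     {
+       while (u != v)
+         {
+           if (u < v) { uint64_t t = u; u = v; v = t; }   // keep u >= v
+           u -= v;                            // even and nonzero
+           do u >>= 1; while ((u & 1) == 0);  // strip factors of 2
+         }
+       return u;
+     }
+*/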
diff --git a/third_party/gmp/mpn/generic/gcd_22.c b/third_party/gmp/mpn/generic/gcd_22.c
new file mode 100644
index 0000000..d97f096
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcd_22.c
@@ -0,0 +1,131 @@
+/* mpn_gcd_22 -- double limb greatest common divisor.
+
+Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#if GMP_NAIL_BITS > 0
+#error Nails not supported.
+#endif
+
+mp_double_limb_t
+mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
+{
+  mp_double_limb_t g;
+  ASSERT (u0 & v0 & 1);
+
+  /* Implicit least significant bit */
+  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
+  u1 >>= 1;
+
+  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
+  v1 >>= 1;
+
+  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
+    {
+      mp_limb_t vgtu, t1, t0;
+      sub_ddmmss (t1, t0, u1, u0, v1, v0);
+      vgtu = LIMB_HIGHBIT_TO_MASK(t1);
+
+      if (UNLIKELY (t0 == 0))
+	{
+	  if (t1 == 0)
+	    {
+	      g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
+	      g.d0 = (u0 << 1) | 1;
+	      return g;
+	    }
+	  int c;
+	  count_trailing_zeros (c, t1);
+
+	  /* v1 = min (u1, v1) */
+	  v1 += (vgtu & t1);
+	  /* u0 = |u1 - v1| */
+	  u0 = (t1 ^ vgtu) - vgtu;
+	  ASSERT (c < GMP_LIMB_BITS - 1);
+	  u0 >>= c + 1;
+	  u1 = 0;
+	}
+      else
+	{
+	  int c;
+	  count_trailing_zeros (c, t0);
+	  c++;
+	  /* V <-- min (U, V).
+
+	     Assembly version should use cmov. Another alternative,
+	     avoiding carry propagation, would be
+
+	     v0 += vgtu & t0; v1 += vgtu & (u1 - v1);
+	  */
+	  add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0);
+	  /* U  <--  |U - V|
+	     No carry handling needed in this conditional negation,
+	     since t0 != 0. */
+	  u0 = (t0 ^ vgtu) - vgtu;
+	  u1 = t1 ^ vgtu;
+	  if (UNLIKELY (c == GMP_LIMB_BITS))
+	    {
+	      u0 = u1;
+	      u1 = 0;
+	    }
+	  else
+	    {
+	      u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
+	      u1 >>= c;
+	    }
+	}
+    }
+  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
+    { /* At most two iterations */
+      mp_limb_t vgtu, t0;
+      int c;
+      sub_ddmmss (vgtu, t0, 0, u0, 0, v0);
+      if (UNLIKELY (t0 == 0))
+	{
+	  g.d1 = u0 >> (GMP_LIMB_BITS - 1);
+	  g.d0 = (u0 << 1) | 1;
+	  return g;
+	}
+
+      /* v <-- min (u, v) */
+      v0 += (vgtu & t0);
+
+      /* u <-- |u - v| */
+      u0 = (t0 ^ vgtu) - vgtu;
+
+      count_trailing_zeros (c, t0);
+      u0 = (u0 >> 1) >> c;
+    }
+
+  g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
+  g.d1 = 0;
+  return g;
+}
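+
+/* A minimal illustrative sketch, not part of the GMP sources: the loop above
+   in plain C, using unsigned __int128 (a GCC/Clang extension) in place of the
+   (u1,u0)/(v1,v0) limb pairs, so the sub_ddmmss / LIMB_HIGHBIT_TO_MASK idiom
+   is easy to follow.  GCD_22_DEMO and the function name are hypothetical.  */
+#ifdef GCD_22_DEMO
+static unsigned __int128
+gcd_22_demo (unsigned __int128 u, unsigned __int128 v)
+{
+  /* Both inputs must be odd.  */
+  while (u != v)
+    {
+      unsigned __int128 t = u - v;
+      unsigned __int128 vgtu = - (t >> 127);	/* all ones iff v > u */
+
+      v += vgtu & t;		/* v <-- min (u, v) */
+      u = (t ^ vgtu) - vgtu;	/* u <-- |u - v|, even and nonzero */
+
+      /* Strip the factors of two one bit at a time; the code above does
+	 this with count_trailing_zeros, special-casing t0 == 0.  */
+      do
+	u >>= 1;
+      while ((u & 1) == 0);
+    }
+  return u;
+}
+#endif /* GCD_22_DEMO */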
diff --git a/third_party/gmp/mpn/generic/gcd_subdiv_step.c b/third_party/gmp/mpn/generic/gcd_subdiv_step.c
new file mode 100644
index 0000000..9c3b88d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcd_subdiv_step.c
@@ -0,0 +1,204 @@
+/* gcd_subdiv_step.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003-2005, 2008, 2010, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdlib.h>		/* for NULL */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or
+   b is small, or the difference is small. Perform one subtraction
+   followed by one division. The normal case is to compute the reduced
+   a and b, and return the new size.
+
+   If s == 0 (used for gcd and gcdext), returns zero if the gcd is
+   found.
+
+   If s > 0, don't reduce to size <= s, and return zero if no
+   reduction is possible (if either a, b or |a-b| is of size <= s). */
+
+/* The hook function is called as
+
+     hook(ctx, gp, gn, qp, qn, d)
+
+   in the following cases:
+
+   + If A = B at the start, G is the gcd, Q is NULL, d = -1.
+
+   + If one input is zero at the start, G is the gcd, Q is NULL,
+     d = 0 if A = G and d = 1 if B = G.
+
+   Otherwise, if d = 0 we have just subtracted a multiple of A from B,
+   and if d = 1 we have subtracted a multiple of B from A.
+
+   + If A = B after subtraction, G is the gcd, Q is NULL.
+
+   + If we get a zero remainder after division, G is the gcd, Q is the
+     quotient.
+
+   + Otherwise, G is NULL, Q is the quotient (often 1).
+
+ */
+
+mp_size_t
+mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s,
+		     gcd_subdiv_step_hook *hook, void *ctx,
+		     mp_ptr tp)
+{
+  static const mp_limb_t one = CNST_LIMB(1);
+  mp_size_t an, bn, qn;
+
+  int swapped;
+
+  ASSERT (n > 0);
+  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
+
+  an = bn = n;
+  MPN_NORMALIZE (ap, an);
+  MPN_NORMALIZE (bp, bn);
+
+  swapped = 0;
+
+  /* Arrange so that a < b, subtract b -= a, and maintain
+     normalization. */
+  if (an == bn)
+    {
+      int c;
+      MPN_CMP (c, ap, bp, an);
+      if (UNLIKELY (c == 0))
+	{
+	  /* For gcdext, return the smaller of the two cofactors, so
+	     pass d = -1. */
+	  if (s == 0)
+	    hook (ctx, ap, an, NULL, 0, -1);
+	  return 0;
+	}
+      else if (c > 0)
+	{
+	  MP_PTR_SWAP (ap, bp);
+	  swapped ^= 1;
+	}
+    }
+  else
+    {
+      if (an > bn)
+	{
+	  MPN_PTR_SWAP (ap, an, bp, bn);
+	  swapped ^= 1;
+	}
+    }
+  if (an <= s)
+    {
+      if (s == 0)
+	hook (ctx, bp, bn, NULL, 0, swapped ^ 1);
+      return 0;
+    }
+
+  ASSERT_NOCARRY (mpn_sub (bp, bp, bn, ap, an));
+  MPN_NORMALIZE (bp, bn);
+  ASSERT (bn > 0);
+
+  if (bn <= s)
+    {
+      /* Undo subtraction. */
+      mp_limb_t cy = mpn_add (bp, ap, an, bp, bn);
+      if (cy > 0)
+	bp[an] = cy;
+      return 0;
+    }
+
+  /* Arrange so that a < b */
+  if (an == bn)
+    {
+      int c;
+      MPN_CMP (c, ap, bp, an);
+      if (UNLIKELY (c == 0))
+	{
+	  if (s > 0)
+	    /* Just record subtraction and return */
+	    hook (ctx, NULL, 0, &one, 1, swapped);
+	  else
+	    /* Found gcd. */
+	    hook (ctx, bp, bn, NULL, 0, swapped);
+	  return 0;
+	}
+
+      hook (ctx, NULL, 0, &one, 1, swapped);
+
+      if (c > 0)
+	{
+	  MP_PTR_SWAP (ap, bp);
+	  swapped ^= 1;
+	}
+    }
+  else
+    {
+      hook (ctx, NULL, 0, &one, 1, swapped);
+
+      if (an > bn)
+	{
+	  MPN_PTR_SWAP (ap, an, bp, bn);
+	  swapped ^= 1;
+	}
+    }
+
+  mpn_tdiv_qr (tp, bp, 0, bp, bn, ap, an);
+  qn = bn - an + 1;
+  bn = an;
+  MPN_NORMALIZE (bp, bn);
+
+  if (UNLIKELY (bn <= s))
+    {
+      if (s == 0)
+	{
+	  hook (ctx, ap, an, tp, qn, swapped);
+	  return 0;
+	}
+
+      /* Quotient is one too large, so decrement it and add back A. */
+      if (bn > 0)
+	{
+	  mp_limb_t cy = mpn_add (bp, ap, an, bp, bn);
+	  if (cy)
+	    bp[an++] = cy;
+	}
+      else
+	MPN_COPY (bp, ap, an);
+
+      MPN_DECR_U (tp, qn, 1);
+    }
+
+  hook (ctx, NULL, 0, tp, qn, swapped);
+  return an;
+}
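+
+/* A minimal illustrative sketch, not part of the GMP sources: a hook obeying
+   the callback contract documented above, which only records the gcd when one
+   is reported (gp != NULL) and ignores quotient-only invocations.  The guard
+   macro and the demo names are hypothetical.  */
+#ifdef GCD_SUBDIV_STEP_DEMO
+struct demo_gcd_ctx
+{
+  mp_ptr gp;		/* destination buffer for the gcd limbs */
+  mp_size_t gn;		/* gcd size, 0 until one is found */
+};
+
+static void
+demo_gcd_hook (void *p, mp_srcptr gp, mp_size_t gn,
+	       mp_srcptr qp, mp_size_t qn, int d)
+{
+  struct demo_gcd_ctx *ctx = (struct demo_gcd_ctx *) p;
+
+  (void) qp; (void) qn; (void) d;
+
+  if (gp != NULL)	/* one of the "G is the gcd" cases above */
+    {
+      MPN_COPY (ctx->gp, gp, gn);
+      ctx->gn = gn;
+    }
+  /* gp == NULL means a quotient was produced; mpn_gcdext's hook uses
+     these calls to update its cofactors.  */
+}
+#endif /* GCD_SUBDIV_STEP_DEMO */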
diff --git a/third_party/gmp/mpn/generic/gcdext.c b/third_party/gmp/mpn/generic/gcdext.c
new file mode 100644
index 0000000..5501480
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcdext.c
@@ -0,0 +1,557 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and
+   the size is returned (if inputs are non-normalized, result may be
+   non-normalized too). Temporary space needed is M->n + n.
+ */
+static size_t
+hgcd_mul_matrix_vector (struct hgcd_matrix *M,
+			mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp)
+{
+  mp_limb_t ah, bh;
+
+  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
+
+     t  = u00 * a
+     r  = u10 * b
+     r += t;
+
+     t  = u11 * b
+     b  = u01 * a
+     b += t;
+  */
+
+  if (M->n >= n)
+    {
+      mpn_mul (tp, M->p[0][0], M->n, ap, n);
+      mpn_mul (rp, M->p[1][0], M->n, bp, n);
+    }
+  else
+    {
+      mpn_mul (tp, ap, n, M->p[0][0], M->n);
+      mpn_mul (rp, bp, n, M->p[1][0], M->n);
+    }
+
+  ah = mpn_add_n (rp, rp, tp, n + M->n);
+
+  if (M->n >= n)
+    {
+      mpn_mul (tp, M->p[1][1], M->n, bp, n);
+      mpn_mul (bp, M->p[0][1], M->n, ap, n);
+    }
+  else
+    {
+      mpn_mul (tp, bp, n, M->p[1][1], M->n);
+      mpn_mul (bp, ap, n, M->p[0][1], M->n);
+    }
+  bh = mpn_add_n (bp, bp, tp, n + M->n);
+
+  n += M->n;
+  if ( (ah | bh) > 0)
+    {
+      rp[n] = ah;
+      bp[n] = bh;
+      n++;
+    }
+  else
+    {
+      /* Normalize */
+      while ( (rp[n-1] | bp[n-1]) == 0)
+	n--;
+    }
+
+  return n;
+}
+
+#define COMPUTE_V_ITCH(n) (2*(n))
+
+/* Computes |v| = |(g - u a)| / b, where u may be positive or
+   negative, and v is of the opposite sign. max(a, b) is of size n, u and
+   v at most size n, and v must have space for n+1 limbs. */
+static mp_size_t
+compute_v (mp_ptr vp,
+	   mp_srcptr ap, mp_srcptr bp, mp_size_t n,
+	   mp_srcptr gp, mp_size_t gn,
+	   mp_srcptr up, mp_size_t usize,
+	   mp_ptr tp)
+{
+  mp_size_t size;
+  mp_size_t an;
+  mp_size_t bn;
+  mp_size_t vn;
+
+  ASSERT (n > 0);
+  ASSERT (gn > 0);
+  ASSERT (usize != 0);
+
+  size = ABS (usize);
+  ASSERT (size <= n);
+  ASSERT (up[size-1] > 0);
+
+  an = n;
+  MPN_NORMALIZE (ap, an);
+  ASSERT (gn <= an);
+
+  if (an >= size)
+    mpn_mul (tp, ap, an, up, size);
+  else
+    mpn_mul (tp, up, size, ap, an);
+
+  size += an;
+
+  if (usize > 0)
+    {
+      /* |v| = -v = (u a - g) / b */
+
+      ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn));
+      MPN_NORMALIZE (tp, size);
+      if (size == 0)
+	return 0;
+    }
+  else
+    { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a,
+	 (g + |u| a) always fits in (|usize| + an) limbs. */
+
+      ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn));
+      size -= (tp[size - 1] == 0);
+    }
+
+  /* Now divide t / b. There must be no remainder */
+  bn = n;
+  MPN_NORMALIZE (bp, bn);
+  ASSERT (size >= bn);
+
+  vn = size + 1 - bn;
+  ASSERT (vn <= n + 1);
+
+  mpn_divexact (vp, tp, size, bp, bn);
+  vn -= (vp[vn-1] == 0);
+
+  return vn;
+}
+
+/* Temporary storage:
+
+   Initial division: Quotient of at most an - n + 1 <= an limbs.
+
+   Storage for u0 and u1: 2(n+1).
+
+   Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4)
+
+   Storage for hgcd, input (n + 1)/2: 9 n/4 plus some.
+
+   When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors.
+
+   When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less.
+
+   For the lehmer call after the loop, let T denote
+   GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for
+   u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T
+   for u, T+1 for v and 2T scratch space. In all, 7T + 3 is
+   sufficient for both operations.
+
+*/
+
+/* Optimal choice of p seems difficult. In each iteration the division
+ * of work between hgcd and the updates of u0 and u1 depends on the
+ * current size of the u. It may be desirable to use a different
+ * choice of p in each iteration. Also the input size seems to matter;
+ * choosing p = n / 3 in the first iteration seems to improve
+ * performance slightly for input size just above the threshold, but
+ * degrade performance for larger inputs. */
+#define CHOOSE_P_1(n) ((n) / 2)
+#define CHOOSE_P_2(n) ((n) / 3)
+
+mp_size_t
+mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep,
+	    mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n)
+{
+  mp_size_t talloc;
+  mp_size_t scratch;
+  mp_size_t matrix_scratch;
+  mp_size_t ualloc = n + 1;
+
+  struct gcdext_ctx ctx;
+  mp_size_t un;
+  mp_ptr u0;
+  mp_ptr u1;
+
+  mp_ptr tp;
+
+  TMP_DECL;
+
+  ASSERT (an >= n);
+  ASSERT (n > 0);
+  ASSERT (bp[n-1] > 0);
+
+  TMP_MARK;
+
+  /* FIXME: Check for small sizes first, before setting up temporary
+     storage etc. */
+  talloc = MPN_GCDEXT_LEHMER_N_ITCH(n);
+
+  /* For initial division */
+  scratch = an - n + 1;
+  if (scratch > talloc)
+    talloc = scratch;
+
+  if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
+    {
+      /* For hgcd loop. */
+      mp_size_t hgcd_scratch;
+      mp_size_t update_scratch;
+      mp_size_t p1 = CHOOSE_P_1 (n);
+      mp_size_t p2 = CHOOSE_P_2 (n);
+      mp_size_t min_p = MIN(p1, p2);
+      mp_size_t max_p = MAX(p1, p2);
+      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p);
+      hgcd_scratch = mpn_hgcd_itch (n - min_p);
+      update_scratch = max_p + n - 1;
+
+      scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
+      if (scratch > talloc)
+	talloc = scratch;
+
+      /* Final mpn_gcdext_lehmer_n call. Need space for u and for
+	 copies of a and b. */
+      scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD)
+	+ 3*GCDEXT_DC_THRESHOLD;
+
+      if (scratch > talloc)
+	talloc = scratch;
+
+      /* Cofactors u0 and u1 */
+      talloc += 2*(n+1);
+    }
+
+  tp = TMP_ALLOC_LIMBS(talloc);
+
+  if (an > n)
+    {
+      mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n);
+
+      if (mpn_zero_p (ap, n))
+	{
+	  MPN_COPY (gp, bp, n);
+	  *usizep = 0;
+	  TMP_FREE;
+	  return n;
+	}
+    }
+
+  if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
+    {
+      mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp);
+
+      TMP_FREE;
+      return gn;
+    }
+
+  MPN_ZERO (tp, 2*ualloc);
+  u0 = tp; tp += ualloc;
+  u1 = tp; tp += ualloc;
+
+  ctx.gp = gp;
+  ctx.up = up;
+  ctx.usize = usizep;
+
+  {
+    /* For the first hgcd call, there are no u updates, and it makes
+       some sense to use a different choice for p. */
+
+    /* FIXME: We could trim use of temporary storage, since u0 and u1
+       are not used yet. For the hgcd call, we could swap in the u0
+       and u1 pointers for the relevant matrix elements. */
+
+    struct hgcd_matrix M;
+    mp_size_t p = CHOOSE_P_1 (n);
+    mp_size_t nn;
+
+    mpn_hgcd_matrix_init (&M, n - p, tp);
+    nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
+    if (nn > 0)
+      {
+	ASSERT (M.n <= (n - p - 1)/2);
+	ASSERT (M.n + p <= (p + n - 1) / 2);
+
+	/* Temporary storage 2 (p + M->n) <= p + n - 1 */
+	n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
+
+	MPN_COPY (u0, M.p[1][0], M.n);
+	MPN_COPY (u1, M.p[1][1], M.n);
+	un = M.n;
+	while ( (u0[un-1] | u1[un-1] ) == 0)
+	  un--;
+      }
+    else
+      {
+	/* mpn_hgcd has failed. Then either one of a or b is very
+	   small, or the difference is very small. Perform one
+	   subtraction followed by one division. */
+	u1[0] = 1;
+
+	ctx.u0 = u0;
+	ctx.u1 = u1;
+	ctx.tp = tp + n; /* ualloc */
+	ctx.un = 1;
+
+	/* Temporary storage n */
+	n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
+	if (n == 0)
+	  {
+	    TMP_FREE;
+	    return ctx.gn;
+	  }
+
+	un = ctx.un;
+	ASSERT (un < ualloc);
+      }
+  }
+
+  while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD))
+    {
+      struct hgcd_matrix M;
+      mp_size_t p = CHOOSE_P_2 (n);
+      mp_size_t nn;
+
+      mpn_hgcd_matrix_init (&M, n - p, tp);
+      nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch);
+      if (nn > 0)
+	{
+	  mp_ptr t0;
+
+	  t0 = tp + matrix_scratch;
+	  ASSERT (M.n <= (n - p - 1)/2);
+	  ASSERT (M.n + p <= (p + n - 1) / 2);
+
+	  /* Temporary storage 2 (p + M->n) <= p + n - 1 */
+	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0);
+
+	  /* By the same analysis as for mpn_hgcd_matrix_mul */
+	  ASSERT (M.n + un <= ualloc);
+
+	  /* FIXME: This copying could be avoided by some swapping of
+	   * pointers. May need more temporary storage, though. */
+	  MPN_COPY (t0, u0, un);
+
+	  /* Temporary storage ualloc */
+	  un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un);
+
+	  ASSERT (un < ualloc);
+	  ASSERT ( (u0[un-1] | u1[un-1]) > 0);
+	}
+      else
+	{
+	  /* mpn_hgcd has failed. Then either one of a or b is very
+	     small, or the difference is very small. Perform one
+	     subtraction followed by one division. */
+	  ctx.u0 = u0;
+	  ctx.u1 = u1;
+	  ctx.tp = tp + n; /* ualloc */
+	  ctx.un = un;
+
+	  /* Temporary storage n */
+	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
+	  if (n == 0)
+	    {
+	      TMP_FREE;
+	      return ctx.gn;
+	    }
+
+	  un = ctx.un;
+	  ASSERT (un < ualloc);
+	}
+    }
+  /* We have A = ... a + ... b
+	     B =  u0 a +  u1 b
+
+	     a = u1  A + ... B
+	     b = -u0 A + ... B
+
+     with bounds
+
+       |u0|, |u1| <= B / min(a, b)
+
+     We always have u1 > 0, and u0 == 0 is possible only if u1 == 1,
+     in which case the only reduction done so far is a = A - k B for
+     some k.
+
+     Compute g = u a + v b = (u u1 - v u0) A + (...) B
+     Here, u, v are bounded by
+
+       |u| <= b,
+       |v| <= a
+  */
+
+  ASSERT ( (ap[n-1] | bp[n-1]) > 0);
+
+  if (UNLIKELY (mpn_cmp (ap, bp, n) == 0))
+    {
+      /* Must return the smallest cofactor, +u1 or -u0 */
+      int c;
+
+      MPN_COPY (gp, ap, n);
+
+      MPN_CMP (c, u0, u1, un);
+      /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in
+	 this case we choose the cofactor + 1, corresponding to G = A
+	 - k B, rather than -1, corresponding to G = - A + (k+1) B. */
+      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
+      if (c < 0)
+	{
+	  MPN_NORMALIZE (u0, un);
+	  MPN_COPY (up, u0, un);
+	  *usizep = -un;
+	}
+      else
+	{
+	  MPN_NORMALIZE_NOT_ZERO (u1, un);
+	  MPN_COPY (up, u1, un);
+	  *usizep = un;
+	}
+
+      TMP_FREE;
+      return n;
+    }
+  else if (UNLIKELY (u0[0] == 0) && un == 1)
+    {
+      mp_size_t gn;
+      ASSERT (u1[0] == 1);
+
+      /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */
+      gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp);
+
+      TMP_FREE;
+      return gn;
+    }
+  else
+    {
+      mp_size_t u0n;
+      mp_size_t u1n;
+      mp_size_t lehmer_un;
+      mp_size_t lehmer_vn;
+      mp_size_t gn;
+
+      mp_ptr lehmer_up;
+      mp_ptr lehmer_vp;
+      int negate;
+
+      lehmer_up = tp; tp += n;
+
+      /* Call mpn_gcdext_lehmer_n with copies of a and b. */
+      MPN_COPY (tp, ap, n);
+      MPN_COPY (tp + n, bp, n);
+      gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n);
+
+      u0n = un;
+      MPN_NORMALIZE (u0, u0n);
+      ASSERT (u0n > 0);
+
+      if (lehmer_un == 0)
+	{
+	  /* u == 0  ==>  v = g / b == 1  ==> g = - u0 A + (...) B */
+	  MPN_COPY (up, u0, u0n);
+	  *usizep = -u0n;
+
+	  TMP_FREE;
+	  return gn;
+	}
+
+      lehmer_vp = tp;
+      /* Compute v = (g - u a) / b */
+      lehmer_vn = compute_v (lehmer_vp,
+			     ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1);
+
+      if (lehmer_un > 0)
+	negate = 0;
+      else
+	{
+	  lehmer_un = -lehmer_un;
+	  negate = 1;
+	}
+
+      u1n = un;
+      MPN_NORMALIZE (u1, u1n);
+      ASSERT (u1n > 0);
+
+      ASSERT (lehmer_un + u1n <= ualloc);
+      ASSERT (lehmer_vn + u0n <= ualloc);
+
+      /* We may still have v == 0 */
+
+      /* Compute u u1 */
+      if (lehmer_un <= u1n)
+	/* Should be the common case */
+	mpn_mul (up, u1, u1n, lehmer_up, lehmer_un);
+      else
+	mpn_mul (up, lehmer_up, lehmer_un, u1, u1n);
+
+      un = u1n + lehmer_un;
+      un -= (up[un - 1] == 0);
+
+      if (lehmer_vn > 0)
+	{
+	  mp_limb_t cy;
+
+	  /* Overwrites old u1 value */
+	  if (lehmer_vn <= u0n)
+	    /* Should be the common case */
+	    mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn);
+	  else
+	    mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n);
+
+	  u1n = u0n + lehmer_vn;
+	  u1n -= (u1[u1n - 1] == 0);
+
+	  if (u1n <= un)
+	    {
+	      cy = mpn_add (up, up, un, u1, u1n);
+	    }
+	  else
+	    {
+	      cy = mpn_add (up, u1, u1n, up, un);
+	      un = u1n;
+	    }
+	  up[un] = cy;
+	  un += (cy != 0);
+
+	  ASSERT (un < ualloc);
+	}
+      *usizep = negate ? -un : un;
+
+      TMP_FREE;
+      return gn;
+    }
+}
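+
+/* A minimal illustrative sketch, not part of the GMP sources: mpn_gcdext is
+   internal, so the supported way to exercise it is the documented mpz_gcdext
+   wrapper.  This standalone check of the Bezout identity g = s*a + t*b
+   assumes a hosted build; the guard macro GCDEXT_DEMO_MAIN is hypothetical.  */
+#ifdef GCDEXT_DEMO_MAIN
+#include <stdio.h>
+#include <assert.h>
+
+int
+main (void)
+{
+  mpz_t a, b, g, s, t, chk;
+
+  mpz_inits (a, b, g, s, t, chk, NULL);
+  mpz_set_str (a, "123456789012345678901234567890", 10);
+  mpz_set_str (b, "987654321098765432109876543210", 10);
+
+  mpz_gcdext (g, s, t, a, b);	/* g = gcd(a,b), with g = s*a + t*b */
+
+  mpz_mul (chk, s, a);
+  mpz_addmul (chk, t, b);	/* chk = s*a + t*b */
+  assert (mpz_cmp (chk, g) == 0);
+
+  gmp_printf ("gcd = %Zd\n", g);
+  mpz_clears (a, b, g, s, t, chk, NULL);
+  return 0;
+}
+#endif /* GCDEXT_DEMO_MAIN */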
diff --git a/third_party/gmp/mpn/generic/gcdext_1.c b/third_party/gmp/mpn/generic/gcdext_1.c
new file mode 100644
index 0000000..b221a92
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcdext_1.c
@@ -0,0 +1,275 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright 1996, 1998, 2000-2005, 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef GCDEXT_1_USE_BINARY
+#define GCDEXT_1_USE_BINARY 0
+#endif
+
+#ifndef GCDEXT_1_BINARY_METHOD
+#define GCDEXT_1_BINARY_METHOD 2
+#endif
+
+#if GCDEXT_1_USE_BINARY
+
+mp_limb_t
+mpn_gcdext_1 (mp_limb_signed_t *sp, mp_limb_signed_t *tp,
+	      mp_limb_t u, mp_limb_t v)
+{
+  /* Maintain
+
+     U = t1 u + t0 v
+     V = s1 u + s0 v
+
+     where U, V are the inputs (without any shared power of two),
+     and the matrix has determinant ± 2^{shift}.
+  */
+  mp_limb_t s0 = 1;
+  mp_limb_t t0 = 0;
+  mp_limb_t s1 = 0;
+  mp_limb_t t1 = 1;
+  mp_limb_t ug;
+  mp_limb_t vg;
+  mp_limb_t ugh;
+  mp_limb_t vgh;
+  unsigned zero_bits;
+  unsigned shift;
+  unsigned i;
+#if GCDEXT_1_BINARY_METHOD == 2
+  mp_limb_t det_sign;
+#endif
+
+  ASSERT (u > 0);
+  ASSERT (v > 0);
+
+  count_trailing_zeros (zero_bits, u | v);
+  u >>= zero_bits;
+  v >>= zero_bits;
+
+  if ((u & 1) == 0)
+    {
+      count_trailing_zeros (shift, u);
+      u >>= shift;
+      t1 <<= shift;
+    }
+  else if ((v & 1) == 0)
+    {
+      count_trailing_zeros (shift, v);
+      v >>= shift;
+      s0 <<= shift;
+    }
+  else
+    shift = 0;
+
+#if GCDEXT_1_BINARY_METHOD == 1
+  while (u != v)
+    {
+      unsigned count;
+      if (u > v)
+	{
+	  u -= v;
+
+	  count_trailing_zeros (count, u);
+	  u >>= count;
+
+	  t0 += t1; t1 <<= count;
+	  s0 += s1; s1 <<= count;
+	}
+      else
+	{
+	  v -= u;
+
+	  count_trailing_zeros (count, v);
+	  v >>= count;
+
+	  t1 += t0; t0 <<= count;
+	  s1 += s0; s0 <<= count;
+	}
+      shift += count;
+    }
+#else
+# if GCDEXT_1_BINARY_METHOD == 2
+  u >>= 1;
+  v >>= 1;
+
+  det_sign = 0;
+
+  while (u != v)
+    {
+      unsigned count;
+      mp_limb_t d =  u - v;
+      mp_limb_t vgtu = LIMB_HIGHBIT_TO_MASK (d);
+      mp_limb_t sx;
+      mp_limb_t tx;
+
+      /* When v <= u (vgtu == 0), the updates are:
+
+	   (u; v)   <-- ( (u - v) >> count; v)    (det = +(1<<count) for corr. M factor)
+	   (t1, t0) <-- (t1 << count, t0 + t1)
+
+	 and when v > u, the updates are
+
+	   (u; v)   <-- ( (v - u) >> count; u)    (det = -(1<<count))
+	   (t1, t0) <-- (t0 << count, t0 + t1)
+
+	 and similarly for s1, s0
+      */
+
+      /* v <-- min (u, v) */
+      v += (vgtu & d);
+
+      /* u <-- |u - v| */
+      u = (d ^ vgtu) - vgtu;
+
+      /* Number of trailing zeros is the same no matter if we look at
+       * d or u, but using d gives more parallelism. */
+      count_trailing_zeros (count, d);
+
+      det_sign ^= vgtu;
+
+      tx = vgtu & (t0 - t1);
+      sx = vgtu & (s0 - s1);
+      t0 += t1;
+      s0 += s1;
+      t1 += tx;
+      s1 += sx;
+
+      count++;
+      u >>= count;
+      t1 <<= count;
+      s1 <<= count;
+      shift += count;
+    }
+  u = (u << 1) + 1;
+# else /* GCDEXT_1_BINARY_METHOD == 2 */
+#  error Unknown GCDEXT_1_BINARY_METHOD
+# endif
+#endif
+
+  /* Now u = v = g = gcd (u,v). Compute U/g and V/g */
+  ug = t0 + t1;
+  vg = s0 + s1;
+
+  ugh = ug/2 + (ug & 1);
+  vgh = vg/2 + (vg & 1);
+
+  /* Now 2^{shift} g = s0 U - t0 V. Get rid of the power of two, using
+     s0 U - t0 V = (s0 + V/g) U - (t0 + U/g) V. */
+  for (i = 0; i < shift; i++)
+    {
+      mp_limb_t mask = - ( (s0 | t0) & 1);
+
+      s0 /= 2;
+      t0 /= 2;
+      s0 += mask & vgh;
+      t0 += mask & ugh;
+    }
+
+  ASSERT_ALWAYS (s0 <= vg);
+  ASSERT_ALWAYS (t0 <= ug);
+
+  if (s0 > vg - s0)
+    {
+      s0 -= vg;
+      t0 -= ug;
+    }
+#if GCDEXT_1_BINARY_METHOD == 2
+  /* Conditional negation. */
+  s0 = (s0 ^ det_sign) - det_sign;
+  t0 = (t0 ^ det_sign) - det_sign;
+#endif
+  *sp = s0;
+  *tp = -t0;
+
+  return u << zero_bits;
+}
+
+#else /* !GCDEXT_1_USE_BINARY */
+
+
+/* FIXME: Takes two single-word limbs. It could be extended to a
+ * function that accepts a bignum for the first input, and only
+ * returns the first co-factor. */
+
+mp_limb_t
+mpn_gcdext_1 (mp_limb_signed_t *up, mp_limb_signed_t *vp,
+	      mp_limb_t a, mp_limb_t b)
+{
+  /* Maintain
+
+     a =  u0 A + v0 B
+     b =  u1 A + v1 B
+
+     where A, B are the original inputs.
+  */
+  mp_limb_signed_t u0 = 1;
+  mp_limb_signed_t v0 = 0;
+  mp_limb_signed_t u1 = 0;
+  mp_limb_signed_t v1 = 1;
+
+  ASSERT (a > 0);
+  ASSERT (b > 0);
+
+  if (a < b)
+    goto divide_by_b;
+
+  for (;;)
+    {
+      mp_limb_t q;
+
+      q = a / b;
+      a -= q * b;
+
+      if (a == 0)
+	{
+	  *up = u1;
+	  *vp = v1;
+	  return b;
+	}
+      u0 -= q * u1;
+      v0 -= q * v1;
+
+    divide_by_b:
+      q = b / a;
+      b -= q * a;
+
+      if (b == 0)
+	{
+	  *up = u0;
+	  *vp = v0;
+	  return a;
+	}
+      u1 -= q * u0;
+      v1 -= q * v0;
+    }
+}
+#endif /* !GCDEXT_1_USE_BINARY */
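+
+/* A minimal illustrative sketch, not part of the GMP sources: the cofactor
+   bookkeeping of the Euclidean branch above on plain 64-bit words, valid for
+   operands small enough that the cofactors fit in int64_t.  GCDEXT_1_DEMO and
+   the function name are hypothetical.  */
+#ifdef GCDEXT_1_DEMO
+#include <stdint.h>
+#include <assert.h>
+
+static uint64_t
+gcdext_1_demo (int64_t *up, int64_t *vp, uint64_t a, uint64_t b)
+{
+  /* Maintain a = u0 A + v0 B and b = u1 A + v1 B throughout.  */
+  int64_t u0 = 1, v0 = 0, u1 = 0, v1 = 1;
+  uint64_t A = a, B = b;
+
+  while (b != 0)
+    {
+      uint64_t q = a / b;
+      uint64_t r = a - q * b;
+      int64_t u2 = u0 - (int64_t) q * u1;
+      int64_t v2 = v0 - (int64_t) q * v1;
+
+      a = b; b = r;
+      u0 = u1; v0 = v1;
+      u1 = u2; v1 = v2;
+    }
+  assert ((int64_t) a == u0 * (int64_t) A + v0 * (int64_t) B);
+  *up = u0;
+  *vp = v0;
+  return a;
+}
+#endif /* GCDEXT_1_DEMO */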
diff --git a/third_party/gmp/mpn/generic/gcdext_lehmer.c b/third_party/gmp/mpn/generic/gcdext_lehmer.c
new file mode 100644
index 0000000..ea4e86d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gcdext_lehmer.c
@@ -0,0 +1,336 @@
+/* mpn_gcdext -- Extended Greatest Common Divisor.
+
+Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Here, d is the index of the cofactor to update. FIXME: Could use qn
+   = 0 for the common case q = 1. */
+void
+mpn_gcdext_hook (void *p, mp_srcptr gp, mp_size_t gn,
+		 mp_srcptr qp, mp_size_t qn, int d)
+{
+  struct gcdext_ctx *ctx = (struct gcdext_ctx *) p;
+  mp_size_t un = ctx->un;
+
+  if (gp)
+    {
+      mp_srcptr up;
+
+      ASSERT (gn > 0);
+      ASSERT (gp[gn-1] > 0);
+
+      MPN_COPY (ctx->gp, gp, gn);
+      ctx->gn = gn;
+
+      if (d < 0)
+	{
+	  int c;
+
+	  /* Must return the smallest cofactor, +u1 or -u0 */
+	  MPN_CMP (c, ctx->u0, ctx->u1, un);
+	  ASSERT (c != 0 || (un == 1 && ctx->u0[0] == 1 && ctx->u1[0] == 1));
+
+	  d = c < 0;
+	}
+
+      up = d ? ctx->u0 : ctx->u1;
+
+      MPN_NORMALIZE (up, un);
+      MPN_COPY (ctx->up, up, un);
+
+      *ctx->usize = d ? -un : un;
+    }
+  else
+    {
+      mp_limb_t cy;
+      mp_ptr u0 = ctx->u0;
+      mp_ptr u1 = ctx->u1;
+
+      ASSERT (d >= 0);
+
+      if (d)
+	MP_PTR_SWAP (u0, u1);
+
+      qn -= (qp[qn-1] == 0);
+
+      /* Update u0 += q  * u1 */
+      if (qn == 1)
+	{
+	  mp_limb_t q = qp[0];
+
+	  if (q == 1)
+	    /* A common case. */
+	    cy = mpn_add_n (u0, u0, u1, un);
+	  else
+	    cy = mpn_addmul_1 (u0, u1, un, q);
+	}
+      else
+	{
+	  mp_size_t u1n;
+	  mp_ptr tp;
+
+	  u1n = un;
+	  MPN_NORMALIZE (u1, u1n);
+
+	  if (u1n == 0)
+	    return;
+
+	  /* Should always have u1n == un here, and u1 >= u0. The
+	     reason is that we alternate adding u0 to u1 and u1 to u0
+	     (corresponding to subtractions a - b and b - a), and we
+	     can get a large quotient only just after a switch, which
+	     means that we'll add (a multiple of) the larger u to the
+	     smaller. */
+
+	  tp = ctx->tp;
+
+	  if (qn > u1n)
+	    mpn_mul (tp, qp, qn, u1, u1n);
+	  else
+	    mpn_mul (tp, u1, u1n, qp, qn);
+
+	  u1n += qn;
+	  u1n -= tp[u1n-1] == 0;
+
+	  if (u1n >= un)
+	    {
+	      cy = mpn_add (u0, tp, u1n, u0, un);
+	      un = u1n;
+	    }
+	  else
+	    /* Note: Unlikely case, maybe never happens? */
+	    cy = mpn_add (u0, u0, un, tp, u1n);
+
+	}
+      u0[un] = cy;
+      ctx->un = un + (cy > 0);
+    }
+}
+
+/* Temporary storage: 3*(n+1) for u. If hgcd2 succeeds, we need n for
+   the matrix-vector multiplication adjusting a, b. If hgcd fails, we
+   need at most n for the quotient and n+1 for the u update (reusing
+   the extra u). In all, 4n + 3. */
+
+mp_size_t
+mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize,
+		     mp_ptr ap, mp_ptr bp, mp_size_t n,
+		     mp_ptr tp)
+{
+  mp_size_t ualloc = n + 1;
+
+  /* Keeps track of the second row of the reduction matrix
+   *
+   *   M = (v0, v1 ; u0, u1)
+   *
+   * which correspond to the first column of the inverse
+   *
+   *   M^{-1} = (u1, -v1; -u0, v0)
+   *
+   * This implies that
+   *
+   *   a =  u1 A (mod B)
+   *   b = -u0 A (mod B)
+   *
+   * where A, B denotes the input values.
+   */
+
+  struct gcdext_ctx ctx;
+  mp_size_t un;
+  mp_ptr u0;
+  mp_ptr u1;
+  mp_ptr u2;
+
+  MPN_ZERO (tp, 3*ualloc);
+  u0 = tp; tp += ualloc;
+  u1 = tp; tp += ualloc;
+  u2 = tp; tp += ualloc;
+
+  u1[0] = 1; un = 1;
+
+  ctx.gp = gp;
+  ctx.up = up;
+  ctx.usize = usize;
+
+  /* FIXME: Handle n == 2 differently, after the loop? */
+  while (n >= 2)
+    {
+      struct hgcd_matrix1 M;
+      mp_limb_t ah, al, bh, bl;
+      mp_limb_t mask;
+
+      mask = ap[n-1] | bp[n-1];
+      ASSERT (mask > 0);
+
+      if (mask & GMP_NUMB_HIGHBIT)
+	{
+	  ah = ap[n-1]; al = ap[n-2];
+	  bh = bp[n-1]; bl = bp[n-2];
+	}
+      else if (n == 2)
+	{
+	  /* We use the full inputs without truncation, so we can
+	     safely shift left. */
+	  int shift;
+
+	  count_leading_zeros (shift, mask);
+	  ah = MPN_EXTRACT_NUMB (shift, ap[1], ap[0]);
+	  al = ap[0] << shift;
+	  bh = MPN_EXTRACT_NUMB (shift, bp[1], bp[0]);
+	  bl = bp[0] << shift;
+	}
+      else
+	{
+	  int shift;
+
+	  count_leading_zeros (shift, mask);
+	  ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+	  al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+	  bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+	  bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+	}
+
+      /* Try an mpn_hgcd2 step */
+      if (mpn_hgcd2 (ah, al, bh, bl, &M))
+	{
+	  n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n);
+	  MP_PTR_SWAP (ap, tp);
+	  un = mpn_hgcd_mul_matrix1_vector(&M, u2, u0, u1, un);
+	  MP_PTR_SWAP (u0, u2);
+	}
+      else
+	{
+	  /* mpn_hgcd2 has failed. Then either one of a or b is very
+	     small, or the difference is very small. Perform one
+	     subtraction followed by one division. */
+	  ctx.u0 = u0;
+	  ctx.u1 = u1;
+	  ctx.tp = u2;
+	  ctx.un = un;
+
+	  /* Temporary storage n for the quotient and ualloc for the
+	     new cofactor. */
+	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp);
+	  if (n == 0)
+	    return ctx.gn;
+
+	  un = ctx.un;
+	}
+    }
+  ASSERT_ALWAYS (ap[0] > 0);
+  ASSERT_ALWAYS (bp[0] > 0);
+
+  if (ap[0] == bp[0])
+    {
+      int c;
+
+      /* Which cofactor to return now? Candidates are +u1 and -u0,
+	 depending on which of a and b was most recently reduced,
+	 which we don't keep track of. So compare and get the smallest
+	 one. */
+
+      gp[0] = ap[0];
+
+      MPN_CMP (c, u0, u1, un);
+      ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1));
+      if (c < 0)
+	{
+	  MPN_NORMALIZE (u0, un);
+	  MPN_COPY (up, u0, un);
+	  *usize = -un;
+	}
+      else
+	{
+	  MPN_NORMALIZE_NOT_ZERO (u1, un);
+	  MPN_COPY (up, u1, un);
+	  *usize = un;
+	}
+      return 1;
+    }
+  else
+    {
+      mp_limb_t uh, vh;
+      mp_limb_signed_t u;
+      mp_limb_signed_t v;
+      int negate;
+
+      gp[0] = mpn_gcdext_1 (&u, &v, ap[0], bp[0]);
+
+      /* Set up = u u1 - v u0. Keep track of size, un grows by one or
+	 two limbs. */
+
+      if (u == 0)
+	{
+	  ASSERT (v == 1);
+	  MPN_NORMALIZE (u0, un);
+	  MPN_COPY (up, u0, un);
+	  *usize = -un;
+	  return 1;
+	}
+      else if (v == 0)
+	{
+	  ASSERT (u == 1);
+	  MPN_NORMALIZE (u1, un);
+	  MPN_COPY (up, u1, un);
+	  *usize = un;
+	  return 1;
+	}
+      else if (u > 0)
+	{
+	  negate = 0;
+	  ASSERT (v < 0);
+	  v = -v;
+	}
+      else
+	{
+	  negate = 1;
+	  ASSERT (v > 0);
+	  u = -u;
+	}
+
+      uh = mpn_mul_1 (up, u1, un, u);
+      vh = mpn_addmul_1 (up, u0, un, v);
+
+      if ( (uh | vh) > 0)
+	{
+	  uh += vh;
+	  up[un++] = uh;
+	  if (uh < vh)
+	    up[un++] = 1;
+	}
+
+      MPN_NORMALIZE_NOT_ZERO (up, un);
+
+      *usize = negate ? -un : un;
+      return 1;
+    }
+}
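+
+/* A minimal illustrative sketch, not part of the GMP sources: for the 2x2
+   integer matrix M = (v0, v1; u0, u1) tracked above, when det M = 1 the
+   inverse is exactly (u1, -v1; -u0, v0), which is the relation the comment
+   at the top of mpn_gcdext_lehmer_n relies on.  MATRIX_INV_DEMO and the
+   function name are hypothetical.  */
+#ifdef MATRIX_INV_DEMO
+#include <assert.h>
+
+static void
+matrix_inv_demo (long v0, long v1, long u0, long u1)
+{
+  assert (v0 * u1 - v1 * u0 == 1);	/* unit determinant */
+
+  /* M * M^{-1} must be the identity matrix.  */
+  assert (v0 * u1 + v1 * -u0 == 1);
+  assert (v0 * -v1 + v1 * v0 == 0);
+  assert (u0 * u1 + u1 * -u0 == 0);
+  assert (u0 * -v1 + u1 * v0 == 1);
+}
+#endif /* MATRIX_INV_DEMO */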
diff --git a/third_party/gmp/mpn/generic/get_d.c b/third_party/gmp/mpn/generic/get_d.c
new file mode 100644
index 0000000..9784f21
--- /dev/null
+++ b/third_party/gmp/mpn/generic/get_d.c
@@ -0,0 +1,438 @@
+/* mpn_get_d -- limbs to double conversion.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2003, 2004, 2007, 2009, 2010, 2012, 2018 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "config.h"
+
+#if HAVE_FLOAT_H
+#include <float.h>  /* for DBL_MANT_DIG and FLT_RADIX */
+#endif
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef _GMP_IEEE_FLOATS
+#define _GMP_IEEE_FLOATS 0
+#endif
+
+/* To force use of the generic C code for testing, put
+   "#define _GMP_IEEE_FLOATS 0" at this point.  */
+
+
+/* In alpha gcc prior to 3.4, signed DI comparisons involving constants are
+   rearranged from "x < n" to "x+(-n) < 0", which is of course hopelessly
+   wrong if that addition overflows.
+
+   The workaround here avoids this bug by ensuring n is not a literal constant.
+   Note that this is alpha specific.  The offending transformation is/was in
+   alpha.c alpha_emit_conditional_branch() under "We want to use cmpcc/bcc".
+
+   Bizarrely, this happens also with Cray cc on alphaev5-cray-unicosmk2.0.6.X,
+   and has the same solution.  Don't know why or how.  */
+
+#if HAVE_HOST_CPU_FAMILY_alpha				\
+  && ((defined (__GNUC__) && ! __GMP_GNUC_PREREQ(3,4))	\
+      || defined (_CRAY))
+static volatile const long CONST_1024 = 1024;
+static volatile const long CONST_NEG_1023 = -1023;
+static volatile const long CONST_NEG_1022_SUB_53 = -1022 - 53;
+#else
+#define CONST_1024	      (1024)
+#define CONST_NEG_1023	      (-1023)
+#define CONST_NEG_1022_SUB_53 (-1022 - 53)
+#endif
+
+
+/* Return the value {ptr,size}*2^exp, and negative if sign<0.  Must have
+   size>=1, and a non-zero high limb ptr[size-1].
+
+   When we know the fp format, the result is truncated towards zero.  This is
+   consistent with other gmp conversions, like mpz_set_f or mpz_set_q, and is
+   easy to implement and test.
+
+   When we do not know the format, such truncation seems much harder.  One
+   would need to defeat any rounding mode, including round-up.
+
+   It's felt that GMP is not primarily concerned with hardware floats, and
+   really isn't enhanced by getting involved with hardware rounding modes
+   (which could even be some weird unknown style), so something unambiguous and
+   straightforward is best.
+
+
+   The IEEE code below is the usual case, it knows either a 32-bit or 64-bit
+   limb and is done with shifts and masks.  The 64-bit case in particular
+   should come out nice and compact.
+
+   The generic code used to work one bit at a time, which was not only slow,
+   but implicitly relied upon denorms for intermediates, since the lowest bits'
+   weight of a perfectly valid fp number underflows in non-denorm.  Therefore,
+   the generic code now works limb-per-limb, initially creating a number x such
+   that 1 <= x <= BASE.  (BASE is reached only as result of rounding.)  Then
+   x's exponent is scaled with explicit code (not ldexp to avoid libm
+   dependency).  It is a tap-dance to avoid underflow or overflow, beware!
+
+
+   Traps:
+
+   Hardware traps for overflow to infinity, underflow to zero, or unsupported
+   denorms may or may not be taken.  The IEEE code works bitwise and so
+   probably won't trigger them, the generic code works by float operations and
+   so probably will.  This difference might be thought less than ideal, but
+   again its felt straightforward code is better than trying to get intimate
+   with hardware exceptions (of perhaps unknown nature).
+
+
+   Not done:
+
+   mpz_get_d in the past handled size==1 with a cast limb->double.  This might
+   still be worthwhile there (for up to the mantissa many bits), but for
+   mpn_get_d here, the cost of applying "exp" to the resulting exponent would
+   probably use up any benefit a cast may have over bit twiddling.  Also, if
+   the exponent is pushed into denorm range then bit twiddling is the only
+   option, to ensure the desired truncation is obtained.
+
+
+   Other:
+
+   For reference, note that HPPA 8000, 8200, 8500 and 8600 trap FCNV,UDW,DBL
+   to the kernel for values >= 2^63.  This makes it slow, and worse, the
+   Linux kernel (what versions?) apparently uses untested code in its trap
+   handling routines and gets the sign wrong.  We don't use such a
+   limb-to-double cast, in either the IEEE or the generic code.  */
+
+
+
+#undef FORMAT_RECOGNIZED
+
+double
+mpn_get_d (mp_srcptr up, mp_size_t size, mp_size_t sign, long exp)
+{
+  int lshift, nbits;
+  mp_limb_t x, mhi, mlo;
+
+  ASSERT (size >= 0);
+  ASSERT_MPN (up, size);
+  ASSERT (size == 0 || up[size-1] != 0);
+
+  if (size == 0)
+    return 0.0;
+
+  /* Adjust exp to a radix point just above {up,size}, guarding against
+     overflow.  After this exp can of course be reduced to anywhere within
+     the {up,size} region without underflow.  */
+  if (UNLIKELY ((unsigned long) (GMP_NUMB_BITS * size)
+		> ((unsigned long) LONG_MAX - exp)))
+    {
+#if _GMP_IEEE_FLOATS
+      goto ieee_infinity;
+#endif
+
+      /* generic */
+      exp = LONG_MAX;
+    }
+  else
+    {
+      exp += GMP_NUMB_BITS * size;
+    }
+
+#if _GMP_IEEE_FLOATS
+    {
+      union ieee_double_extract u;
+
+      up += size;
+
+#if GMP_LIMB_BITS == 64
+      mlo = up[-1];
+      count_leading_zeros (lshift, mlo);
+
+      exp -= (lshift - GMP_NAIL_BITS) + 1;
+      mlo <<= lshift;
+
+      nbits = GMP_LIMB_BITS - lshift;
+
+      if (nbits < 53 && size > 1)
+	{
+	  x = up[-2];
+	  x <<= GMP_NAIL_BITS;
+	  x >>= nbits;
+	  mlo |= x;
+	  nbits += GMP_NUMB_BITS;
+
+	  if (LIMBS_PER_DOUBLE >= 3 && nbits < 53 && size > 2)
+	    {
+	      x = up[-3];
+	      x <<= GMP_NAIL_BITS;
+	      x >>= nbits;
+	      mlo |= x;
+	      nbits += GMP_NUMB_BITS;
+	    }
+	}
+      mhi = mlo >> (32 + 11);
+      mlo = mlo >> 11;		/* later implicitly truncated to 32 bits */
+#endif
+#if GMP_LIMB_BITS == 32
+      x = *--up;
+      count_leading_zeros (lshift, x);
+
+      exp -= (lshift - GMP_NAIL_BITS) + 1;
+      x <<= lshift;
+      mhi = x >> 11;
+
+      if (lshift < 11)		/* FIXME: never true if NUMB < 20 bits */
+	{
+	  /* All 20 bits in mhi */
+	  mlo = x << 21;
+	  /* >= 1 bit in mlo */
+	  nbits = GMP_LIMB_BITS - lshift - 21;
+	}
+      else
+	{
+	  if (size > 1)
+	    {
+	      nbits = GMP_LIMB_BITS - lshift;
+
+	      x = *--up, size--;
+	      x <<= GMP_NAIL_BITS;
+	      mhi |= x >> nbits >> 11;
+
+	      mlo = x << GMP_LIMB_BITS - nbits - 11;
+	      nbits = nbits + 11 - GMP_NAIL_BITS;
+	    }
+	  else
+	    {
+	      mlo = 0;
+	      goto done;
+	    }
+	}
+
+      /* Now all needed bits in mhi have been accumulated.  Add bits to mlo.  */
+
+      if (LIMBS_PER_DOUBLE >= 2 && nbits < 32 && size > 1)
+	{
+	  x = up[-1];
+	  x <<= GMP_NAIL_BITS;
+	  x >>= nbits;
+	  mlo |= x;
+	  nbits += GMP_NUMB_BITS;
+
+	  if (LIMBS_PER_DOUBLE >= 3 && nbits < 32 && size > 2)
+	    {
+	      x = up[-2];
+	      x <<= GMP_NAIL_BITS;
+	      x >>= nbits;
+	      mlo |= x;
+	      nbits += GMP_NUMB_BITS;
+
+	      if (LIMBS_PER_DOUBLE >= 4 && nbits < 32 && size > 3)
+		{
+		  x = up[-3];
+		  x <<= GMP_NAIL_BITS;
+		  x >>= nbits;
+		  mlo |= x;
+		  nbits += GMP_NUMB_BITS;
+		}
+	    }
+	}
+
+    done:;
+
+#endif
+      if (UNLIKELY (exp >= CONST_1024))
+	{
+	  /* overflow, return infinity */
+	ieee_infinity:
+	  mhi = 0;
+	  mlo = 0;
+	  exp = 1024;
+	}
+      else if (UNLIKELY (exp <= CONST_NEG_1023))
+	{
+	  int rshift;
+
+	  if (LIKELY (exp <= CONST_NEG_1022_SUB_53))
+	    return 0.0;	 /* denorm underflows to zero */
+
+	  rshift = -1022 - exp;
+	  ASSERT (rshift > 0 && rshift < 53);
+#if GMP_LIMB_BITS > 53
+	  mlo >>= rshift;
+	  mhi = mlo >> 32;
+#else
+	  if (rshift >= 32)
+	    {
+	      mlo = mhi;
+	      mhi = 0;
+	      rshift -= 32;
+	    }
+	  lshift = GMP_LIMB_BITS - rshift;
+	  mlo = (mlo >> rshift) | (rshift == 0 ? 0 : mhi << lshift);
+	  mhi >>= rshift;
+#endif
+	  exp = -1023;
+	}
+      u.s.manh = mhi;
+      u.s.manl = mlo;
+      u.s.exp = exp + 1023;
+      u.s.sig = (sign < 0);
+      return u.d;
+    }
+#define FORMAT_RECOGNIZED 1
+#endif
+
+#if HAVE_DOUBLE_VAX_D
+    {
+      union double_extract u;
+
+      up += size;
+
+      mhi = up[-1];
+
+      count_leading_zeros (lshift, mhi);
+      exp -= lshift;
+      mhi <<= lshift;
+
+      mlo = 0;
+      if (size > 1)
+	{
+	  mlo = up[-2];
+	  if (lshift != 0)
+	    mhi += mlo >> (GMP_LIMB_BITS - lshift);
+	  mlo <<= lshift;
+
+	  if (size > 2 && lshift > 8)
+	    {
+	      x = up[-3];
+	      mlo += x >> (GMP_LIMB_BITS - lshift);
+	    }
+	}
+
+      if (UNLIKELY (exp >= 128))
+	{
+	  /* overflow, return maximum number */
+	  mhi = 0xffffffff;
+	  mlo = 0xffffffff;
+	  exp = 127;
+	}
+      else if (UNLIKELY (exp < -128))
+	{
+	  return 0.0;	 /* underflows to zero */
+	}
+
+      u.s.man3 = mhi >> 24;	/* drop msb, since implicit */
+      u.s.man2 = mhi >> 8;
+      u.s.man1 = (mhi << 8) + (mlo >> 24);
+      u.s.man0 = mlo >> 8;
+      u.s.exp = exp + 128;
+      u.s.sig = sign < 0;
+      return u.d;
+    }
+#define FORMAT_RECOGNIZED 1
+#endif
+
+#if ! FORMAT_RECOGNIZED
+
+#if !defined(GMP_DBL_MANT_BITS)
+#if defined(DBL_MANT_DIG) && FLT_RADIX == 2
+#define GMP_DBL_MANT_BITS DBL_MANT_DIG
+#else
+/* FIXME: Choose a smarter default value. */
+#define GMP_DBL_MANT_BITS (16 * sizeof (double))
+#endif
+#endif
+
+    { /* Non-IEEE or strange limb size, generically convert
+	 GMP_DBL_MANT_BITS bits. */
+      mp_limb_t l;
+      int m;
+      mp_size_t i;
+      double d, weight;
+      unsigned long uexp;
+
+      /* First generate an fp number disregarding exp, instead keeping things
+	 within the numb base factor from 1, which should prevent overflow and
+	 underflow even for the most exponent limited fp formats.  */
+      i = size - 1;
+      l = up[i];
+      count_leading_zeros (m, l);
+      m = m + GMP_DBL_MANT_BITS - GMP_LIMB_BITS;
+      if (m < 0)
+	l &= GMP_NUMB_MAX << -m;
+      d = l;
+      for (weight = 1/MP_BASE_AS_DOUBLE; m > 0 && --i >= 0;)
+	{
+	  l = up[i];
+	  m -= GMP_NUMB_BITS;
+	  if (m < 0)
+	    l &= GMP_NUMB_MAX << -m;
+	  d += l * weight;
+	  weight /= MP_BASE_AS_DOUBLE;
+	  if (weight == 0)
+	    break;
+	}
+
+      /* Now apply exp.  */
+      exp -= GMP_NUMB_BITS;
+      if (exp > 0)
+	{
+	  weight = 2.0;
+	  uexp = exp;
+	}
+      else
+	{
+	  weight = 0.5;
+	  uexp = NEG_CAST (unsigned long, exp);
+	}
+#if 1
+      /* Square-and-multiply exponentiation.  */
+      if (uexp & 1)
+	d *= weight;
+      while (uexp >>= 1)
+	{
+	  weight *= weight;
+	  if (uexp & 1)
+	    d *= weight;
+	}
+#else
+      /* Plain exponentiation.  */
+      while (uexp > 0)
+	{
+	  d *= weight;
+	  uexp--;
+	}
+#endif
+
+      return sign >= 0 ? d : -d;
+    }
+#endif
+}
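+
+/* A minimal illustrative sketch, not part of the GMP sources: the
+   square-and-multiply scaling used by the generic path above, isolated as a
+   routine computing d * 2^exp without calling ldexp (and hence without a
+   libm dependency).  SCALE_DEMO and the function name are hypothetical.  */
+#ifdef SCALE_DEMO
+static double
+scale_demo (double d, long exp)
+{
+  double weight;
+  unsigned long uexp;
+
+  if (exp > 0)
+    {
+      weight = 2.0;
+      uexp = exp;
+    }
+  else
+    {
+      weight = 0.5;
+      uexp = - (unsigned long) exp;	/* safe even for LONG_MIN */
+    }
+
+  /* Multiply d by weight^uexp, squaring weight once per exponent bit.  */
+  if (uexp & 1)
+    d *= weight;
+  while (uexp >>= 1)
+    {
+      weight *= weight;
+      if (uexp & 1)
+	d *= weight;
+    }
+  return d;
+}
+#endif /* SCALE_DEMO */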
diff --git a/third_party/gmp/mpn/generic/get_str.c b/third_party/gmp/mpn/generic/get_str.c
new file mode 100644
index 0000000..19cc581
--- /dev/null
+++ b/third_party/gmp/mpn/generic/get_str.c
@@ -0,0 +1,451 @@
+/* mpn_get_str -- Convert {UP,USIZE} to a base BASE string in STR.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE, EXCEPT mpn_get_str, ARE INTERNAL WITH MUTABLE
+   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
+   FUTURE GNU MP RELEASE.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Conversion of U {up,un} to a string in base b.  Internally, we convert to
+   base B = b^m, the largest power of b that fits a limb.  Basic algorithms:
+
+  A) Divide U repeatedly by B, generating a quotient and remainder, until the
+     quotient becomes zero.  The remainders hold the converted digits.  Digits
+     come out from right to left.  (Used in mpn_bc_get_str.)
+
+  B) Divide U by b^g, for g such that 1/b <= U/b^g < 1, generating a fraction.
+     Then develop digits by multiplying the fraction repeatedly by b.  Digits
+     come out from left to right.  (Currently not used herein, except in
+     code for converting single limbs to individual digits.)
+
+  C) Compute B^1, B^2, B^4, ..., B^s, for s such that B^s is just above
+     sqrt(U).  Then divide U by B^s, generating quotient and remainder.
+     Recursively convert the quotient, then the remainder, using the
+     precomputed powers.  Digits come out from left to right.  (Used in
+     mpn_dc_get_str.)
+
+  When using algorithm C, algorithm B might be suitable for basecase code,
+  since the required b^g power will be readily accessible.
+
+  Optimization ideas:
+  1. The recursive function of (C) could use less temporary memory.  The powtab
+     allocation could be trimmed with some computation, and the tmp area could
+     be reduced, or perhaps eliminated if up is reused for both quotient and
+     remainder (it is currently used just for remainder).
+  2. Store the powers of (C) in normalized form, with the normalization count.
+     Quotients will usually need to be left-shifted before each divide, and
+     remainders will either need to be left-shifted or right-shifted.
+  3. In the code for developing digits from a single limb, we could avoid using
+     a full umul_ppmm except for the first (or first few) digits, provided base
+     is even.  Subsequent digits can be developed using plain multiplication.
+     (This saves on register-starved machines (read x86) and on all machines
+     that generate the upper product half using a separate instruction (alpha,
+     powerpc, IA-64) or lack such support altogether (sparc64, hppa64).)
+  4. Separate mpn_dc_get_str basecase code from code for small conversions. The
+     former code will have the exact right power readily available in the
+     powtab parameter for dividing the current number into a fraction.  Convert
+     that using algorithm B.
+  5. Completely avoid division.  Compute the inverses of the powers now in
+     powtab instead of the actual powers.
+  6. Decrease powtab allocation for even bases.  E.g. for base 10 we could save
+     about 30% (1-log(5)/log(10)).
+
+  Basic structure of (C):
+    mpn_get_str:
+      if POW2_P (n)
+	...
+      else
+	if (un < GET_STR_PRECOMPUTE_THRESHOLD)
+	  mpn_bx_get_str (str, base, up, un);
+	else
+	  precompute_power_tables
+	  mpn_dc_get_str
+
+    mpn_dc_get_str:
+	mpn_tdiv_qr
+	if (qn < GET_STR_DC_THRESHOLD)
+	  mpn_bc_get_str
+	else
+	  mpn_dc_get_str
+	if (rn < GET_STR_DC_THRESHOLD)
+	  mpn_bc_get_str
+	else
+	  mpn_dc_get_str
+
+
+  The reason for the two threshold values is the cost of
+  precompute_power_tables.  GET_STR_PRECOMPUTE_THRESHOLD will be
+  considerably larger than GET_STR_DC_THRESHOLD.  */
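+
+/* A minimal illustrative sketch, not part of the GMP sources: algorithm (A)
+   above in its simplest single-word form, dividing repeatedly by the base so
+   digit values (not ASCII) come out right to left.  The real code batches the
+   divisions by B = b^m and then splits each remainder into m digits.
+   TO_STRING_DEMO and the function name are hypothetical.  */
+#ifdef TO_STRING_DEMO
+#include <stdint.h>
+#include <string.h>
+
+static size_t
+to_string_demo (unsigned char *str, uint64_t u, unsigned base)
+{
+  unsigned char buf[64];		/* 64 digits suffice for base >= 2 */
+  unsigned char *s = buf + sizeof buf;
+  size_t n;
+
+  do
+    {
+      *--s = (unsigned char) (u % base);	/* next digit, right to left */
+      u /= base;
+    }
+  while (u != 0);
+
+  n = buf + sizeof buf - s;
+  memcpy (str, s, n);
+  return n;
+}
+#endif /* TO_STRING_DEMO */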
+
+
+/* The x86s and m68020 have a quotient and remainder "div" instruction and
+   gcc recognises an adjacent "/" and "%" can be combined using that.
+   Elsewhere "/" and "%" are either separate instructions, or separate
+   libgcc calls (which unfortunately gcc as of version 3.0 doesn't combine).
+   A multiply and subtract should be faster than a "%" in those cases.  */
+#if HAVE_HOST_CPU_FAMILY_x86            \
+  || HAVE_HOST_CPU_m68020               \
+  || HAVE_HOST_CPU_m68030               \
+  || HAVE_HOST_CPU_m68040               \
+  || HAVE_HOST_CPU_m68060               \
+  || HAVE_HOST_CPU_m68360 /* CPU32 */
+#define udiv_qrnd_unnorm(q,r,n,d)       \
+  do {                                  \
+    mp_limb_t  __q = (n) / (d);         \
+    mp_limb_t  __r = (n) % (d);         \
+    (q) = __q;                          \
+    (r) = __r;                          \
+  } while (0)
+#else
+#define udiv_qrnd_unnorm(q,r,n,d)       \
+  do {                                  \
+    mp_limb_t  __q = (n) / (d);         \
+    mp_limb_t  __r = (n) - __q*(d);     \
+    (q) = __q;                          \
+    (r) = __r;                          \
+  } while (0)
+#endif
+
+
+/* Convert {up,un} to a string in base base, and put the result in str.
+   Generate len characters, possibly padding with zeros to the left.  If len is
+   zero, generate as many characters as required.  Return a pointer immediately
+   after the last digit of the result string.  Complexity is O(un^2); intended
+   for small conversions.  */
+static unsigned char *
+mpn_bc_get_str (unsigned char *str, size_t len,
+		mp_ptr up, mp_size_t un, int base)
+{
+  mp_limb_t rl, ul;
+  unsigned char *s;
+  size_t l;
+  /* Allocate memory for largest possible string, given that we only get here
+     for operands with un < GET_STR_PRECOMPUTE_THRESHOLD and that the smallest
+     base is 3.  7/11 is an approximation to 1/log2(3).  */
+#if TUNE_PROGRAM_BUILD
+#define BUF_ALLOC (GET_STR_THRESHOLD_LIMIT * GMP_LIMB_BITS * 7 / 11)
+#else
+#define BUF_ALLOC (GET_STR_PRECOMPUTE_THRESHOLD * GMP_LIMB_BITS * 7 / 11)
+#endif
+  unsigned char buf[BUF_ALLOC];
+#if TUNE_PROGRAM_BUILD
+  mp_limb_t rp[GET_STR_THRESHOLD_LIMIT];
+#else
+  mp_limb_t rp[GET_STR_PRECOMPUTE_THRESHOLD];
+#endif
+
+  if (base == 10)
+    {
+      /* Special case code for base==10 so that the compiler has a chance to
+	 optimize things.  */
+
+      MPN_COPY (rp + 1, up, un);
+
+      s = buf + BUF_ALLOC;
+      while (un > 1)
+	{
+	  int i;
+	  mp_limb_t frac, digit;
+	  MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un,
+					 MP_BASES_BIG_BASE_10,
+					 MP_BASES_BIG_BASE_INVERTED_10,
+					 MP_BASES_NORMALIZATION_STEPS_10);
+	  un -= rp[un] == 0;
+	  frac = (rp[0] + 1) << GMP_NAIL_BITS;
+	  s -= MP_BASES_CHARS_PER_LIMB_10;
+#if HAVE_HOST_CPU_FAMILY_x86
+	  /* The code below turns out to be a bit slower for x86 using gcc.
+	     Use plain code.  */
+	  i = MP_BASES_CHARS_PER_LIMB_10;
+	  do
+	    {
+	      umul_ppmm (digit, frac, frac, 10);
+	      *s++ = digit;
+	    }
+	  while (--i);
+#else
+	  /* Use the fact that 10 in binary is 1010, with the lowest bit 0.
+	     After a few umul_ppmm, we will have accumulated enough low zeros
+	     to use a plain multiply.  */
+	  if (MP_BASES_NORMALIZATION_STEPS_10 == 0)
+	    {
+	      umul_ppmm (digit, frac, frac, 10);
+	      *s++ = digit;
+	    }
+	  if (MP_BASES_NORMALIZATION_STEPS_10 <= 1)
+	    {
+	      umul_ppmm (digit, frac, frac, 10);
+	      *s++ = digit;
+	    }
+	  if (MP_BASES_NORMALIZATION_STEPS_10 <= 2)
+	    {
+	      umul_ppmm (digit, frac, frac, 10);
+	      *s++ = digit;
+	    }
+	  if (MP_BASES_NORMALIZATION_STEPS_10 <= 3)
+	    {
+	      umul_ppmm (digit, frac, frac, 10);
+	      *s++ = digit;
+	    }
+	  i = (MP_BASES_CHARS_PER_LIMB_10 - ((MP_BASES_NORMALIZATION_STEPS_10 < 4)
+					     ? (4-MP_BASES_NORMALIZATION_STEPS_10)
+					     : 0));
+	  frac = (frac + 0xf) >> 4;
+	  do
+	    {
+	      frac *= 10;
+	      digit = frac >> (GMP_LIMB_BITS - 4);
+	      *s++ = digit;
+	      frac &= (~(mp_limb_t) 0) >> 4;
+	    }
+	  while (--i);
+#endif
+	  s -= MP_BASES_CHARS_PER_LIMB_10;
+	}
+
+      ul = rp[1];
+      while (ul != 0)
+	{
+	  udiv_qrnd_unnorm (ul, rl, ul, 10);
+	  *--s = rl;
+	}
+    }
+  else /* not base 10 */
+    {
+      unsigned chars_per_limb;
+      mp_limb_t big_base, big_base_inverted;
+      unsigned normalization_steps;
+
+      chars_per_limb = mp_bases[base].chars_per_limb;
+      big_base = mp_bases[base].big_base;
+      big_base_inverted = mp_bases[base].big_base_inverted;
+      count_leading_zeros (normalization_steps, big_base);
+
+      MPN_COPY (rp + 1, up, un);
+
+      s = buf + BUF_ALLOC;
+      while (un > 1)
+	{
+	  int i;
+	  mp_limb_t frac;
+	  MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un,
+					 big_base, big_base_inverted,
+					 normalization_steps);
+	  un -= rp[un] == 0;
+	  frac = (rp[0] + 1) << GMP_NAIL_BITS;
+	  s -= chars_per_limb;
+	  i = chars_per_limb;
+	  do
+	    {
+	      mp_limb_t digit;
+	      umul_ppmm (digit, frac, frac, base);
+	      *s++ = digit;
+	    }
+	  while (--i);
+	  s -= chars_per_limb;
+	}
+
+      ul = rp[1];
+      while (ul != 0)
+	{
+	  udiv_qrnd_unnorm (ul, rl, ul, base);
+	  *--s = rl;
+	}
+    }
+
+  l = buf + BUF_ALLOC - s;
+  while (l < len)
+    {
+      *str++ = 0;
+      len--;
+    }
+  while (l != 0)
+    {
+      *str++ = *s++;
+      l--;
+    }
+  return str;
+}
+
+
+/* Convert {UP,UN} to a string with a base as represented in POWTAB, and put
+   the string in STR.  Generate LEN characters, possibly padding with zeros to
+   the left.  If LEN is zero, generate as many characters as required.
+   Return a pointer immediately after the last digit of the result string.
+   This uses divide-and-conquer and is intended for large conversions.  */
+static unsigned char *
+mpn_dc_get_str (unsigned char *str, size_t len,
+		mp_ptr up, mp_size_t un,
+		const powers_t *powtab, mp_ptr tmp)
+{
+  if (BELOW_THRESHOLD (un, GET_STR_DC_THRESHOLD))
+    {
+      if (un != 0)
+	str = mpn_bc_get_str (str, len, up, un, powtab->base);
+      else
+	{
+	  while (len != 0)
+	    {
+	      *str++ = 0;
+	      len--;
+	    }
+	}
+    }
+  else
+    {
+      mp_ptr pwp, qp, rp;
+      mp_size_t pwn, qn;
+      mp_size_t sn;
+
+      pwp = powtab->p;
+      pwn = powtab->n;
+      sn = powtab->shift;
+
+      if (un < pwn + sn || (un == pwn + sn && mpn_cmp (up + sn, pwp, un - sn) < 0))
+	{
+	  str = mpn_dc_get_str (str, len, up, un, powtab - 1, tmp);
+	}
+      else
+	{
+	  qp = tmp;		/* (un - pwn + 1) limbs for qp */
+	  rp = up;		/* pwn limbs for rp; overwrite up area */
+
+	  mpn_tdiv_qr (qp, rp + sn, 0L, up + sn, un - sn, pwp, pwn);
+	  qn = un - sn - pwn; qn += qp[qn] != 0;		/* quotient size */
+
+	  ASSERT (qn < pwn + sn || (qn == pwn + sn && mpn_cmp (qp + sn, pwp, pwn) < 0));
+
+	  if (len != 0)
+	    len = len - powtab->digits_in_base;
+
+	  str = mpn_dc_get_str (str, len, qp, qn, powtab - 1, tmp + qn);
+	  str = mpn_dc_get_str (str, powtab->digits_in_base, rp, pwn + sn, powtab - 1, tmp);
+	}
+    }
+  return str;
+}
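
In decimal terms: converting 123056 with a power of 10^3 from POWTAB splits it as q = 123, r = 56; the quotient is converted first, and the remainder is then converted with len = powtab->digits_in_base = 3, so it comes out zero-padded as 056 and the two halves concatenate to 123056. (Values illustrative; the real table entries are powers of big_base combined with limb shifts.)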
+
+/* There are no leading zeros on the digits generated at str, but that's not
+   currently a documented feature.  The current mpz_out_str and mpz_get_str
+   rely on it.  */
+
+size_t
+mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un)
+{
+  mp_ptr powtab_mem;
+  powers_t powtab[GMP_LIMB_BITS];
+  int pi;
+  size_t out_len;
+  mp_ptr tmp;
+  TMP_DECL;
+
+  /* Special case zero, as the code below doesn't handle it.  */
+  if (un == 0)
+    {
+      str[0] = 0;
+      return 1;
+    }
+
+  if (POW2_P (base))
+    {
+      /* The base is a power of 2.  Convert from most significant end.  */
+      mp_limb_t n1, n0;
+      int bits_per_digit = mp_bases[base].big_base;
+      int cnt;
+      int bit_pos;
+      mp_size_t i;
+      unsigned char *s = str;
+      mp_bitcnt_t bits;
+
+      n1 = up[un - 1];
+      count_leading_zeros (cnt, n1);
+
+      /* BIT_POS should be R when input ends in least significant nibble,
+	 R + bits_per_digit * n when input ends in nth least significant
+	 nibble. */
+
+      bits = (mp_bitcnt_t) GMP_NUMB_BITS * un - cnt + GMP_NAIL_BITS;
+      cnt = bits % bits_per_digit;
+      if (cnt != 0)
+	bits += bits_per_digit - cnt;
+      bit_pos = bits - (mp_bitcnt_t) (un - 1) * GMP_NUMB_BITS;
+
+      /* Fast loop for bit output.  */
+      i = un - 1;
+      for (;;)
+	{
+	  bit_pos -= bits_per_digit;
+	  while (bit_pos >= 0)
+	    {
+	      *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1);
+	      bit_pos -= bits_per_digit;
+	    }
+	  i--;
+	  if (i < 0)
+	    break;
+	  n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1);
+	  n1 = up[i];
+	  bit_pos += GMP_NUMB_BITS;
+	  *s++ = n0 | (n1 >> bit_pos);
+	}
+
+      return s - str;
+    }
+
+  /* General case.  The base is not a power of 2.  */
+
+  if (BELOW_THRESHOLD (un, GET_STR_PRECOMPUTE_THRESHOLD))
+    return mpn_bc_get_str (str, (size_t) 0, up, un, base) - str;
+
+  TMP_MARK;
+
+  /* Allocate one large block for the powers of big_base.  */
+  powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
+
+  /* Compute a table of powers, where the largest power is >= sqrt(U).  */
+  size_t ndig;
+  mp_size_t xn;
+  DIGITS_IN_BASE_PER_LIMB (ndig, un, base);
+  xn = 1 + ndig / mp_bases[base].chars_per_limb; /* FIXME: scalar integer division */
+
+  pi = 1 + mpn_compute_powtab (powtab, powtab_mem, xn, base);
+
+  /* Using our precomputed powers, now in powtab[], convert our number.  */
+  tmp = TMP_BALLOC_LIMBS (mpn_dc_get_str_itch (un));
+  out_len = mpn_dc_get_str (str, 0, up, un, powtab + (pi - 1), tmp) - str;
+  TMP_FREE;
+
+  return out_len;
+}
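
mpn_get_str hands back raw digit values in 0..base-1, most significant first and, per the note above, without leading zeros; mapping to characters is the caller's job. A usage sketch against the public mpn interface (buffer sized generously for the demo):

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t up[1] = { 123456789 }; /* may be clobbered by the call */
  unsigned char digits[64];        /* ample for one limb in base 10 */
  size_t n, i;

  n = mpn_get_str (digits, 10, up, 1);
  for (i = 0; i < n; i++)
    putchar ('0' + digits[i]);     /* raw values -> ASCII */
  putchar ('\n');                  /* prints 123456789 */
  return 0;
}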
diff --git a/third_party/gmp/mpn/generic/gmp-mparam.h b/third_party/gmp/mpn/generic/gmp-mparam.h
new file mode 100644
index 0000000..7dc057a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/gmp-mparam.h
@@ -0,0 +1,33 @@
+/* Generic C gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* Values for GMP_LIMB_BITS etc will be determined by ./configure and put
+   in config.h. */
diff --git a/third_party/gmp/mpn/generic/hgcd.c b/third_party/gmp/mpn/generic/hgcd.c
new file mode 100644
index 0000000..e3e9c66
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd.c
@@ -0,0 +1,182 @@
+/* hgcd.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Size analysis for hgcd:
+
+   For the recursive calls, we have n1 <= ceil(n / 2). Then the
+   storage need is determined by the storage for the recursive call
+   computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1
+   (after this, the storage needed for M1 can be recycled).
+
+   Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1)
+   = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2,
+   and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total,
+   4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12.
+
+   For the recursive call, we need S(n1) = S(ceil(n/2)).
+
+   S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2))
+	<= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k))
+	<= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k))
+	<= 20 ceil(n/4) + 22k + S(ceil(n/2^k))
+*/
+
+mp_size_t
+mpn_hgcd_itch (mp_size_t n)
+{
+  unsigned k;
+  int count;
+  mp_size_t nscaled;
+
+  if (BELOW_THRESHOLD (n, HGCD_THRESHOLD))
+    return n;
+
+  /* Get the recursion depth. */
+  nscaled = (n - 1) / (HGCD_THRESHOLD - 1);
+  count_leading_zeros (count, nscaled);
+  k = GMP_LIMB_BITS - count;
+
+  return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD;
+}
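
As a concrete check of the depth computation (HGCD_THRESHOLD is tuning-dependent; assume 101 here, with 64-bit limbs): n = 1000 gives nscaled = 999/100 = 9, count = 60 and k = 4, matching the four halvings 1000 -> 500 -> 250 -> 125 -> 63 needed to get below the threshold.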
+
+/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
+   with elements of size at most (n+1)/2 - 1. Returns new size of a,
+   b, or zero if no reduction is possible. */
+
+mp_size_t
+mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n,
+	  struct hgcd_matrix *M, mp_ptr tp)
+{
+  mp_size_t s = n/2 + 1;
+
+  mp_size_t nn;
+  int success = 0;
+
+  if (n <= s)
+    /* Happens when n <= 2, a fairly uninteresting case but exercised
+       by the random inputs of the testsuite. */
+    return 0;
+
+  ASSERT ((ap[n-1] | bp[n-1]) > 0);
+
+  ASSERT ((n+1)/2 - 1 < M->alloc);
+
+  if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD))
+    {
+      mp_size_t n2 = (3*n)/4 + 1;
+      mp_size_t p = n/2;
+
+      nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
+      if (nn)
+	{
+	  n = nn;
+	  success = 1;
+	}
+
+      /* NOTE: It appears this loop never runs more than once (at
+	 least when not recursing to hgcd_appr). */
+      while (n > n2)
+	{
+	  /* Needs n + 1 storage */
+	  nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+	  if (!nn)
+	    return success ? n : 0;
+
+	  n = nn;
+	  success = 1;
+	}
+
+      if (n > s + 2)
+	{
+	  struct hgcd_matrix M1;
+	  mp_size_t scratch;
+
+	  p = 2*s - n + 1;
+	  scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
+
+	  mpn_hgcd_matrix_init(&M1, n - p, tp);
+
+	  /* FIXME: Should use hgcd_reduce, but that may require more
+	     scratch space, which requires review. */
+
+	  nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch);
+	  if (nn > 0)
+	    {
+	      /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
+	      ASSERT (M->n + 2 >= M1.n);
+
+	      /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
+		 then either q or q + 1 is a correct quotient, and M1 will
+		 start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
+		 rules out the case that the size of M * M1 is much
+		 smaller than the expected M->n + M1->n. */
+
+	      ASSERT (M->n + M1.n < M->alloc);
+
+	      /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
+		 = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
+	      n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
+
+	      /* We need a bound for M->n + M1.n. Let n be the original
+		 input size. Then
+
+		 ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
+
+		 and it follows that
+
+		 M.n + M1.n <= ceil(n/2) + 1
+
+		 Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
+		 amount of needed scratch space. */
+	      mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
+	      success = 1;
+	    }
+	}
+    }
+
+  for (;;)
+    {
+      /* Needs s+3 < n */
+      nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+      if (!nn)
+	return success ? n : 0;
+
+      n = nn;
+      success = 1;
+    }
+}
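
For reference, a sketch of the calling convention, using the internal gmp-impl.h interfaces the way the files in this patch do (not buildable against the public headers):

static void
hgcd_call_sketch (mp_ptr ap, mp_ptr bp, mp_size_t n)
{
  struct hgcd_matrix M;
  mp_ptr tp;
  mp_size_t nn;
  TMP_DECL;

  TMP_MARK;
  mpn_hgcd_matrix_init (&M, n,
			TMP_BALLOC_LIMBS (MPN_HGCD_MATRIX_INIT_ITCH (n)));
  tp = TMP_BALLOC_LIMBS (mpn_hgcd_itch (n));

  nn = mpn_hgcd (ap, bp, n, &M, tp); /* 0 means no reduction possible */
  if (nn > 0)
    {
      /* {ap,nn} and {bp,nn} are now reduced; M holds the accumulated
	 quotients, with elements of size at most (n+1)/2 - 1.  */
    }
  TMP_FREE;
}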
diff --git a/third_party/gmp/mpn/generic/hgcd2.c b/third_party/gmp/mpn/generic/hgcd2.c
new file mode 100644
index 0000000..3fa4012
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd2.c
@@ -0,0 +1,734 @@
+/* hgcd2.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef HGCD2_DIV1_METHOD
+#define HGCD2_DIV1_METHOD 3
+#endif
+
+#ifndef HGCD2_DIV2_METHOD
+#define HGCD2_DIV2_METHOD 2
+#endif
+
+#if GMP_NAIL_BITS != 0
+#error Nails not implemented
+#endif
+
+#if HAVE_NATIVE_mpn_div_11
+
+#define div1 mpn_div_11
+/* Single-limb division optimized for small quotients.
+   Returned value holds d0 = r, d1 = q. */
+mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
+
+#elif HGCD2_DIV1_METHOD == 1
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  mp_double_limb_t res;
+  res.d1 = n0 / d0;
+  res.d0 = n0 - res.d1 * d0;
+
+  return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 2
+
+static mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  mp_double_limb_t res;
+  int ncnt, dcnt, cnt;
+  mp_limb_t q;
+  mp_limb_t mask;
+
+  ASSERT (n0 >= d0);
+
+  count_leading_zeros (ncnt, n0);
+  count_leading_zeros (dcnt, d0);
+  cnt = dcnt - ncnt;
+
+  d0 <<= cnt;
+
+  q = -(mp_limb_t) (n0 >= d0);
+  n0 -= d0 & q;
+  d0 >>= 1;
+  q = -q;
+
+  while (--cnt >= 0)
+    {
+      mask = -(mp_limb_t) (n0 >= d0);
+      n0 -= d0 & mask;
+      d0 >>= 1;
+      q = (q << 1) - mask;
+    }
+
+  res.d0 = n0;
+  res.d1 = q;
+  return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 3
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  mp_double_limb_t res;
+  if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
+      || UNLIKELY (n0 >= (d0 << 3)))
+    {
+      res.d1 = n0 / d0;
+      res.d0 = n0 - res.d1 * d0;
+    }
+  else
+    {
+      mp_limb_t q, mask;
+
+      d0 <<= 2;
+
+      mask = -(mp_limb_t) (n0 >= d0);
+      n0 -= d0 & mask;
+      q = 4 & mask;
+
+      d0 >>= 1;
+      mask = -(mp_limb_t) (n0 >= d0);
+      n0 -= d0 & mask;
+      q += 2 & mask;
+
+      d0 >>= 1;
+      mask = -(mp_limb_t) (n0 >= d0);
+      n0 -= d0 & mask;
+      q -= mask;
+
+      res.d0 = n0;
+      res.d1 = q;
+    }
+  return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 4
+
+/* Table quotients.  We extract the NBITS most significant bits of the
+   numerator limb, and the corresponding bits from the divisor limb, and use
+   these to form an index into the table.  This method is probably only useful
+   for short pipelines with slow multiplication.
+
+   Possible improvements:
+
+   * Perhaps extract the highest NBITS of the divisor instead of the same bits
+     as from the numerator.  That would require another count_leading_zeros,
+     and a post-multiply shift of the quotient.
+
+   * Compress tables?  Their values are tiny, and there are lots of zero
+     entries (which are never used).
+
+   * Round the table entries more cleverly?
+*/
+
+#ifndef NBITS
+#define NBITS 5
+#endif
+
+#if NBITS == 5
+/* This needs full division about 13.2% of the time. */
+static const unsigned char tab[512] = {
+17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
+23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
+24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
+26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
+30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
+31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
+32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+#elif NBITS == 6
+/* This needs full division about 9.8% of the time. */
+static const unsigned char tab[2048] = {
+33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
+53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
+54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
+55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
+56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
+57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
+58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
+62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
+63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
+64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
+ 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+};
+#else
+#error No table for provided NBITS
+#endif
+
+static const unsigned char *tabp = tab - (1 << (NBITS - 1) << NBITS);
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  int ncnt;
+  size_t nbi, dbi;
+  mp_limb_t q0;
+  mp_limb_t r0;
+  mp_limb_t mask;
+  mp_double_limb_t res;
+
+  ASSERT (n0 >= d0);		/* Actually only msb position is critical. */
+
+  count_leading_zeros (ncnt, n0);
+  nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
+  dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
+
+  q0 = tabp[(nbi << NBITS) + dbi];
+  r0 = n0 - q0 * d0;
+  mask = -(mp_limb_t) (r0 >= d0);
+  q0 -= mask;
+  r0 -= d0 & mask;
+
+  if (UNLIKELY (r0 >= d0))
+    {
+      q0 = n0 / d0;
+      r0 = n0 - q0 * d0;
+    }
+
+  res.d1 = q0;
+  res.d0 = r0;
+  return res;
+}
+
+#elif HGCD2_DIV1_METHOD == 5
+
+/* Table inverses of divisors.  We don't bother with suppressing the msb from
+   the tables.  We index with the NBITS most significant divisor bits,
+   including the always-set highest bit, but use addressing trickery via tabp
+   to suppress it.
+
+   Possible improvements:
+
+   * Do first multiply using 32-bit operations on 64-bit computers.  At least
+     on most Arm64 cores, that uses a third of the resources.  It also
+     saves resources on many x86-64 processors.
+*/
+
+#ifndef NBITS
+#define NBITS 7
+#endif
+
+#if NBITS == 5
+/* This needs full division about 1.63% of the time. */
+static const unsigned char tab[16] = {
+ 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 6
+/* This needs full division about 0.93% of the time. */
+static const unsigned char tab[32] = {
+127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
+ 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 7
+/* This needs full division about 0.49% of the time. */
+static const unsigned char tab[64] = {
+255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
+203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
+169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
+145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
+};
+static const unsigned char *tabp = tab - (1 << (NBITS - 1));
+#elif NBITS == 8
+/* This needs full division about 0.26% of the time. */
+static const unsigned short tab[128] = {
+511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
+454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
+408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
+371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
+340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
+314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
+291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
+272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
+};
+static const unsigned short *tabp = tab - (1 << (NBITS - 1));
+#else
+#error No table for provided NBITS
+#endif
+
+static inline mp_double_limb_t
+div1 (mp_limb_t n0, mp_limb_t d0)
+{
+  int ncnt, dcnt;
+  size_t dbi;
+  mp_limb_t inv;
+  mp_limb_t q0;
+  mp_limb_t r0;
+  mp_limb_t mask;
+  mp_double_limb_t res;
+
+  count_leading_zeros (ncnt, n0);
+  count_leading_zeros (dcnt, d0);
+
+  dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
+  inv = tabp[dbi];
+  q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
+  r0 = n0 - q0 * d0;
+  mask = -(mp_limb_t) (r0 >= d0);
+  q0 -= mask;
+  r0 -= d0 & mask;
+
+  if (UNLIKELY (r0 >= d0))
+    {
+      q0 = n0 / d0;
+      r0 = n0 - q0 * d0;
+    }
+
+  res.d1 = q0;
+  res.d0 = r0;
+  return res;
+}
+
+#else
+#error Unknown HGCD2_DIV1_METHOD
+#endif
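
All the div1 variants above (and div2 below) lean on the same branch-free idiom: mask = -(mp_limb_t) (n0 >= d0) is all-ones when the test holds and zero otherwise, so n0 -= d0 & mask subtracts conditionally and q -= mask bumps the quotient, with no branch. A self-contained illustration in plain C:

#include <stdio.h>
#include <stdint.h>

int
main (void)
{
  uint64_t n = 100, d = 30, q = 0;

  /* One conditional-subtract step: mask is ~0 iff n >= d.  */
  uint64_t mask = -(uint64_t) (n >= d);
  n -= d & mask; /* subtraction taken: n = 70 */
  q -= mask;     /* q - (-1), so q = 1        */

  printf ("n=%llu q=%llu\n",
          (unsigned long long) n, (unsigned long long) q);
  return 0;
}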
+
+#if HAVE_NATIVE_mpn_div_22
+
+#define div2 mpn_div_22
+/* Two-limb division optimized for small quotients.  */
+mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
+
+#elif HGCD2_DIV2_METHOD == 1
+
+static mp_limb_t
+div2 (mp_ptr rp,
+      mp_limb_t n1, mp_limb_t n0,
+      mp_limb_t d1, mp_limb_t d0)
+{
+  mp_double_limb_t rq = div1 (n1, d1);
+  if (UNLIKELY (rq.d1 > d1))
+    {
+      mp_limb_t n2, q, t1, t0;
+      int c;
+
+      /* Normalize */
+      count_leading_zeros (c, d1);
+      ASSERT (c > 0);
+
+      n2 = n1 >> (GMP_LIMB_BITS - c);
+      n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
+      n0 <<= c;
+      d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
+      d0 <<= c;
+
+      udiv_qrnnd (q, n1, n2, n1, d1);
+      umul_ppmm (t1, t0, q, d0);
+      if (t1 > n1 || (t1 == n1 && t0 > n0))
+	{
+	  ASSERT (q > 0);
+	  q--;
+	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
+	}
+      sub_ddmmss (n1, n0, n1, n0, t1, t0);
+
+      /* Undo normalization */
+      rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
+      rp[1] = n1 >> c;
+
+      return q;
+    }
+  else
+    {
+      mp_limb_t q, t1, t0;
+      n1 = rq.d0;
+      q = rq.d1;
+      umul_ppmm (t1, t0, q, d0);
+      if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
+	{
+	  ASSERT (q > 0);
+	  q--;
+	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
+	}
+      sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
+      return q;
+    }
+}
+
+#elif HGCD2_DIV2_METHOD == 2
+
+/* Bit-wise div2. Relies on fast count_leading_zeros. */
+static mp_limb_t
+div2 (mp_ptr rp,
+      mp_limb_t n1, mp_limb_t n0,
+      mp_limb_t d1, mp_limb_t d0)
+{
+  mp_limb_t q = 0;
+  int ncnt;
+  int dcnt;
+
+  count_leading_zeros (ncnt, n1);
+  count_leading_zeros (dcnt, d1);
+  dcnt -= ncnt;
+
+  d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
+  d0 <<= dcnt;
+
+  do
+    {
+      mp_limb_t mask;
+      q <<= 1;
+      if (UNLIKELY (n1 == d1))
+	mask = -(n0 >= d0);
+      else
+	mask = -(n1 > d1);
+
+      q -= mask;
+
+      sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
+
+      d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
+      d1 = d1 >> 1;
+    }
+  while (dcnt--);
+
+  rp[0] = n0;
+  rp[1] = n1;
+
+  return q;
+}
+#else
+#error Unknown HGCD2_DIV2_METHOD
+#endif
+
+/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
+   matrix M. Returns 1 if we make progress, i.e. can perform at least
+   one subtraction. Otherwise returns zero. */
+
+/* FIXME: Possible optimizations:
+
+   The div2 function starts with checking the most significant bit of
+   the numerator. We could maintain normalized operands here, calling
+   hgcd with normalized operands only, which should make the code
+   simpler and possibly faster.
+
+   Experiment with table lookups on the most significant bits.
+
+   This function is also a candidate for assembler implementation.
+*/
+int
+mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
+	   struct hgcd_matrix1 *M)
+{
+  mp_limb_t u00, u01, u10, u11;
+
+  if (ah < 2 || bh < 2)
+    return 0;
+
+  if (ah > bh || (ah == bh && al > bl))
+    {
+      sub_ddmmss (ah, al, ah, al, bh, bl);
+      if (ah < 2)
+	return 0;
+
+      u00 = u01 = u11 = 1;
+      u10 = 0;
+    }
+  else
+    {
+      sub_ddmmss (bh, bl, bh, bl, ah, al);
+      if (bh < 2)
+	return 0;
+
+      u00 = u10 = u11 = 1;
+      u01 = 0;
+    }
+
+  if (ah < bh)
+    goto subtract_a;
+
+  for (;;)
+    {
+      ASSERT (ah >= bh);
+      if (ah == bh)
+	goto done;
+
+      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
+	{
+	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
+	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
+
+	  break;
+	}
+
+      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
+	 1), affecting the second column of M. */
+      ASSERT (ah > bh);
+      sub_ddmmss (ah, al, ah, al, bh, bl);
+
+      if (ah < 2)
+	goto done;
+
+      if (ah <= bh)
+	{
+	  /* Use q = 1 */
+	  u01 += u00;
+	  u11 += u10;
+	}
+      else
+	{
+	  mp_limb_t r[2];
+	  mp_limb_t q = div2 (r, ah, al, bh, bl);
+	  al = r[0]; ah = r[1];
+	  if (ah < 2)
+	    {
+	      /* A is too small, but q is correct. */
+	      u01 += q * u00;
+	      u11 += q * u10;
+	      goto done;
+	    }
+	  q++;
+	  u01 += q * u00;
+	  u11 += q * u10;
+	}
+    subtract_a:
+      ASSERT (bh >= ah);
+      if (ah == bh)
+	goto done;
+
+      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
+	{
+	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
+	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
+
+	  goto subtract_a1;
+	}
+
+      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
+	 1), affecting the first column of M. */
+      sub_ddmmss (bh, bl, bh, bl, ah, al);
+
+      if (bh < 2)
+	goto done;
+
+      if (bh <= ah)
+	{
+	  /* Use q = 1 */
+	  u00 += u01;
+	  u10 += u11;
+	}
+      else
+	{
+	  mp_limb_t r[2];
+	  mp_limb_t q = div2 (r, bh, bl, ah, al);
+	  bl = r[0]; bh = r[1];
+	  if (bh < 2)
+	    {
+	      /* B is too small, but q is correct. */
+	      u00 += q * u01;
+	      u10 += q * u11;
+	      goto done;
+	    }
+	  q++;
+	  u00 += q * u01;
+	  u10 += q * u11;
+	}
+    }
+
+  /* NOTE: Since we discard the least significant half limb, we don't get a
+     truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
+  /* Single precision loop */
+  for (;;)
+    {
+      ASSERT (ah >= bh);
+
+      ah -= bh;
+      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
+	break;
+
+      if (ah <= bh)
+	{
+	  /* Use q = 1 */
+	  u01 += u00;
+	  u11 += u10;
+	}
+      else
+	{
+	  mp_double_limb_t rq = div1 (ah, bh);
+	  mp_limb_t q = rq.d1;
+	  ah = rq.d0;
+
+	  if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
+	    {
+	      /* A is too small, but q is correct. */
+	      u01 += q * u00;
+	      u11 += q * u10;
+	      break;
+	    }
+	  q++;
+	  u01 += q * u00;
+	  u11 += q * u10;
+	}
+    subtract_a1:
+      ASSERT (bh >= ah);
+
+      bh -= ah;
+      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
+	break;
+
+      if (bh <= ah)
+	{
+	  /* Use q = 1 */
+	  u00 += u01;
+	  u10 += u11;
+	}
+      else
+	{
+	  mp_double_limb_t rq = div1 (bh, ah);
+	  mp_limb_t q = rq.d1;
+	  bh = rq.d0;
+
+	  if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
+	    {
+	      /* B is too small, but q is correct. */
+	      u00 += q * u01;
+	      u10 += q * u11;
+	      break;
+	    }
+	  q++;
+	  u00 += q * u01;
+	  u10 += q * u11;
+	}
+    }
+
+ done:
+  M->u[0][0] = u00; M->u[0][1] = u01;
+  M->u[1][0] = u10; M->u[1][1] = u11;
+
+  return 1;
+}
+
+/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
+ * have space for n + 1 limbs. Uses three buffers to avoid a copy.  */
+mp_size_t
+mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
+			     mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
+{
+  mp_limb_t ah, bh;
+
+  /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
+
+     r  = u00 * a
+     r += u10 * b
+     b *= u11
+     b += u01 * a
+  */
+
+#if HAVE_NATIVE_mpn_addaddmul_1msb0
+  ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
+  bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
+#else
+  ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);
+  ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
+
+  bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);
+  bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
+#endif
+  rp[n] = ah;
+  bp[n] = bh;
+
+  n += (ah | bh) > 0;
+  return n;
+}
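
The fallback branch pairs mpn_mul_1 with mpn_addmul_1, both public mpn entry points, so the pattern can be exercised standalone. A small sketch computing r = 3*a + 5*b on two-limb vectors (illustrative values):

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  mp_limb_t a[2] = { 7, 1 }; /* a = 1*B + 7, with B = 2^GMP_NUMB_BITS */
  mp_limb_t b[2] = { 2, 4 }; /* b = 4*B + 2 */
  mp_limb_t r[3];

  r[2]  = mpn_mul_1 (r, a, 2, 3);    /* r  = 3*a, high limb returned */
  r[2] += mpn_addmul_1 (r, b, 2, 5); /* r += 5*b, carry accumulated  */

  /* 3*a + 5*b = 23*B + 31, so this prints 31 23 0.  */
  printf ("%lu %lu %lu\n", (unsigned long) r[0],
          (unsigned long) r[1], (unsigned long) r[2]);
  return 0;
}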
diff --git a/third_party/gmp/mpn/generic/hgcd2_jacobi.c b/third_party/gmp/mpn/generic/hgcd2_jacobi.c
new file mode 100644
index 0000000..98e079b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd2_jacobi.c
@@ -0,0 +1,365 @@
+/* hgcd2_jacobi.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#if GMP_NAIL_BITS > 0
+#error Nails not supported.
+#endif
+
+/* FIXME: Duplicated in hgcd2.c. Should move to gmp-impl.h, and
+   possibly be renamed. */
+static inline mp_limb_t
+div1 (mp_ptr rp,
+      mp_limb_t n0,
+      mp_limb_t d0)
+{
+  mp_limb_t q = 0;
+
+  if ((mp_limb_signed_t) n0 < 0)
+    {
+      int cnt;
+      for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++)
+	{
+	  d0 = d0 << 1;
+	}
+
+      q = 0;
+      while (cnt)
+	{
+	  q <<= 1;
+	  if (n0 >= d0)
+	    {
+	      n0 = n0 - d0;
+	      q |= 1;
+	    }
+	  d0 = d0 >> 1;
+	  cnt--;
+	}
+    }
+  else
+    {
+      int cnt;
+      for (cnt = 0; n0 >= d0; cnt++)
+	{
+	  d0 = d0 << 1;
+	}
+
+      q = 0;
+      while (cnt)
+	{
+	  d0 = d0 >> 1;
+	  q <<= 1;
+	  if (n0 >= d0)
+	    {
+	      n0 = n0 - d0;
+	      q |= 1;
+	    }
+	  cnt--;
+	}
+    }
+  *rp = n0;
+  return q;
+}
+
+/* Two-limb division optimized for small quotients.  */
+static inline mp_limb_t
+div2 (mp_ptr rp,
+      mp_limb_t nh, mp_limb_t nl,
+      mp_limb_t dh, mp_limb_t dl)
+{
+  mp_limb_t q = 0;
+
+  if ((mp_limb_signed_t) nh < 0)
+    {
+      int cnt;
+      for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++)
+	{
+	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
+	  dl = dl << 1;
+	}
+
+      while (cnt)
+	{
+	  q <<= 1;
+	  if (nh > dh || (nh == dh && nl >= dl))
+	    {
+	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
+	      q |= 1;
+	    }
+	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
+	  dh = dh >> 1;
+	  cnt--;
+	}
+    }
+  else
+    {
+      int cnt;
+      for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++)
+	{
+	  dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1));
+	  dl = dl << 1;
+	}
+
+      while (cnt)
+	{
+	  dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1);
+	  dh = dh >> 1;
+	  q <<= 1;
+	  if (nh > dh || (nh == dh && nl >= dl))
+	    {
+	      sub_ddmmss (nh, nl, nh, nl, dh, dl);
+	      q |= 1;
+	    }
+	  cnt--;
+	}
+    }
+
+  rp[0] = nl;
+  rp[1] = nh;
+
+  return q;
+}
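
Both helpers are classic restoring (shift-and-subtract) division: align the divisor with the numerator's top bit, then walk back down, subtracting where possible and collecting one quotient bit per step. A single-word sketch of the same loop (hypothetical demo; the extra branch the real code has for numerators with the top bit set is omitted):

#include <stdio.h>
#include <stdint.h>

/* Restoring division, assuming n < 2^63 so alignment cannot overflow.  */
static uint64_t
div1_demo (uint64_t *rp, uint64_t n, uint64_t d)
{
  int cnt;
  uint64_t q = 0;

  for (cnt = 0; n >= d; cnt++) /* align d just above n */
    d <<= 1;

  while (cnt--)
    {
      d >>= 1;
      q <<= 1;
      if (n >= d)
        {
          n -= d;
          q |= 1;
        }
    }
  *rp = n;
  return q;
}

int
main (void)
{
  uint64_t r, q = div1_demo (&r, 1000, 37);
  printf ("q=%llu r=%llu\n", /* q=27 r=1 */
          (unsigned long long) q, (unsigned long long) r);
  return 0;
}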
+
+int
+mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
+		  struct hgcd_matrix1 *M, unsigned *bitsp)
+{
+  mp_limb_t u00, u01, u10, u11;
+  unsigned bits = *bitsp;
+
+  if (ah < 2 || bh < 2)
+    return 0;
+
+  if (ah > bh || (ah == bh && al > bl))
+    {
+      sub_ddmmss (ah, al, ah, al, bh, bl);
+      if (ah < 2)
+	return 0;
+
+      u00 = u01 = u11 = 1;
+      u10 = 0;
+      bits = mpn_jacobi_update (bits, 1, 1);
+    }
+  else
+    {
+      sub_ddmmss (bh, bl, bh, bl, ah, al);
+      if (bh < 2)
+	return 0;
+
+      u00 = u10 = u11 = 1;
+      u01 = 0;
+      bits = mpn_jacobi_update (bits, 0, 1);
+    }
+
+  if (ah < bh)
+    goto subtract_a;
+
+  for (;;)
+    {
+      ASSERT (ah >= bh);
+      if (ah == bh)
+	goto done;
+
+      if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
+	{
+	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
+	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
+
+	  break;
+	}
+
+      /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
+	 1), affecting the second column of M. */
+      ASSERT (ah > bh);
+      sub_ddmmss (ah, al, ah, al, bh, bl);
+
+      if (ah < 2)
+	goto done;
+
+      if (ah <= bh)
+	{
+	  /* Use q = 1 */
+	  u01 += u00;
+	  u11 += u10;
+	  bits = mpn_jacobi_update (bits, 1, 1);
+	}
+      else
+	{
+	  mp_limb_t r[2];
+	  mp_limb_t q = div2 (r, ah, al, bh, bl);
+	  al = r[0]; ah = r[1];
+	  if (ah < 2)
+	    {
+	      /* A is too small, but q is correct. */
+	      u01 += q * u00;
+	      u11 += q * u10;
+	      bits = mpn_jacobi_update (bits, 1, q & 3);
+	      goto done;
+	    }
+	  q++;
+	  u01 += q * u00;
+	  u11 += q * u10;
+	  bits = mpn_jacobi_update (bits, 1, q & 3);
+	}
+    subtract_a:
+      ASSERT (bh >= ah);
+      if (ah == bh)
+	goto done;
+
+      if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
+	{
+	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
+	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
+
+	  goto subtract_a1;
+	}
+
+      /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
+	 1), affecting the first column of M. */
+      sub_ddmmss (bh, bl, bh, bl, ah, al);
+
+      if (bh < 2)
+	goto done;
+
+      if (bh <= ah)
+	{
+	  /* Use q = 1 */
+	  u00 += u01;
+	  u10 += u11;
+	  bits = mpn_jacobi_update (bits, 0, 1);
+	}
+      else
+	{
+	  mp_limb_t r[2];
+	  mp_limb_t q = div2 (r, bh, bl, ah, al);
+	  bl = r[0]; bh = r[1];
+	  if (bh < 2)
+	    {
+	      /* B is too small, but q is correct. */
+	      u00 += q * u01;
+	      u10 += q * u11;
+	      bits = mpn_jacobi_update (bits, 0, q & 3);
+	      goto done;
+	    }
+	  q++;
+	  u00 += q * u01;
+	  u10 += q * u11;
+	  bits = mpn_jacobi_update (bits, 0, q & 3);
+	}
+    }
+
+  /* NOTE: Since we discard the least significant half limb, we don't
+     get a truly maximal M (corresponding to |a - b| <
+     2^{GMP_LIMB_BITS +1}). */
+  /* Single precision loop */
+  for (;;)
+    {
+      ASSERT (ah >= bh);
+      if (ah == bh)
+	break;
+
+      ah -= bh;
+      if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
+	break;
+
+      if (ah <= bh)
+	{
+	  /* Use q = 1 */
+	  u01 += u00;
+	  u11 += u10;
+	  bits = mpn_jacobi_update (bits, 1, 1);
+	}
+      else
+	{
+	  mp_limb_t r;
+	  mp_limb_t q = div1 (&r, ah, bh);
+	  ah = r;
+	  if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
+	    {
+	      /* A is too small, but q is correct. */
+	      u01 += q * u00;
+	      u11 += q * u10;
+	      bits = mpn_jacobi_update (bits, 1, q & 3);
+	      break;
+	    }
+	  q++;
+	  u01 += q * u00;
+	  u11 += q * u10;
+	  bits = mpn_jacobi_update (bits, 1, q & 3);
+	}
+    subtract_a1:
+      ASSERT (bh >= ah);
+      if (ah == bh)
+	break;
+
+      bh -= ah;
+      if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
+	break;
+
+      if (bh <= ah)
+	{
+	  /* Use q = 1 */
+	  u00 += u01;
+	  u10 += u11;
+	  bits = mpn_jacobi_update (bits, 0, 1);
+	}
+      else
+	{
+	  mp_limb_t r;
+	  mp_limb_t q = div1 (&r, bh, ah);
+	  bh = r;
+	  if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
+	    {
+	      /* B is too small, but q is correct. */
+	      u00 += q * u01;
+	      u10 += q * u11;
+	      bits = mpn_jacobi_update (bits, 0, q & 3);
+	      break;
+	    }
+	  q++;
+	  u00 += q * u01;
+	  u10 += q * u11;
+	  bits = mpn_jacobi_update (bits, 0, q & 3);
+	}
+    }
+
+ done:
+  M->u[0][0] = u00; M->u[0][1] = u01;
+  M->u[1][0] = u10; M->u[1][1] = u11;
+  *bitsp = bits;
+
+  return 1;
+}
diff --git a/third_party/gmp/mpn/generic/hgcd_appr.c b/third_party/gmp/mpn/generic/hgcd_appr.c
new file mode 100644
index 0000000..bb01738
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd_appr.c
@@ -0,0 +1,267 @@
+/* hgcd_appr.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add
+   HGCD_THRESHOLD at the end? */
+mp_size_t
+mpn_hgcd_appr_itch (mp_size_t n)
+{
+  if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
+    return n;
+  else
+    {
+      unsigned k;
+      int count;
+      mp_size_t nscaled;
+
+      /* Get the recursion depth. */
+      nscaled = (n - 1) / (HGCD_APPR_THRESHOLD - 1);
+      count_leading_zeros (count, nscaled);
+      k = GMP_LIMB_BITS - count;
+
+      return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD;
+    }
+}
+
+/* Destroys inputs. */
+int
+mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n,
+	       struct hgcd_matrix *M, mp_ptr tp)
+{
+  mp_size_t s;
+  int success = 0;
+
+  ASSERT (n > 0);
+
+  ASSERT ((ap[n-1] | bp[n-1]) != 0);
+
+  if (n <= 2)
+    /* Implies s = n. A fairly uninteresting case but exercised by the
+       random inputs of the testsuite. */
+    return 0;
+
+  ASSERT ((n+1)/2 - 1 < M->alloc);
+
+  /* We aim for reduction to GMP_NUMB_BITS * s bits. But each time
+     we discard some of the least significant limbs, we must keep one
+     additional bit to account for the truncation error. We maintain
+     GMP_NUMB_BITS * s - extra_bits as the current target size. */
+
+  s = n/2 + 1;
+  if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD))
+    {
+      unsigned extra_bits = 0;
+
+      while (n > 2)
+	{
+	  mp_size_t nn;
+
+	  ASSERT (n > s);
+	  ASSERT (n <= 2*s);
+
+	  nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+	  if (!nn)
+	    break;
+
+	  n = nn;
+	  success = 1;
+
+	  /* We can truncate and discard the lower p bits whenever nbits <=
+	     2*sbits - p. To account for the truncation error, we must
+	     adjust
+
+	     sbits <-- sbits + 1 - p,
+
+	     rather than just sbits <-- sbits - p. This adjustment makes
+	     the produced matrix slightly smaller than it could be. */
+
+	  if (GMP_NUMB_BITS * (n + 1) + 2 * extra_bits <= 2*GMP_NUMB_BITS * s)
+	    {
+	      mp_size_t p = (GMP_NUMB_BITS * (2*s - n) - 2*extra_bits) / GMP_NUMB_BITS;
+
+	      if (extra_bits == 0)
+		{
+		  /* We cross a limb boundary and bump s. We can't do that
+		     if the result is that it makes min(U, V)
+		     smaller than 2^{GMP_NUMB_BITS} s. */
+		  if (s + 1 == n
+		      || mpn_zero_p (ap + s + 1, n - s - 1)
+		      || mpn_zero_p (bp + s + 1, n - s - 1))
+		    continue;
+
+		  extra_bits = GMP_NUMB_BITS - 1;
+		  s++;
+		}
+	      else
+		{
+		  extra_bits--;
+		}
+
+	      /* Drop the p least significant limbs */
+	      ap += p; bp += p; n -= p; s -= p;
+	    }
+	}
+
+      ASSERT (s > 0);
+
+      if (extra_bits > 0)
+	{
+	  /* We can get here only if we have dropped at least one of the least
+	     significant bits, so we can decrement ap and bp. We can then shift
+	     left by extra_bits using mpn_rshift. */
+	  /* NOTE: In the unlikely case that n is large, it would be preferable
+	     to do an initial subdiv step to reduce the size before shifting,
+	     but that would mean duplicating mpn_gcd_subdiv_step with a bit
+	     count rather than a limb count. */
+	  ap--; bp--;
+	  ap[0] = mpn_rshift (ap+1, ap+1, n, GMP_NUMB_BITS - extra_bits);
+	  bp[0] = mpn_rshift (bp+1, bp+1, n, GMP_NUMB_BITS - extra_bits);
+	  n += (ap[n] | bp[n]) > 0;
+
+	  ASSERT (success);
+
+	  while (n > 2)
+	    {
+	      mp_size_t nn;
+
+	      ASSERT (n > s);
+	      ASSERT (n <= 2*s);
+
+	      nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+
+	      if (!nn)
+		return 1;
+
+	      n = nn;
+	    }
+	}
+
+      if (n == 2)
+	{
+	  struct hgcd_matrix1 M1;
+	  ASSERT (s == 1);
+
+	  if (mpn_hgcd2 (ap[1], ap[0], bp[1], bp[0], &M1))
+	    {
+	      /* Multiply M <- M * M1 */
+	      mpn_hgcd_matrix_mul_1 (M, &M1, tp);
+	      success = 1;
+	    }
+	}
+      return success;
+    }
+  else
+    {
+      mp_size_t n2 = (3*n)/4 + 1;
+      mp_size_t p = n/2;
+      mp_size_t nn;
+
+      nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp);
+      if (nn)
+	{
+	  n = nn;
+	  /* FIXME: Discard some of the low limbs immediately? */
+	  success = 1;
+	}
+
+      while (n > n2)
+	{
+	  mp_size_t nn;
+
+	  /* Needs n + 1 storage */
+	  nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+	  if (!nn)
+	    return success;
+
+	  n = nn;
+	  success = 1;
+	}
+      if (n > s + 2)
+	{
+	  struct hgcd_matrix M1;
+	  mp_size_t scratch;
+
+	  p = 2*s - n + 1;
+	  scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
+
+	  mpn_hgcd_matrix_init(&M1, n - p, tp);
+	  if (mpn_hgcd_appr (ap + p, bp + p, n - p, &M1, tp + scratch))
+	    {
+	      /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
+	      ASSERT (M->n + 2 >= M1.n);
+
+	      /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
+		 then either q or q + 1 is a correct quotient, and M1 will
+		 start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
+		 rules out the case that the size of M * M1 is much
+		 smaller than the expected M->n + M1->n. */
+
+	      ASSERT (M->n + M1.n < M->alloc);
+
+	      /* We need a bound for M->n + M1.n. Let n be the original
+		 input size. Then
+
+		 ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
+
+		 and it follows that
+
+		 M.n + M1.n <= ceil(n/2) + 1
+
+		 Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
+		 amount of needed scratch space. */
+	      mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
+	      return 1;
+	    }
+	}
+
+      for(;;)
+	{
+	  mp_size_t nn;
+
+	  ASSERT (n > s);
+	  ASSERT (n <= 2*s);
+
+	  nn = mpn_hgcd_step (n, ap, bp, s, M, tp);
+
+	  if (!nn)
+	    return success;
+
+	  n = nn;
+	  success = 1;
+	}
+    }
+}
diff --git a/third_party/gmp/mpn/generic/hgcd_jacobi.c b/third_party/gmp/mpn/generic/hgcd_jacobi.c
new file mode 100644
index 0000000..24014ce
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd_jacobi.c
@@ -0,0 +1,243 @@
+/* hgcd_jacobi.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* This file is almost a copy of hgcd.c, with some added calls to
+   mpn_jacobi_update. */
+
+struct hgcd_jacobi_ctx
+{
+  struct hgcd_matrix *M;
+  unsigned *bitsp;
+};
+
+static void
+hgcd_jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn,
+		  mp_srcptr qp, mp_size_t qn, int d)
+{
+  ASSERT (!gp);
+  ASSERT (d >= 0);
+
+  MPN_NORMALIZE (qp, qn);
+  if (qn > 0)
+    {
+      struct hgcd_jacobi_ctx *ctx = (struct hgcd_jacobi_ctx *) p;
+      /* NOTES: This is a bit ugly. A tp area is passed to
+	 gcd_subdiv_step, which stores q at the start of that area. We
+	 now use the rest. */
+      mp_ptr tp = (mp_ptr) qp + qn;
+
+      mpn_hgcd_matrix_update_q (ctx->M, qp, qn, d, tp);
+      *ctx->bitsp = mpn_jacobi_update (*ctx->bitsp, d, qp[0] & 3);
+    }
+}
+
+/* Perform a few steps, using some combination of mpn_hgcd2, subtraction and
+   division. Reduces the size by almost one limb or more, but never
+   below the given size s. Return new size for a and b, or 0 if no
+   more steps are possible.
+
+   If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
+   limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
+   fails, needs space for the quotient, qn <= n - s + 1 limbs, for and
+   hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
+   resulting size of M.
+
+   If N is the input size to the calling hgcd, then s = floor(N/2) +
+   1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1
+   < N, so N is sufficient.
+*/
+
+static mp_size_t
+hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
+		  struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp)
+{
+  struct hgcd_matrix1 M1;
+  mp_limb_t mask;
+  mp_limb_t ah, al, bh, bl;
+
+  ASSERT (n > s);
+
+  mask = ap[n-1] | bp[n-1];
+  ASSERT (mask > 0);
+
+  if (n == s + 1)
+    {
+      if (mask < 4)
+	goto subtract;
+
+      ah = ap[n-1]; al = ap[n-2];
+      bh = bp[n-1]; bl = bp[n-2];
+    }
+  else if (mask & GMP_NUMB_HIGHBIT)
+    {
+      ah = ap[n-1]; al = ap[n-2];
+      bh = bp[n-1]; bl = bp[n-2];
+    }
+  else
+    {
+      int shift;
+
+      count_leading_zeros (shift, mask);
+      ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+      al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+      bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+      bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+    }
+
+  /* Try an mpn_hgcd2 step */
+  if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M1, bitsp))
+    {
+      /* Multiply M <- M * M1 */
+      mpn_hgcd_matrix_mul_1 (M, &M1, tp);
+
+      /* Can't swap inputs, so we need to copy. */
+      MPN_COPY (tp, ap, n);
+      /* Multiply M1^{-1} (a;b) */
+      return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
+    }
+
+ subtract:
+  {
+    struct hgcd_jacobi_ctx ctx;
+    ctx.M = M;
+    ctx.bitsp = bitsp;
+
+    return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp);
+  }
+}
+
+/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M
+   with elements of size at most (n+1)/2 - 1. Returns new size of a,
+   b, or zero if no reduction is possible. */
+
+/* Same scratch requirements as for mpn_hgcd. */
+mp_size_t
+mpn_hgcd_jacobi (mp_ptr ap, mp_ptr bp, mp_size_t n,
+		 struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp)
+{
+  mp_size_t s = n/2 + 1;
+
+  mp_size_t nn;
+  int success = 0;
+
+  if (n <= s)
+    /* Happens when n <= 2, a fairly uninteresting case but exercised
+       by the random inputs of the testsuite. */
+    return 0;
+
+  ASSERT ((ap[n-1] | bp[n-1]) > 0);
+
+  ASSERT ((n+1)/2 - 1 < M->alloc);
+
+  if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD))
+    {
+      mp_size_t n2 = (3*n)/4 + 1;
+      mp_size_t p = n/2;
+
+      nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, M, bitsp, tp);
+      if (nn > 0)
+	{
+	  /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
+	     = 2 (n - 1) */
+	  n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
+	  success = 1;
+	}
+      while (n > n2)
+	{
+	  /* Needs n + 1 storage */
+	  nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp);
+	  if (!nn)
+	    return success ? n : 0;
+	  n = nn;
+	  success = 1;
+	}
+
+      if (n > s + 2)
+	{
+	  struct hgcd_matrix M1;
+	  mp_size_t scratch;
+
+	  p = 2*s - n + 1;
+	  scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p);
+
+	  mpn_hgcd_matrix_init(&M1, n - p, tp);
+	  nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M1, bitsp, tp + scratch);
+	  if (nn > 0)
+	    {
+	      /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */
+	      ASSERT (M->n + 2 >= M1.n);
+
+	      /* Furthermore, assume M ends with a quotient (1, q; 0, 1),
+		 then either q or q + 1 is a correct quotient, and M1 will
+		 start with either (1, 0; 1, 1) or (2, 1; 1, 1). This
+		 rules out the case that the size of M * M1 is much
+		 smaller than the expected M->n + M1->n. */
+
+	      ASSERT (M->n + M1.n < M->alloc);
+
+	      /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1)
+		 = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */
+	      n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch);
+
+	      /* We need a bound for M->n + M1.n. Let n be the original
+		 input size. Then
+
+		 ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2
+
+		 and it follows that
+
+		 M.n + M1.n <= ceil(n/2) + 1
+
+		 Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the
+		 amount of needed scratch space. */
+	      mpn_hgcd_matrix_mul (M, &M1, tp + scratch);
+	      success = 1;
+	    }
+	}
+    }
+
+  for (;;)
+    {
+      /* Needs s+3 < n */
+      nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp);
+      if (!nn)
+	return success ? n : 0;
+
+      n = nn;
+      success = 1;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/hgcd_matrix.c b/third_party/gmp/mpn/generic/hgcd_matrix.c
new file mode 100644
index 0000000..54c795d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd_matrix.c
@@ -0,0 +1,265 @@
+/* hgcd_matrix.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003-2005, 2008, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* For input of size n, matrix elements are of size at most ceil(n/2)
+   - 1, but we need two limbs extra. */
+void
+mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p)
+{
+  mp_size_t s = (n+1)/2 + 1;
+  M->alloc = s;
+  M->n = 1;
+  MPN_ZERO (p, 4 * s);
+  M->p[0][0] = p;
+  M->p[0][1] = p + s;
+  M->p[1][0] = p + 2 * s;
+  M->p[1][1] = p + 3 * s;
+
+  M->p[0][0][0] = M->p[1][1][0] = 1;
+}
+
+/* Update column COL, adding in Q * column (1-COL). Temporary storage:
+ * qn + n <= M->alloc, where n is the size of the largest element in
+ * column 1 - COL. */
+void
+mpn_hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn,
+			  unsigned col, mp_ptr tp)
+{
+  ASSERT (col < 2);
+
+  if (qn == 1)
+    {
+      mp_limb_t q = qp[0];
+      mp_limb_t c0, c1;
+
+      c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q);
+      c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q);
+
+      M->p[0][col][M->n] = c0;
+      M->p[1][col][M->n] = c1;
+
+      M->n += (c0 | c1) != 0;
+    }
+  else
+    {
+      unsigned row;
+
+      /* Carries for the unlikely case that we get both high words
+	 from the multiplication and carries from the addition. */
+      mp_limb_t c[2];
+      mp_size_t n;
+
+      /* The matrix will not necessarily grow in size by qn, so we
+	 need normalization in order not to overflow M. */
+
+      for (n = M->n; n + qn > M->n; n--)
+	{
+	  ASSERT (n > 0);
+	  if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0)
+	    break;
+	}
+
+      ASSERT (qn + n <= M->alloc);
+
+      for (row = 0; row < 2; row++)
+	{
+	  if (qn <= n)
+	    mpn_mul (tp, M->p[row][1-col], n, qp, qn);
+	  else
+	    mpn_mul (tp, qp, qn, M->p[row][1-col], n);
+
+	  ASSERT (n + qn >= M->n);
+	  c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n);
+	}
+
+      n += qn;
+
+      if (c[0] | c[1])
+	{
+	  M->p[0][col][n] = c[0];
+	  M->p[1][col][n] = c[1];
+	  n++;
+	}
+      else
+	{
+	  n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0;
+	  ASSERT (n >= M->n);
+	}
+      M->n = n;
+    }
+
+  ASSERT (M->n < M->alloc);
+}
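+
+/* Illustrative note, not part of GMP: for col == 1 the update above is
+   the column operation of a right multiplication by (1, q; 0, 1),
+
+     (m00 m01) (1 q)   (m00  m00*q + m01)
+     (m10 m11) (0 1) = (m10  m10*q + m11)
+
+   i.e. column 1 gains q times column 0; col == 0 is the mirrored case,
+   multiplying by (1, 0; q, 1) on the right.  */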
+
+/* Multiply M by M1 from the right. Since the M1 elements fit in
+   GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs M->n
+   limbs of temporary space. */
+void
+mpn_hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1,
+		       mp_ptr tp)
+{
+  mp_size_t n0, n1;
+
+  /* Could avoid copy by some swapping of pointers. */
+  MPN_COPY (tp, M->p[0][0], M->n);
+  n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n);
+  MPN_COPY (tp, M->p[1][0], M->n);
+  n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n);
+
+  /* Depends on zero initialization */
+  M->n = MAX(n0, n1);
+  ASSERT (M->n < M->alloc);
+}
+
+/* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs
+   of temporary storage (see mpn_matrix22_mul_itch). */
+void
+mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1,
+		     mp_ptr tp)
+{
+  mp_size_t n;
+
+  /* About the new size of M's elements. Since M1's diagonal elements
+     are > 0, no element can decrease. The new elements are of size
+     M->n + M1->n, one limb more or less. The computation of the
+     matrix product produces elements of size M->n + M1->n + 1. But
+     the true size, after normalization, may be three limbs smaller.
+
+     The reason that the product has normalized size >= M->n + M1->n -
+     2 is subtle. It depends on the fact that M and M1 can be factored
+     as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have
+     M ending with a large power and M1 starting with a large power of
+     the same matrix. */
+
+  /* FIXME: Strassen multiplication gives only a small speedup. In FFT
+     multiplication range, this function could be sped up quite a lot
+     using invariance. */
+  ASSERT (M->n + M1->n < M->alloc);
+
+  ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1]
+	   | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0);
+
+  ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1]
+	   | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0);
+
+  mpn_matrix22_mul (M->p[0][0], M->p[0][1],
+		    M->p[1][0], M->p[1][1], M->n,
+		    M1->p[0][0], M1->p[0][1],
+		    M1->p[1][0], M1->p[1][1], M1->n, tp);
+
+  /* Index of last potentially non-zero limb, size is one greater. */
+  n = M->n + M1->n;
+
+  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
+  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
+  n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0);
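+  /* At most three conditional decrements are needed, matching the
+     "three limbs smaller" normalization bound noted above. */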
+
+  ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0);
+
+  M->n = n + 1;
+}
+
+/* Multiplies the least significant p limbs of (a;b) by M^-1.
+   Temporary space needed: 2 * (p + M->n) */
+mp_size_t
+mpn_hgcd_matrix_adjust (const struct hgcd_matrix *M,
+			mp_size_t n, mp_ptr ap, mp_ptr bp,
+			mp_size_t p, mp_ptr tp)
+{
+  /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a; b)
+     = (r11 a - r01 b; -r10 a + r00 b) */
+
+  mp_ptr t0 = tp;
+  mp_ptr t1 = tp + p + M->n;
+  mp_limb_t ah, bh;
+  mp_limb_t cy;
+
+  ASSERT (p + M->n  < n);
+
+  /* First compute the two values depending on a, before overwriting a */
+
+  if (M->n >= p)
+    {
+      mpn_mul (t0, M->p[1][1], M->n, ap, p);
+      mpn_mul (t1, M->p[1][0], M->n, ap, p);
+    }
+  else
+    {
+      mpn_mul (t0, ap, p, M->p[1][1], M->n);
+      mpn_mul (t1, ap, p, M->p[1][0], M->n);
+    }
+
+  /* Update a */
+  MPN_COPY (ap, t0, p);
+  ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n);
+
+  if (M->n >= p)
+    mpn_mul (t0, M->p[0][1], M->n, bp, p);
+  else
+    mpn_mul (t0, bp, p, M->p[0][1], M->n);
+
+  cy = mpn_sub (ap, ap, n, t0, p + M->n);
+  ASSERT (cy <= ah);
+  ah -= cy;
+
+  /* Update b */
+  if (M->n >= p)
+    mpn_mul (t0, M->p[0][0], M->n, bp, p);
+  else
+    mpn_mul (t0, bp, p, M->p[0][0], M->n);
+
+  MPN_COPY (bp, t0, p);
+  bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n);
+  cy = mpn_sub (bp, bp, n, t1, p + M->n);
+  ASSERT (cy <= bh);
+  bh -= cy;
+
+  if (ah > 0 || bh > 0)
+    {
+      ap[n] = ah;
+      bp[n] = bh;
+      n++;
+    }
+  else
+    {
+      /* The subtraction can reduce the size by at most one limb. */
+      if (ap[n-1] == 0 && bp[n-1] == 0)
+	n--;
+    }
+  ASSERT (ap[n-1] > 0 || bp[n-1] > 0);
+  return n;
+}
diff --git a/third_party/gmp/mpn/generic/hgcd_reduce.c b/third_party/gmp/mpn/generic/hgcd_reduce.c
new file mode 100644
index 0000000..3aee77d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd_reduce.c
@@ -0,0 +1,242 @@
+/* hgcd_reduce.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Computes R -= A * B. Result must be non-negative. Normalized down
+   to size an, and resulting size is returned. */
+static mp_size_t
+submul (mp_ptr rp, mp_size_t rn,
+	mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn)
+{
+  mp_ptr tp;
+  TMP_DECL;
+
+  ASSERT (bn > 0);
+  ASSERT (an >= bn);
+  ASSERT (rn >= an);
+  ASSERT (an + bn <= rn + 1);
+
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS (an + bn);
+
+  mpn_mul (tp, ap, an, bp, bn);
+  ASSERT ((an + bn <= rn) || (tp[rn] == 0));
+  ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn - (an + bn > rn)));
+  TMP_FREE;
+
+  while (rn > an && (rp[rn-1] == 0))
+    rn--;
+
+  return rn;
+}
+
+/* Computes (a, b)  <--  M^{-1} (a; b) */
+/* FIXME:
+    x Take scratch parameter, and figure out scratch need.
+
+    x Use some fallback for small M->n?
+*/
+static mp_size_t
+hgcd_matrix_apply (const struct hgcd_matrix *M,
+		   mp_ptr ap, mp_ptr bp,
+		   mp_size_t n)
+{
+  mp_size_t an, bn, un, vn, nn;
+  mp_size_t mn[2][2];
+  mp_size_t modn;
+  mp_ptr tp, sp, scratch;
+  mp_limb_t cy;
+  unsigned i, j;
+
+  TMP_DECL;
+
+  ASSERT ( (ap[n-1] | bp[n-1]) > 0);
+
+  an = n;
+  MPN_NORMALIZE (ap, an);
+  bn = n;
+  MPN_NORMALIZE (bp, bn);
+
+  for (i = 0; i < 2; i++)
+    for (j = 0; j < 2; j++)
+      {
+	mp_size_t k;
+	k = M->n;
+	MPN_NORMALIZE (M->p[i][j], k);
+	mn[i][j] = k;
+      }
+
+  ASSERT (mn[0][0] > 0);
+  ASSERT (mn[1][1] > 0);
+  ASSERT ( (mn[0][1] | mn[1][0]) > 0);
+
+  TMP_MARK;
+
+  if (mn[0][1] == 0)
+    {
+      /* A unchanged, M = (1, 0; q, 1) */
+      ASSERT (mn[0][0] == 1);
+      ASSERT (M->p[0][0][0] == 1);
+      ASSERT (mn[1][1] == 1);
+      ASSERT (M->p[1][1][0] == 1);
+
+      /* Put B <-- B - q A */
+      nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]);
+    }
+  else if (mn[1][0] == 0)
+    {
+      /* B unchanged, M = (1, q; 0, 1) */
+      ASSERT (mn[0][0] == 1);
+      ASSERT (M->p[0][0][0] == 1);
+      ASSERT (mn[1][1] == 1);
+      ASSERT (M->p[1][1][0] == 1);
+
+      /* Put A  <-- A - q * B */
+      nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]);
+    }
+  else
+    {
+      /* A = m00 a + m01 b  ==> a <= A / m00, b <= A / m01.
+	 B = m10 a + m11 b  ==> a <= B / m10, b <= B / m11. */
+      un = MIN (an - mn[0][0], bn - mn[1][0]) + 1;
+      vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1;
+
+      nn = MAX (un, vn);
+      /* In the range of interest, mulmod_bnm1 should always beat mullo. */
+      modn = mpn_mulmod_bnm1_next_size (nn + 1);
+
+      TMP_ALLOC_LIMBS_3 (tp, modn,
+			 sp, modn,
+			 scratch, mpn_mulmod_bnm1_itch (modn, modn, M->n));
+
+      ASSERT (n <= 2*modn);
+
+      if (n > modn)
+	{
+	  cy = mpn_add (ap, ap, modn, ap + modn, n - modn);
+	  MPN_INCR_U (ap, modn, cy);
+
+	  cy = mpn_add (bp, bp, modn, bp + modn, n - modn);
+	  MPN_INCR_U (bp, modn, cy);
+
+	  n = modn;
+	}
+
+      mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch);
+      mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch);
+
+      /* FIXME: Handle the small n case in some better way. */
+      if (n + mn[1][1] < modn)
+	MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]);
+      if (n + mn[0][1] < modn)
+	MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]);
+
+      cy = mpn_sub_n (tp, tp, sp, modn);
+      MPN_DECR_U (tp, modn, cy);
+
+      ASSERT (mpn_zero_p (tp + nn, modn - nn));
+
+      mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch);
+      MPN_COPY (ap, tp, nn);
+      mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch);
+
+      if (n + mn[1][0] < modn)
+	MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]);
+      if (n + mn[0][0] < modn)
+	MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]);
+
+      cy = mpn_sub_n (tp, tp, sp, modn);
+      MPN_DECR_U (tp, modn, cy);
+
+      ASSERT (mpn_zero_p (tp + nn, modn - nn));
+      MPN_COPY (bp, tp, nn);
+
+      while ( (ap[nn-1] | bp[nn-1]) == 0)
+	{
+	  nn--;
+	  ASSERT (nn > 0);
+	}
+    }
+  TMP_FREE;
+
+  return nn;
+}
+
+mp_size_t
+mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p)
+{
+  mp_size_t itch;
+  if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD))
+    {
+      itch = mpn_hgcd_itch (n-p);
+
+      /* For arbitrary p, the storage for _adjust is 2*(p + M->n) =
+	 2*(p + ceil((n-p)/2) - 1) <= n + p - 1 */
+      if (itch < n + p - 1)
+	itch = n + p - 1;
+    }
+  else
+    {
+      itch = 2*(n-p) + mpn_hgcd_itch (n-p);
+      /* Currently, hgcd_matrix_apply allocates its own storage. */
+    }
+  return itch;
+}
+
+/* FIXME: Document storage need. */
+mp_size_t
+mpn_hgcd_reduce (struct hgcd_matrix *M,
+		 mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p,
+		 mp_ptr tp)
+{
+  mp_size_t nn;
+  if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD))
+    {
+      nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp);
+      if (nn > 0)
+	/* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1)
+	   = 2 (n - 1) */
+	return mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp);
+    }
+  else
+    {
+      MPN_COPY (tp, ap + p, n - p);
+      MPN_COPY (tp + n - p, bp + p, n - p);
+      if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p)))
+	return hgcd_matrix_apply (M, ap, bp, n);
+    }
+  return 0;
+}
diff --git a/third_party/gmp/mpn/generic/hgcd_step.c b/third_party/gmp/mpn/generic/hgcd_step.c
new file mode 100644
index 0000000..a978a88
--- /dev/null
+++ b/third_party/gmp/mpn/generic/hgcd_step.c
@@ -0,0 +1,127 @@
+/* hgcd_step.c.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+static void
+hgcd_hook (void *p, mp_srcptr gp, mp_size_t gn,
+	   mp_srcptr qp, mp_size_t qn, int d)
+{
+  ASSERT (!gp);
+  ASSERT (d >= 0);
+  ASSERT (d <= 1);
+
+  MPN_NORMALIZE (qp, qn);
+  if (qn > 0)
+    {
+      struct hgcd_matrix *M = (struct hgcd_matrix *) p;
+      /* NOTES: This is a bit ugly. A tp area is passed to
+	 gcd_subdiv_step, which stores q at the start of that area. We
+	 now use the rest. */
+      mp_ptr tp = (mp_ptr) qp + qn;
+      mpn_hgcd_matrix_update_q (M, qp, qn, d, tp);
+    }
+}
+
+/* Perform a few steps, using some of mpn_hgcd2, subtraction and
+   division. Reduces the size by almost one limb or more, but never
+   below the given size s. Return new size for a and b, or 0 if no
+   more steps are possible.
+
+   If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n
+   limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2
+   fails, needs space for the quotient, qn <= n - s limbs, and for
+   hgcd_matrix_update_q, qn + (size of the appropriate column of M) <=
+   (resulting size of M) + 1.
+
+   If N is the input size to the calling hgcd, then s = floor(N/2) +
+   1, M->n < N, qn + product size <= n - s + n - s + 1 = 2 (n - s) + 1
+   <= N.
+*/
+
+mp_size_t
+mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s,
+	       struct hgcd_matrix *M, mp_ptr tp)
+{
+  struct hgcd_matrix1 M1;
+  mp_limb_t mask;
+  mp_limb_t ah, al, bh, bl;
+
+  ASSERT (n > s);
+
+  mask = ap[n-1] | bp[n-1];
+  ASSERT (mask > 0);
+
+  if (n == s + 1)
+    {
+      if (mask < 4)
+	goto subtract;
+
+      ah = ap[n-1]; al = ap[n-2];
+      bh = bp[n-1]; bl = bp[n-2];
+    }
+  else if (mask & GMP_NUMB_HIGHBIT)
+    {
+      ah = ap[n-1]; al = ap[n-2];
+      bh = bp[n-1]; bl = bp[n-2];
+    }
+  else
+    {
+      int shift;
+
+      count_leading_zeros (shift, mask);
+      ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+      al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+      bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+      bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+    }
+
+  /* Try an mpn_hgcd2 step */
+  if (mpn_hgcd2 (ah, al, bh, bl, &M1))
+    {
+      /* Multiply M <- M * M1 */
+      mpn_hgcd_matrix_mul_1 (M, &M1, tp);
+
+      /* Can't swap inputs, so we need to copy. */
+      MPN_COPY (tp, ap, n);
+      /* Multiply M1^{-1} (a;b) */
+      return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n);
+    }
+
+ subtract:
+
+  return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp);
+}
diff --git a/third_party/gmp/mpn/generic/invert.c b/third_party/gmp/mpn/generic/invert.c
new file mode 100644
index 0000000..157ff2b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/invert.c
@@ -0,0 +1,86 @@
+/* invert.c -- Compute floor((B^{2n}-1)/U) - B^n.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright (C) 2007, 2009, 2010, 2012, 2014-2016 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
+{
+  ASSERT (n > 0);
+  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
+  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
+  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
+
+  if (n == 1)
+    invert_limb (*ip, *dp);
+  else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD))
+    {
+	/* Maximum scratch needed by this branch: 2*n */
+	mp_ptr xp;
+
+	xp = scratch;				/* 2 * n limbs */
+	/* n > 1 here */
+	MPN_FILL (xp, n, GMP_NUMB_MAX);
+	mpn_com (xp + n, dp, n);
+	if (n == 2) {
+	  mpn_divrem_2 (ip, 0, xp, 4, dp);
+	} else {
+	  gmp_pi1_t inv;
+	  invert_pi1 (inv, dp[n-1], dp[n-2]);
+	  /* FIXME: should we use dcpi1_div_q, for big sizes? */
+	  mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32);
+	}
+    }
+  else { /* Use approximated inverse; correct the result if needed. */
+      mp_limb_t e; /* The possible error in the approximate inverse */
+
+      ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) );
+      e = mpn_ni_invertappr (ip, dp, n, scratch);
+
+      if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */
+	/* Code to detect and correct the "off by one" approximation. */
+	mpn_mul_n (scratch, ip, dp, n);
+	e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/
+	if (LIKELY(e)) /* The high part can not give a carry by itself. */
+	  e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */
+	/* If the value was wrong (no carry), correct it (increment). */
+	e ^= CNST_LIMB (1);
+	MPN_INCR_U (ip, n, e);
+      }
+  }
+}
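+
+/* Illustrative sketch, not part of GMP: the intended calling pattern,
+   sizing the scratch area with mpn_invert_itch (referenced in the
+   ASSERT above):
+
+     mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_invert_itch (n));
+     mpn_invert (ip, dp, n, scratch);
+
+   after which {dp,n} * (B^n + {ip,n}) < B^{2n} <= {dp,n} * (B^n + {ip,n} + 1)
+   holds, i.e. {ip,n} is the exact reciprocal described above.  */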
diff --git a/third_party/gmp/mpn/generic/invertappr.c b/third_party/gmp/mpn/generic/invertappr.c
new file mode 100644
index 0000000..3be5596
--- /dev/null
+++ b/third_party/gmp/mpn/generic/invertappr.c
@@ -0,0 +1,300 @@
+/* mpn_invertappr and helper functions.  Compute I such that
+   floor((B^{2n}-1)/U) - 1 <= I + B^n <= floor((B^{2n}-1)/U).
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   The algorithm used here was inspired by ApproximateReciprocal from "Modern
+   Computer Arithmetic", by Richard P. Brent and Paul Zimmermann.  Special
+   thanks to Paul Zimmermann for his very valuable suggestions on all the
+   theoretical aspects during the work on this code.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* FIXME: The iterative version splits the operand into two slightly
+   unbalanced parts; the use of log_2 (or counting the bits) underestimates
+   the maximum number of iterations.  */
+
+#if TUNE_PROGRAM_BUILD
+#define NPOWS \
+ ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)))
+#define MAYBE_dcpi1_divappr   1
+#else
+#define NPOWS \
+ ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD))
+#define MAYBE_dcpi1_divappr \
+  (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD)
+#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \
+    (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD)
+#undef  INV_MULMOD_BNM1_THRESHOLD
+#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */
+#endif
+#endif
+
+/* All three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch) take
+   the strictly normalised value {dp,n} (i.e., the most significant bit must
+   be set) as input, and compute {ip,n}: the approximate reciprocal of {dp,n}.
+
+   Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the
+   following conditions are satisfied by the output:
+     0 <= e <= 1;
+     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) .
+   I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert.
+	e=1 means that the result _may_ be one less than expected.
+
+   The _bc version returns e=1 most of the time.
+   The _ni version should return e=0 most of the time; only about 1% of
+   possible random input should give e=1.
+
+   When the strict result is needed, i.e., e=0 in the relation above:
+     {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ;
+   the function mpn_invert (ip, dp, n, scratch) should be used instead.  */
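+
+/* Worked toy example, for intuition only (not from the GMP sources):
+   with B = 10 and n = 1, take U = {dp,1} = {8} (normalised, since
+   8 >= B/2).  Then floor((B^2 - 1)/U) = floor(99/8) = 12, so the exact
+   reciprocal is I = 12 - B = 2, and indeed
+     8 * (10 + 2) = 96 < 100 <= 8 * (10 + 2 + 1) = 104,
+   which is the e = 0 relation above.  */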
+
+/* Maximum scratch needed by this branch (at xp): 2*n */
+static mp_limb_t
+mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp)
+{
+  ASSERT (n > 0);
+  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
+  ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n)));
+  ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n)));
+
+  /* Compute a base value of n limbs. */
+  if (n == 1)
+    invert_limb (*ip, *dp);
+  else {
+    /* n > 1 here */
+    MPN_FILL (xp, n, GMP_NUMB_MAX);
+    mpn_com (xp + n, dp, n);
+
+    /* Now xp contains B^2n - {dp,n}*B^n - 1 */
+
+    /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */
+    if (n == 2) {
+      mpn_divrem_2 (ip, 0, xp, 4, dp);
+    } else {
+      gmp_pi1_t inv;
+      invert_pi1 (inv, dp[n-1], dp[n-2]);
+      if (! MAYBE_dcpi1_divappr
+	  || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD))
+	mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32);
+      else
+	mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv);
+      MPN_DECR_U(ip, n, CNST_LIMB (1));
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's
+   iterations (at least one).
+
+   Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer
+   Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121
+   in version 0.4 of the book.
+
+   Some adaptations were introduced, to allow product mod B^m-1 and return the
+   value e.
+
+   We introduced a correction in such a way that "the value of
+   B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads
+   "2B^n-1").
+
+   Maximum scratch needed by this branch is <= 2*n, but we have to fit
+   3*rn in the scratch, i.e. 3*rn <= 2*n; hence we require n > 4.
+
+   We use a wrapped product modulo B^m-1.  NOTE: is there any normalisation
+   problem for the [0] class?  It shouldn't: we compute 2*|A*X_h - B^{n+h}| <
+   B^m-1.  We may get [0] if and only if we get AX_h = B^{n+h}.  This can
+   happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h =
+   B^{n+h} - A, then we get into the "negative" branch, where X_h is not
+   incremented (because A < B^n).
+
+   FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it
+   is allocated apart.
+ */
+
+mp_limb_t
+mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
+{
+  mp_limb_t cy;
+  mp_size_t rn, mn;
+  mp_size_t sizes[NPOWS], *sizp;
+  mp_ptr tp;
+  TMP_DECL;
+#define xp scratch
+
+  ASSERT (n > 4);
+  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
+  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
+  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
+
+  /* Compute the computation precisions from highest to lowest, leaving the
+     base case size in 'rn'.  */
+  sizp = sizes;
+  rn = n;
+  do {
+    *sizp = rn;
+    rn = (rn >> 1) + 1;
+    ++sizp;
+  } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD));
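+
+  /* Illustrative trace, not part of GMP: with n = 1000 and an assumed
+     INV_NEWTON_THRESHOLD of 170, the loop records sizes = {1000, 501,
+     251} and leaves rn = 126 for the base case; the iteration below
+     then replays the sizes in reverse: 126 -> 251 -> 501 -> 1000. */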
+
+  /* We seek the inverse of 0.{dp,n}; we compute it as 1.{ip,n} */
+  dp += n;
+  ip += n;
+
+  /* Compute a base value of rn limbs. */
+  mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch);
+
+  TMP_MARK;
+
+  if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD))
+    {
+      mn = mpn_mulmod_bnm1_next_size (n + 1);
+      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1));
+    }
+  /* Use Newton's iterations to get the desired precision. */
+
+  while (1) {
+    n = *--sizp;
+    /*
+      v    n  v
+      +----+--+
+      ^ rn ^
+    */
+
+    /* Compute i_jd . */
+    if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)
+	|| ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) {
+      /* FIXME: We only need {xp,n+1} */
+      mpn_mul (xp, dp - n, n, ip - rn, rn);
+      mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1);
+      cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */
+      /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */
+    } else { /* Use B^mn-1 wraparound */
+      mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp);
+      /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */
+      /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */
+      /* Add dp*B^rn mod (B^mn-1) */
+      ASSERT (n >= mn - rn);
+      cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn);
+      cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy);
+      /* Subtract B^{rn+n}, maybe only compensate the carry*/
+      xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */
+      MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy);
+      MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */
+      cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */
+    }
+
+    if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */
+      cy = xp[n]; /* 0 <= cy <= 1 here. */
+#if HAVE_NATIVE_mpn_sublsh1_n
+      if (cy++) {
+	if (mpn_cmp (xp, dp - n, n) > 0) {
+	  mp_limb_t chk;
+	  chk = mpn_sublsh1_n (xp, xp, dp - n, n);
+	  ASSERT (chk == xp[n]);
+	  ++ cy;
+	} else
+	  ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
+      }
+#else /* no mpn_sublsh1_n*/
+      if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) {
+	ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n));
+	++cy;
+      }
+#endif
+      /* 1 <= cy <= 3 here. */
+#if HAVE_NATIVE_mpn_rsblsh1_n
+      if (mpn_cmp (xp, dp - n, n) > 0) {
+	ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n));
+	++cy;
+      } else
+	ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
+#else /* no mpn_rsblsh1_n*/
+      if (mpn_cmp (xp, dp - n, n) > 0) {
+	ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n));
+	++cy;
+      }
+      ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0));
+#endif
+      MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */
+    } else { /* "negative" residue class */
+      ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1));
+      MPN_DECR_U(xp, n + 1, cy);
+      if (xp[n] != GMP_NUMB_MAX) {
+	MPN_INCR_U(ip - rn, rn, CNST_LIMB (1));
+	ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n));
+      }
+      mpn_com (xp + 2 * n - rn, xp + n - rn, rn);
+    }
+
+    /* Compute x_ju_j. FIXME: We need {xp+rn,rn}, mulhi? */
+    mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn);
+    cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n);
+    cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy);
+    MPN_INCR_U (ip - rn, rn, cy);
+    if (sizp == sizes) { /* Get out of the cycle */
+      /* Check for possible carry propagation from below. */
+      cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */
+      /*    cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */
+      break;
+    }
+    rn = n;
+  }
+  TMP_FREE;
+
+  return cy;
+#undef xp
+}
+
+mp_limb_t
+mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch)
+{
+  ASSERT (n > 0);
+  ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT);
+  ASSERT (! MPN_OVERLAP_P (ip, n, dp, n));
+  ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n)));
+  ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n)));
+
+  if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD))
+    return mpn_bc_invertappr (ip, dp, n, scratch);
+  else
+    return mpn_ni_invertappr (ip, dp, n, scratch);
+}
diff --git a/third_party/gmp/mpn/generic/jacbase.c b/third_party/gmp/mpn/generic/jacbase.c
new file mode 100644
index 0000000..735ad7a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/jacbase.c
@@ -0,0 +1,242 @@
+/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments.
+
+   THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO
+   INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP.
+
+Copyright 1999-2002, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Use the simple loop by default.  The generic count_trailing_zeros is not
+   very fast, and the extra trickery of method 3 has proven to be of less
+   use than might have been thought.  */
+#ifndef JACOBI_BASE_METHOD
+#define JACOBI_BASE_METHOD  2
+#endif
+
+
+/* Use count_trailing_zeros.  */
+#if JACOBI_BASE_METHOD == 1
+#define PROCESS_TWOS_ANY                                \
+  {                                                     \
+    mp_limb_t  twos;                                    \
+    count_trailing_zeros (twos, a);                     \
+    result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b);        \
+    a >>= twos;                                         \
+  }
+#define PROCESS_TWOS_EVEN  PROCESS_TWOS_ANY
+#endif
+
+/* Use a simple loop.  A disadvantage of this is that there's a branch on a
+   50/50 chance of a 0 or 1 low bit.  */
+#if JACOBI_BASE_METHOD == 2
+#define PROCESS_TWOS_EVEN               \
+  {                                     \
+    int  two;                           \
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    do                                  \
+      {                                 \
+	a >>= 1;                        \
+	result_bit1 ^= two;             \
+	ASSERT (a != 0);                \
+      }                                 \
+    while ((a & 1) == 0);               \
+  }
+#define PROCESS_TWOS_ANY        \
+  if ((a & 1) == 0)             \
+    PROCESS_TWOS_EVEN;
+#endif
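+
+/* Background note (standard fact, not from the GMP sources): each factor
+   of two removed from a contributes a factor (2|b), which is +1 for
+   b = 1, 7 (mod 8) and -1 for b = 3, 5 (mod 8).  The JACOBI_TWOS_U_BIT1
+   and JACOBI_TWO_U_BIT1 macros used above are assumed to encode exactly
+   this sign in bit 1, following the BIT1 convention mentioned below. */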
+
+/* Process one bit arithmetically, then a simple loop.  This cuts the loop
+   condition down to a 25/75 chance, which should branch predict better.
+   The CPU will need a reasonable variable left shift.  */
+#if JACOBI_BASE_METHOD == 3
+#define PROCESS_TWOS_EVEN               \
+  {                                     \
+    int  two, mask, shift;              \
+					\
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    mask = (~a & 2);                    \
+    a >>= 1;                            \
+					\
+    shift = (~a & 1);                   \
+    a >>= shift;                        \
+    result_bit1 ^= two ^ (two & mask);  \
+					\
+    while ((a & 1) == 0)                \
+      {                                 \
+	a >>= 1;                        \
+	result_bit1 ^= two;             \
+	ASSERT (a != 0);                \
+      }                                 \
+  }
+#define PROCESS_TWOS_ANY                \
+  {                                     \
+    int  two, mask, shift;              \
+					\
+    two = JACOBI_TWO_U_BIT1 (b);        \
+    shift = (~a & 1);                   \
+    a >>= shift;                        \
+					\
+    mask = shift << 1;                  \
+    result_bit1 ^= (two & mask);        \
+					\
+    while ((a & 1) == 0)                \
+      {                                 \
+	a >>= 1;                        \
+	result_bit1 ^= two;             \
+	ASSERT (a != 0);                \
+      }                                 \
+  }
+#endif
+
+#if JACOBI_BASE_METHOD < 4
+/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but
+   with a restricted range of inputs accepted, namely b>1, b odd.
+
+   The initial result_bit1 is taken as a parameter for the convenience of
+   mpz_kronecker_ui() et al.  The sign changes both here and in those
+   routines accumulate nicely in bit 1, see the JACOBI macros.
+
+   The return value here is the normal +1, 0, or -1.  Note that +1 and -1
+   have bit 1 in the "BIT1" sense, which could be useful if the caller is
+   accumulating it into some extended calculation.
+
+   Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be
+   possible, but a couple of tests suggest it's not a significant speedup,
+   and may even be a slowdown, so what's here is good enough for now. */
+
+int
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1)
+{
+  ASSERT (b & 1);  /* b odd */
+  ASSERT (b != 1);
+
+  if (a == 0)
+    return 0;
+
+  PROCESS_TWOS_ANY;
+  if (a == 1)
+    goto done;
+
+  if (a >= b)
+    goto a_gt_b;
+
+  for (;;)
+    {
+      result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b);
+      MP_LIMB_T_SWAP (a, b);
+
+    a_gt_b:
+      do
+	{
+	  /* working on (a/b), a,b odd, a>=b */
+	  ASSERT (a & 1);
+	  ASSERT (b & 1);
+	  ASSERT (a >= b);
+
+	  if ((a -= b) == 0)
+	    return 0;
+
+	  PROCESS_TWOS_EVEN;
+	  if (a == 1)
+	    goto done;
+	}
+      while (a >= b);
+    }
+
+ done:
+  return JACOBI_BIT1_TO_PN (result_bit1);
+}
+#endif
+
+#if JACOBI_BASE_METHOD == 4
+/* Computes (a/b) for odd b > 1 and any a. The initial bit is taken as a
+ * parameter. We have no need for the convention that the sign is in
+ * bit 1; internally we use bit 0. */
+
+/* FIXME: Could try table-based count_trailing_zeros. */
+int
+mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit)
+{
+  int c;
+
+  ASSERT (b & 1);
+  ASSERT (b > 1);
+
+  if (a == 0)
+    /* This is the only line which depends on b > 1 */
+    return 0;
+
+  bit >>= 1;
+
+  /* Below, we represent a and b shifted right so that the least
+     significant one bit is implicit. */
+
+  b >>= 1;
+
+  count_trailing_zeros (c, a);
+  bit ^= c & (b ^ (b >> 1));
+
+  /* We may have c==GMP_LIMB_BITS-1, so we can't use a>>c+1. */
+  a >>= c;
+  a >>= 1;
+
+  do
+    {
+      mp_limb_t t = a - b;
+      mp_limb_t bgta = LIMB_HIGHBIT_TO_MASK (t);
+
+      if (t == 0)
+	return 0;
+
+      /* If b > a, invoke reciprocity */
+      bit ^= (bgta & a & b);
+
+      /* b <-- min (a, b) */
+      b += (bgta & t);
+
+      /* a <-- |a - b| */
+      a = (t ^ bgta) - bgta;
+
+      /* Number of trailing zeros is the same no matter if we look at
+       * t or a, but using t gives more parallelism. */
+      count_trailing_zeros (c, t);
+      c ++;
+      /* (2/b) = -1 if b = 3 or 5 mod 8 */
+      bit ^= c & (b ^ (b >> 1));
+      a >>= c;
+    }
+  while (b > 0);
+
+  return 1-2*(bit & 1);
+}
+#endif /* JACOBI_BASE_METHOD == 4 */
diff --git a/third_party/gmp/mpn/generic/jacobi.c b/third_party/gmp/mpn/generic/jacobi.c
new file mode 100644
index 0000000..d98b126
--- /dev/null
+++ b/third_party/gmp/mpn/generic/jacobi.c
@@ -0,0 +1,294 @@
+/* jacobi.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2010, 2011 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef JACOBI_DC_THRESHOLD
+#define JACOBI_DC_THRESHOLD GCD_DC_THRESHOLD
+#endif
+
+/* Schönhage's rules:
+ *
+ * Assume r0 = r1 q1 + r2, with r0 odd, and r1 = q2 r2 + r3
+ *
+ * If r1 is odd, then
+ *
+ *   (r1 | r0) = s(r1, r0) (r0 | r1) = s(r1, r0) (r2 | r1)
+ *
+ * where s(x,y) = (-1)^{(x-1)(y-1)/4} = (-1)^[x = y = 3 (mod 4)].
+ *
+ * If r1 is even, r2 must be odd. We have
+ *
+ *   (r1 | r0) = (r1 - r0 | r0) = (-1)^{(r0-1)/2} (r0 - r1 | r0)
+ *             = (-1)^{(r0-1)/2} s(r0, r0 - r1) (r0 | r0 - r1)
+ *             = (-1)^{(r0-1)/2} s(r0, r0 - r1) (r1 | r0 - r1)
+ *
+ * Now, if r1 = 0 (mod 4), then the sign factor is +1, and repeating
+ * q1 times gives
+ *
+ *   (r1 | r0) = (r1 | r2) = (r3 | r2)
+ *
+ * On the other hand, if r1 = 2 (mod 4), the sign factor is
+ * (-1)^{(r0-1)/2}, and repeating q1 times gives the exponent
+ *
+ *   (r0-1)/2 + (r0-r1-1)/2 + ... + (r0 - (q1-1) r1 - 1)/2
+ *   = q1 (r0-1)/2 + q1 (q1-1)/2
+ *
+ * and we can summarize the even case as
+ *
+ *   (r1 | r0) = t(r1, r0, q1) (r3 | r2)
+ *
+ * where t(x,y,q) = (-1)^{[x = 2 (mod 4)] (q(y-1)/2 + y(q-1)/2)}
+ *
+ * What about termination? The remainder sequence ends with (0|1) = 1
+ * (or (0 | r) = 0 if r != 1). What are the possible cases? If r1 is
+ * odd, r2 may be zero. If r1 is even, then r2 = r0 - q1 r1 is odd and
+ * hence non-zero. We may have r3 = r1 - q2 r2 = 0.
+ *
+ * Examples: (11|15) = - (15|11) = - (4|11)
+ *            (4|11) =    (4| 3) =   (1| 3)
+ *            (1| 3) = (3|1) = (0|1) = 1
+ *
+ *             (2|7) = (2|1) = (0|1) = 1
+ *
+ * Detail:     (2|7) = (2-7|7) = (-1|7)(5|7) = -(7|5) = -(2|5)
+ *             (2|5) = (2-5|5) = (-1|5)(3|5) =  (5|3) =  (2|3)
+ *             (2|3) = (2-3|3) = (-1|3)(1|3) = -(3|1) = -(2|1)
+ *
+ */
+
+/* In principle, the state consists of four variables: e (one bit), a,
+   b (two bits each), d (one bit). Collected factors are (-1)^e. a and
+   b are the least significant bits of the current remainders. d
+   (denominator) is 0 if we're currently subtracting multiples of a
+   from b, and 1 if we're subtracting b from a.
+
+   e is stored in the least significant bit, while a, b and d are
+   coded as only 13 distinct values in bits 1-4, according to the
+   following table. For rows not mentioning d, the value is either
+   implied, or it doesn't matter. */
+
+#if WANT_ASSERT
+static const struct
+{
+  unsigned char a;
+  unsigned char b;
+} decode_table[13] = {
+  /*  0 */ { 0, 1 },
+  /*  1 */ { 0, 3 },
+  /*  2 */ { 1, 1 },
+  /*  3 */ { 1, 3 },
+  /*  4 */ { 2, 1 },
+  /*  5 */ { 2, 3 },
+  /*  6 */ { 3, 1 },
+  /*  7 */ { 3, 3 }, /* d = 1 */
+  /*  8 */ { 1, 0 },
+  /*  9 */ { 1, 2 },
+  /* 10 */ { 3, 0 },
+  /* 11 */ { 3, 2 },
+  /* 12 */ { 3, 3 }, /* d = 0 */
+};
+#define JACOBI_A(bits) (decode_table[(bits)>>1].a)
+#define JACOBI_B(bits) (decode_table[(bits)>>1].b)
+#endif /* WANT_ASSERT */
+
+const unsigned char jacobi_table[208] = {
+#include "jacobitab.h"
+};
+
+#define BITS_FAIL 31
+
+static void
+jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn,
+	     mp_srcptr qp, mp_size_t qn, int d)
+{
+  unsigned *bitsp = (unsigned *) p;
+
+  if (gp)
+    {
+      ASSERT (gn > 0);
+      if (gn != 1 || gp[0] != 1)
+	{
+	  *bitsp = BITS_FAIL;
+	  return;
+	}
+    }
+
+  if (qp)
+    {
+      ASSERT (qn > 0);
+      ASSERT (d >= 0);
+      *bitsp = mpn_jacobi_update (*bitsp, d, qp[0] & 3);
+    }
+}
+
+#define CHOOSE_P(n) (2*(n) / 3)
+
+int
+mpn_jacobi_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned bits)
+{
+  mp_size_t scratch;
+  mp_size_t matrix_scratch;
+  mp_ptr tp;
+
+  TMP_DECL;
+
+  ASSERT (n > 0);
+  ASSERT ( (ap[n-1] | bp[n-1]) > 0);
+  ASSERT ( (bp[0] | ap[0]) & 1);
+
+  /* FIXME: Check for small sizes first, before setting up temporary
+     storage etc. */
+  scratch = MPN_GCD_SUBDIV_STEP_ITCH(n);
+
+  if (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD))
+    {
+      mp_size_t hgcd_scratch;
+      mp_size_t update_scratch;
+      mp_size_t p = CHOOSE_P (n);
+      mp_size_t dc_scratch;
+
+      matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+      hgcd_scratch = mpn_hgcd_itch (n - p);
+      update_scratch = p + n - 1;
+
+      dc_scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch);
+      if (dc_scratch > scratch)
+	scratch = dc_scratch;
+    }
+
+  TMP_MARK;
+  tp = TMP_ALLOC_LIMBS(scratch);
+
+  while (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD))
+    {
+      struct hgcd_matrix M;
+      mp_size_t p = 2*n/3;
+      mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p);
+      mp_size_t nn;
+      mpn_hgcd_matrix_init (&M, n - p, tp);
+
+      nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M, &bits,
+			    tp + matrix_scratch);
+      if (nn > 0)
+	{
+	  ASSERT (M.n <= (n - p - 1)/2);
+	  ASSERT (M.n + p <= (p + n - 1) / 2);
+	  /* Temporary storage 2 (p + M->n) <= p + n - 1. */
+	  n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch);
+	}
+      else
+	{
+	  /* Temporary storage n */
+	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, jacobi_hook, &bits, tp);
+	  if (!n)
+	    {
+	      TMP_FREE;
+	      return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits);
+	    }
+	}
+    }
+
+  while (n > 2)
+    {
+      struct hgcd_matrix1 M;
+      mp_limb_t ah, al, bh, bl;
+      mp_limb_t mask;
+
+      mask = ap[n-1] | bp[n-1];
+      ASSERT (mask > 0);
+
+      if (mask & GMP_NUMB_HIGHBIT)
+	{
+	  ah = ap[n-1]; al = ap[n-2];
+	  bh = bp[n-1]; bl = bp[n-2];
+	}
+      else
+	{
+	  int shift;
+
+	  count_leading_zeros (shift, mask);
+	  ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]);
+	  al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]);
+	  bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]);
+	  bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]);
+	}
+
+      /* Try an mpn_hgcd2 step */
+      if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M, &bits))
+	{
+	  n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n);
+	  MP_PTR_SWAP (ap, tp);
+	}
+      else
+	{
+	  /* mpn_hgcd2 has failed. Then either one of a or b is very
+	     small, or the difference is very small. Perform one
+	     subtraction followed by one division. */
+	  n = mpn_gcd_subdiv_step (ap, bp, n, 0, &jacobi_hook, &bits, tp);
+	  if (!n)
+	    {
+	      TMP_FREE;
+	      return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits);
+	    }
+	}
+    }
+
+  if (bits >= 16)
+    MP_PTR_SWAP (ap, bp);
+
+  ASSERT (bp[0] & 1);
+
+  if (n == 1)
+    {
+      mp_limb_t al, bl;
+      al = ap[0];
+      bl = bp[0];
+
+      TMP_FREE;
+      if (bl == 1)
+	return 1 - 2*(bits & 1);
+      else
+	return mpn_jacobi_base (al, bl, bits << 1);
+    }
+
+  else
+    {
+      int res = mpn_jacobi_2 (ap, bp, bits & 1);
+      TMP_FREE;
+      return res;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/jacobi_2.c b/third_party/gmp/mpn/generic/jacobi_2.c
new file mode 100644
index 0000000..028b8a4
--- /dev/null
+++ b/third_party/gmp/mpn/generic/jacobi_2.c
@@ -0,0 +1,351 @@
+/* jacobi_2.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1996, 1998, 2000-2004, 2008, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef JACOBI_2_METHOD
+#define JACOBI_2_METHOD 2
+#endif
+
+/* Computes (a / b) where b is odd, and a and b are otherwise arbitrary
+   two-limb numbers. */
+#if JACOBI_2_METHOD == 1
+int
+mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit)
+{
+  mp_limb_t ah, al, bh, bl;
+  int c;
+
+  al = ap[0];
+  ah = ap[1];
+  bl = bp[0];
+  bh = bp[1];
+
+  ASSERT (bl & 1);
+
+  bl = ((bh << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK) | (bl >> 1);
+  bh >>= 1;
+
+  if ( (bh | bl) == 0)
+    return 1 - 2*(bit & 1);
+
+  if ( (ah | al) == 0)
+    return 0;
+
+  if (al == 0)
+    {
+      al = ah;
+      ah = 0;
+      bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1));
+    }
+  count_trailing_zeros (c, al);
+  bit ^= c & (bl ^ (bl >> 1));
+
+  c++;
+  if (UNLIKELY (c == GMP_NUMB_BITS))
+    {
+      al = ah;
+      ah = 0;
+    }
+  else
+    {
+      al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+      ah >>= c;
+    }
+  while ( (ah | bh) > 0)
+    {
+      mp_limb_t th, tl;
+      mp_limb_t bgta;
+
+      sub_ddmmss (th, tl, ah, al, bh, bl);
+      if ( (tl | th) == 0)
+	return 0;
+
+      bgta = LIMB_HIGHBIT_TO_MASK (th);
+
+      /* If b > a, invoke reciprocity */
+      bit ^= (bgta & al & bl);
+
+      /* b <-- min (a, b) */
+      add_ssaaaa (bh, bl, bh, bl, th & bgta, tl & bgta);
+
+      if ( (bh | bl) == 0)
+	return 1 - 2*(bit & 1);
+
+      /* a <-- |a - b| */
+      al = (bgta ^ tl) - bgta;
+      ah = (bgta ^ th);
+
+      if (UNLIKELY (al == 0))
+	{
+	  /* If b > a, al == 0 implies that we have a carry to
+	     propagate. */
+	  al = ah - bgta;
+	  ah = 0;
+	  bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1));
+	}
+      count_trailing_zeros (c, al);
+      c++;
+      bit ^= c & (bl ^ (bl >> 1));
+
+      if (UNLIKELY (c == GMP_NUMB_BITS))
+	{
+	  al = ah;
+	  ah = 0;
+	}
+      else
+	{
+	  al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+	  ah >>= c;
+	}
+    }
+
+  ASSERT (bl > 0);
+
+  while ( (al | bl) & GMP_LIMB_HIGHBIT)
+    {
+      /* Need an extra comparison to get the mask. */
+      mp_limb_t t = al - bl;
+      mp_limb_t bgta = - (bl > al);
+
+      if (t == 0)
+	return 0;
+
+      /* If b > a, invoke reciprocity */
+      bit ^= (bgta & al & bl);
+
+      /* b <-- min (a, b) */
+      bl += (bgta & t);
+
+      /* a <-- |a - b| */
+      al = (t ^ bgta) - bgta;
+
+      /* Number of trailing zeros is the same no matter if we look at
+       * t or a, but using t gives more parallelism. */
+      count_trailing_zeros (c, t);
+      c ++;
+      /* (2/b) = -1 if b = 3 or 5 mod 8 */
+      bit ^= c & (bl ^ (bl >> 1));
+
+      if (UNLIKELY (c == GMP_NUMB_BITS))
+	return 1 - 2*(bit & 1);
+
+      al >>= c;
+    }
+
+  /* Here we have a little impedance mismatch. Better to inline it? */
+  return mpn_jacobi_base (2*al+1, 2*bl+1, bit << 1);
+}
+#elif JACOBI_2_METHOD == 2
+int
+mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit)
+{
+  mp_limb_t ah, al, bh, bl;
+  int c;
+
+  al = ap[0];
+  ah = ap[1];
+  bl = bp[0];
+  bh = bp[1];
+
+  ASSERT (bl & 1);
+
+  /* Use bit 1. */
+  bit <<= 1;
+
+  if (bh == 0 && bl == 1)
+    /* (a|1) = 1 */
+    return 1 - (bit & 2);
+
+  if (al == 0)
+    {
+      if (ah == 0)
+	/* (0|b) = 0, b > 1 */
+	return 0;
+
+      count_trailing_zeros (c, ah);
+      bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
+
+      al = bl;
+      bl = ah >> c;
+
+      if (bl == 1)
+	/* (1|b) = 1 */
+	return 1 - (bit & 2);
+
+      ah = bh;
+
+      bit ^= al & bl;
+
+      goto b_reduced;
+    }
+  if ( (al & 1) == 0)
+    {
+      count_trailing_zeros (c, al);
+
+      al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+      ah >>= c;
+      bit ^= (c << 1) & (bl ^ (bl >> 1));
+    }
+  if (ah == 0)
+    {
+      if (bh > 0)
+	{
+	  bit ^= al & bl;
+	  MP_LIMB_T_SWAP (al, bl);
+	  ah = bh;
+	  goto b_reduced;
+	}
+      goto ab_reduced;
+    }
+
+  while (bh > 0)
+    {
+      /* Compute (a|b) */
+      while (ah > bh)
+	{
+	  sub_ddmmss (ah, al, ah, al, bh, bl);
+	  if (al == 0)
+	    {
+	      count_trailing_zeros (c, ah);
+	      bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
+
+	      al = bl;
+	      bl = ah >> c;
+	      ah = bh;
+
+	      bit ^= al & bl;
+	      goto b_reduced;
+	    }
+	  count_trailing_zeros (c, al);
+	  bit ^= (c << 1) & (bl ^ (bl >> 1));
+	  al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+	  ah >>= c;
+	}
+      if (ah == bh)
+	goto cancel_hi;
+
+      if (ah == 0)
+	{
+	  bit ^= al & bl;
+	  MP_LIMB_T_SWAP (al, bl);
+	  ah = bh;
+	  break;
+	}
+
+      bit ^= al & bl;
+
+      /* Compute (b|a) */
+      while (bh > ah)
+	{
+	  sub_ddmmss (bh, bl, bh, bl, ah, al);
+	  if (bl == 0)
+	    {
+	      count_trailing_zeros (c, bh);
+	      bit ^= ((GMP_NUMB_BITS + c) << 1) & (al ^ (al >> 1));
+
+	      bl = bh >> c;
+	      bit ^= al & bl;
+	      goto b_reduced;
+	    }
+	  count_trailing_zeros (c, bl);
+	  bit ^= (c << 1) & (al ^ (al >> 1));
+	  bl = ((bh << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (bl >> c);
+	  bh >>= c;
+	}
+      bit ^= al & bl;
+
+      /* Compute (a|b) */
+      if (ah == bh)
+	{
+	cancel_hi:
+	  if (al < bl)
+	    {
+	      MP_LIMB_T_SWAP (al, bl);
+	      bit ^= al & bl;
+	    }
+	  al -= bl;
+	  if (al == 0)
+	    return 0;
+
+	  count_trailing_zeros (c, al);
+	  bit ^= (c << 1) & (bl ^ (bl >> 1));
+	  al >>= c;
+
+	  if (al == 1)
+	    return 1 - (bit & 2);
+
+	  MP_LIMB_T_SWAP (al, bl);
+	  bit ^= al & bl;
+	  break;
+	}
+    }
+
+ b_reduced:
+  /* Compute (a|b), with b a single limb. */
+  ASSERT (bl & 1);
+
+  if (bl == 1)
+    /* (a|1) = 1 */
+    return 1 - (bit & 2);
+
+  while (ah > 0)
+    {
+      ah -= (al < bl);
+      al -= bl;
+      if (al == 0)
+	{
+	  if (ah == 0)
+	    return 0;
+	  count_trailing_zeros (c, ah);
+	  bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1));
+	  al = ah >> c;
+	  goto ab_reduced;
+	}
+      count_trailing_zeros (c, al);
+
+      al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c);
+      ah >>= c;
+      bit ^= (c << 1) & (bl ^ (bl >> 1));
+    }
+ ab_reduced:
+  ASSERT (bl & 1);
+  ASSERT (bl > 1);
+
+  return mpn_jacobi_base (al, bl, bit);
+}
+#else
+#error Unsupported value for JACOBI_2_METHOD
+#endif
diff --git a/third_party/gmp/mpn/generic/logops_n.c b/third_party/gmp/mpn/generic/logops_n.c
new file mode 100644
index 0000000..3adba2c
--- /dev/null
+++ b/third_party/gmp/mpn/generic/logops_n.c
@@ -0,0 +1,77 @@
+/* mpn_and_n, mpn_ior_n, etc -- mpn logical operations.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#ifdef OPERATION_and_n
+#define func __MPN(and_n)
+#define call mpn_and_n
+#endif
+
+#ifdef OPERATION_andn_n
+#define func __MPN(andn_n)
+#define call mpn_andn_n
+#endif
+
+#ifdef OPERATION_nand_n
+#define func __MPN(nand_n)
+#define call mpn_nand_n
+#endif
+
+#ifdef OPERATION_ior_n
+#define func __MPN(ior_n)
+#define call mpn_ior_n
+#endif
+
+#ifdef OPERATION_iorn_n
+#define func __MPN(iorn_n)
+#define call mpn_iorn_n
+#endif
+
+#ifdef OPERATION_nior_n
+#define func __MPN(nior_n)
+#define call mpn_nior_n
+#endif
+
+#ifdef OPERATION_xor_n
+#define func __MPN(xor_n)
+#define call mpn_xor_n
+#endif
+
+#ifdef OPERATION_xnor_n
+#define func __MPN(xnor_n)
+#define call mpn_xnor_n
+#endif
+
+void
+func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  call (rp, up, vp, n);
+}
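+
+/* Illustrative note, not part of GMP's sources: this file is compiled
+   once per operation by the build system, selecting the wrapper via a
+   definition such as (command line assumed)
+
+     cc -DOPERATION_and_n ... logops_n.c
+
+   which makes func expand to __MPN(and_n), yielding an out-of-line
+   mpn_and_n whose body is the corresponding macro/inline from
+   gmp-impl.h.  */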
diff --git a/third_party/gmp/mpn/generic/lshift.c b/third_party/gmp/mpn/generic/lshift.c
new file mode 100644
index 0000000..7e1fdef
--- /dev/null
+++ b/third_party/gmp/mpn/generic/lshift.c
@@ -0,0 +1,72 @@
+/* mpn_lshift -- Shift left low level.
+
+Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by up and n limbs long) cnt bits to the left
+   and store the n least significant limbs of the result at rp.
+   Return the bits shifted out from the most significant limb.
+
+   Argument constraints:
+   1. 0 < cnt < GMP_NUMB_BITS.
+   2. If the result is to be written over the input, rp must be >= up.
+*/
+
+mp_limb_t
+mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
+{
+  mp_limb_t high_limb, low_limb;
+  unsigned int tnc;
+  mp_size_t i;
+  mp_limb_t retval;
+
+  ASSERT (n >= 1);
+  ASSERT (cnt >= 1);
+  ASSERT (cnt < GMP_NUMB_BITS);
+  ASSERT (MPN_SAME_OR_DECR_P (rp, up, n));
+
+  up += n;
+  rp += n;
+
+  tnc = GMP_NUMB_BITS - cnt;
+  low_limb = *--up;
+  retval = low_limb >> tnc;
+  high_limb = (low_limb << cnt) & GMP_NUMB_MASK;
+
+  for (i = n - 1; i != 0; i--)
+    {
+      low_limb = *--up;
+      *--rp = high_limb | (low_limb >> tnc);
+      high_limb = (low_limb << cnt) & GMP_NUMB_MASK;
+    }
+  *--rp = high_limb;
+
+  return retval;
+}
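
Working from the most significant limb downwards is what makes constraint 2
(rp >= up) safe for overlapping, in-place shifts.  A caller-side sketch of
the documented interface; the concrete constants assume 64-bit limbs:

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  /* assumes GMP_NUMB_BITS == 64 for the values below */
  mp_limb_t u[2] = { 0x1, 0x8000000000000000UL };
  mp_limb_t r[2];

  mp_limb_t out = mpn_lshift (r, u, 2, 4);

  /* out = 0x8 (the four bits shifted off the top); r[1] = 0x0, r[0] = 0x10 */
  printf ("out=%lx high=%lx low=%lx\n",
          (unsigned long) out, (unsigned long) r[1], (unsigned long) r[0]);
  return 0;
}
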
diff --git a/third_party/gmp/mpn/generic/lshiftc.c b/third_party/gmp/mpn/generic/lshiftc.c
new file mode 100644
index 0000000..a583602
--- /dev/null
+++ b/third_party/gmp/mpn/generic/lshiftc.c
@@ -0,0 +1,73 @@
+/* mpn_lshiftc -- Shift left low level with complement.
+
+Copyright 1991, 1993, 1994, 1996, 2000-2002, 2009 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by up and n limbs long) cnt bits to the left
+   and store the n least significant limbs of the result at rp.
+   Return the bits shifted out from the most significant limb.
+
+   Argument constraints:
+   1. 0 < cnt < GMP_NUMB_BITS.
+   2. If the result is to be written over the input, rp must be >= up.
+*/
+
+mp_limb_t
+mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
+{
+  mp_limb_t high_limb, low_limb;
+  unsigned int tnc;
+  mp_size_t i;
+  mp_limb_t retval;
+
+  ASSERT (n >= 1);
+  ASSERT (cnt >= 1);
+  ASSERT (cnt < GMP_NUMB_BITS);
+  ASSERT (MPN_SAME_OR_DECR_P (rp, up, n));
+
+  up += n;
+  rp += n;
+
+  tnc = GMP_NUMB_BITS - cnt;
+  low_limb = *--up;
+  retval = low_limb >> tnc;
+  high_limb = (low_limb << cnt);
+
+  for (i = n - 1; i != 0; i--)
+    {
+      low_limb = *--up;
+      *--rp = (~(high_limb | (low_limb >> tnc))) & GMP_NUMB_MASK;
+      high_limb = low_limb << cnt;
+    }
+  *--rp = (~high_limb) & GMP_NUMB_MASK;
+
+  return retval;
+}
diff --git a/third_party/gmp/mpn/generic/matrix22_mul.c b/third_party/gmp/mpn/generic/matrix22_mul.c
new file mode 100644
index 0000000..6a1299a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/matrix22_mul.c
@@ -0,0 +1,321 @@
+/* matrix22_mul.c.
+
+   Contributed by Niels Möller and Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define MUL(rp, ap, an, bp, bn) do {		\
+  if (an >= bn)					\
+    mpn_mul (rp, ap, an, bp, bn);		\
+  else						\
+    mpn_mul (rp, bp, bn, ap, an);		\
+} while (0)
+
+/* Inputs are unsigned. */
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  int c;
+  MPN_CMP (c, ap, bp, n);
+  if (c >= 0)
+    {
+      mpn_sub_n (rp, ap, bp, n);
+      return 0;
+    }
+  else
+    {
+      mpn_sub_n (rp, bp, ap, n);
+      return 1;
+    }
+}
+
+static int
+add_signed_n (mp_ptr rp,
+	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
+{
+  if (as != bs)
+    return as ^ abs_sub_n (rp, ap, bp, n);
+  else
+    {
+      ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
+      return as;
+    }
+}
+
+mp_size_t
+mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
+{
+  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
+      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
+    return 3*rn + 2*mn;
+  else
+    return 3*(rn + mn) + 5;
+}
+
+/* Algorithm:
+
+    / s0 \   /  1  0  0  0 \ / r0 \
+    | s1 |   |  0  1  0  1 | | r1 |
+    | s2 |   |  0  0 -1  1 | | r2 |
+    | s3 | = |  0  1 -1  1 | \ r3 /
+    | s4 |   | -1  1 -1  1 |
+    | s5 |   |  0  1  0  0 |
+    \ s6 /   \  0  0  1  0 /
+
+    / t0 \   /  1  0  0  0 \ / m0 \
+    | t1 |   |  0  1  0  1 | | m1 |
+    | t2 |   |  0  0 -1  1 | | m2 |
+    | t3 | = |  0  1 -1  1 | \ m3 /
+    | t4 |   | -1  1 -1  1 |
+    | t5 |   |  0  1  0  0 |
+    \ t6 /   \  0  0  1  0 /
+
+  Note: the two matrices above are the same, but s_i and t_i end up in
+  the same product only for i < 4; the last three products cross over,
+  as s4*t5, s5*t6 and s6*t4.  See "A Strassen-like Matrix
+  Multiplication suited for squaring and higher power computation" by
+  M. Bodrato, in Proceedings of ISSAC 2010.
+
+    / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
+    | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
+    | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
+    \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
+				    | s4*t5 |
+				    | s5*t6 |
+				    \ s6*t4 /
+
+  The scheduling uses two temporaries U0 and U1 to store products, and
+  two, S0 and T0, to store combinations of entries of the two
+  operands.
+*/
+
+/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
+ *
+ * Resulting elements are of size up to rn + mn + 1.
+ *
+ * Temporary storage: 3 rn + 3 mn + 5. */
+static void
+mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
+			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
+			   mp_ptr tp)
+{
+  mp_ptr s0, t0, u0, u1;
+  int r1s, r3s, s0s, t0s, u1s;
+  s0 = tp; tp += rn + 1;
+  t0 = tp; tp += mn + 1;
+  u0 = tp; tp += rn + mn + 1;
+  u1 = tp; /* rn + mn + 2 */
+
+  MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
+  r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
+  if (r3s)
+    {
+      r1s = abs_sub_n (r1, r1, r3, rn);
+      r1[rn] = 0;
+    }
+  else
+    {
+      r1[rn] = mpn_add_n (r1, r1, r3, rn);
+      r1s = 0;				/* r1 - r2 + r3  */
+    }
+  if (r1s)
+    {
+      s0[rn] = mpn_add_n (s0, r1, r0, rn);
+      s0s = 0;
+    }
+  else if (r1[rn] != 0)
+    {
+      s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
+      s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
+					/* Reverse sign! */
+    }
+  else
+    {
+      s0s = abs_sub_n (s0, r0, r1, rn);
+      s0[rn] = 0;
+    }
+  MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
+  r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
+  ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */
+
+  t0s = abs_sub_n (t0, m3, m2, mn);
+  u1s = r3s^t0s^1;			/* Reverse sign! */
+  MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
+  u1[rn+mn] = 0;
+  if (t0s)
+    {
+      t0s = abs_sub_n (t0, m1, t0, mn);
+      t0[mn] = 0;
+    }
+  else
+    {
+      t0[mn] = mpn_add_n (t0, t0, m1, mn);
+    }
+
+  /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
+     at r3. I'd expect that for matrices of random size, the high
+     words t0[mn] and r1[rn] are non-zero with a pretty small
+     probability. If that can be confirmed this should be done as an
+     unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
+     add_n. */
+  if (t0[mn] != 0)
+    {
+      MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
+      ASSERT (r1[rn] < 2);
+      if (r1[rn] != 0)
+	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
+    }
+  else
+    {
+      MUL (r3, r1, rn + 1, t0, mn);
+    }
+
+  ASSERT (r3[rn+mn] < 4);
+
+  u0[rn+mn] = 0;
+  if (r1s^t0s)
+    {
+      r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
+      r3s = 0;				/* u3 + u5 */
+    }
+
+  if (t0s)
+    {
+      t0[mn] = mpn_add_n (t0, t0, m0, mn);
+    }
+  else if (t0[mn] != 0)
+    {
+      t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
+    }
+  else
+    {
+      t0s = abs_sub_n (t0, t0, m0, mn);
+    }
+  MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
+  ASSERT (u0[rn+mn] < 2);
+  if (r1s)
+    {
+      ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
+    }
+  else
+    {
+      r1[rn] += mpn_add_n (r1, r1, r2, rn);
+    }
+  rn++;
+  t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
+					/* u3 + u5 + u6 */
+  ASSERT (r2[rn+mn-1] < 4);
+  r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
+					/* -u2 + u3 + u5  */
+  ASSERT (r3[rn+mn-1] < 3);
+  MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
+  ASSERT (u0[rn+mn-1] < 2);
+  t0[mn] = mpn_add_n (t0, m3, m1, mn);
+  MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
+  mn += rn;
+  ASSERT (u1[mn-1] < 4);
+  ASSERT (u1[mn] == 0);
+  ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
+					/* -u2 + u3 - u4 + u5  */
+  ASSERT (r1[mn-1] < 2);
+  if (r3s)
+    {
+      ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
+					/* u1 + u2 - u3 - u5  */
+    }
+  ASSERT (r3[mn-1] < 2);
+  if (t0s)
+    {
+      ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
+					/* u1 - u3 - u5 - u6  */
+    }
+  ASSERT (r2[mn-1] < 2);
+}
+
+void
+mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
+		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
+		  mp_ptr tp)
+{
+  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
+      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
+    {
+      mp_ptr p0, p1;
+      unsigned i;
+
+      /* Temporary storage: 3 rn + 2 mn */
+      p0 = tp + rn;
+      p1 = p0 + rn + mn;
+
+      for (i = 0; i < 2; i++)
+	{
+	  MPN_COPY (tp, r0, rn);
+
+	  if (rn >= mn)
+	    {
+	      mpn_mul (p0, r0, rn, m0, mn);
+	      mpn_mul (p1, r1, rn, m3, mn);
+	      mpn_mul (r0, r1, rn, m2, mn);
+	      mpn_mul (r1, tp, rn, m1, mn);
+	    }
+	  else
+	    {
+	      mpn_mul (p0, m0, mn, r0, rn);
+	      mpn_mul (p1, m3, mn, r1, rn);
+	      mpn_mul (r0, m2, mn, r1, rn);
+	      mpn_mul (r1, m1, mn, tp, rn);
+	    }
+	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
+	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
+
+	  r0 = r2; r1 = r3;
+	}
+    }
+  else
+    mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
+			       m0, m1, m2, m3, mn, tp);
+}
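
The recombination rows in the comment are easy to check with plain machine
integers before following the limb-level carry and sign juggling.  This
hypothetical demo evaluates the s_i, t_i and the seven products for a 2x2
integer matrix product R <- R*M, ignoring all of the carry management the
mpn version must do:

#include <assert.h>

/* R = (r[0] r[1]; r[2] r[3]), M = (m[0] m[1]; m[2] m[3]); computes R*M
   with the 7 multiplications of the scheme documented above. */
static void
strassen22_demo (long r[4], const long m[4])
{
  long s1 = r[1] + r[3], s2 = -r[2] + r[3];
  long s3 = r[1] - r[2] + r[3], s4 = -r[0] + s3;
  long t1 = m[1] + m[3], t2 = -m[2] + m[3];
  long t3 = m[1] - m[2] + m[3], t4 = -m[0] + t3;

  long p0 = r[0] * m[0];   /* s0 * t0 */
  long p1 = s1 * t1;
  long p2 = s2 * t2;
  long p3 = s3 * t3;
  long p4 = s4 * m[1];     /* s4 * t5 */
  long p5 = r[1] * m[2];   /* s5 * t6 */
  long p6 = r[2] * t4;     /* s6 * t4 */

  r[0] = p0 + p5;
  r[1] = -p2 + p3 - p4 + p5;
  r[2] = p1 - p3 - p5 - p6;
  r[3] = p1 + p2 - p3 - p5;
}

int
main (void)
{
  long r[4] = { 1, 2, 3, 4 }, m[4] = { 5, 6, 7, 8 };
  strassen22_demo (r, m);
  assert (r[0] == 19 && r[1] == 22 && r[2] == 43 && r[3] == 50);
  return 0;
}
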
diff --git a/third_party/gmp/mpn/generic/matrix22_mul1_inverse_vector.c b/third_party/gmp/mpn/generic/matrix22_mul1_inverse_vector.c
new file mode 100644
index 0000000..68d50b7
--- /dev/null
+++ b/third_party/gmp/mpn/generic/matrix22_mul1_inverse_vector.c
@@ -0,0 +1,64 @@
+/* matrix22_mul1_inverse_vector.c
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from
+   the left. Uses three buffers, to avoid a copy. */
+mp_size_t
+mpn_matrix22_mul1_inverse_vector (const struct hgcd_matrix1 *M,
+				  mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
+{
+  mp_limb_t h0, h1;
+
+  /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as
+
+     r  = u11 * a
+     r -= u01 * b
+     b *= u00
+     b -= u10 * a
+  */
+
+  h0 =    mpn_mul_1 (rp, ap, n, M->u[1][1]);
+  h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]);
+  ASSERT (h0 == h1);
+
+  h0 =    mpn_mul_1 (bp, bp, n, M->u[0][0]);
+  h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]);
+  ASSERT (h0 == h1);
+
+  n -= (rp[n-1] | bp[n-1]) == 0;
+  return n;
+}
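
The matrices delivered by the HGCD machinery have determinant 1, which is
why (u11, -u01; -u10, u00) acts as an exact inverse with no division.  A
scalar sanity check of that identity, with illustrative values only:

#include <assert.h>

int
main (void)
{
  /* M = (u00 u01; u10 u11) with det = u00*u11 - u01*u10 = 1 */
  long u00 = 3, u01 = 2, u10 = 4, u11 = 3;
  long a = 41, b = 29;

  /* forward: (a2; b2) = M (a; b) */
  long a2 = u00 * a + u01 * b;
  long b2 = u10 * a + u11 * b;

  /* inverse, as in the code above */
  long r = u11 * a2 - u01 * b2;
  long s = -u10 * a2 + u00 * b2;

  assert (r == a && s == b);
  return 0;
}
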
diff --git a/third_party/gmp/mpn/generic/mod_1.c b/third_party/gmp/mpn/generic/mod_1.c
new file mode 100644
index 0000000..8e415df
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mod_1.c
@@ -0,0 +1,280 @@
+/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) --
+   Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+   Return the single-limb remainder.
+   There are no constraints on the value of the divisor.
+
+Copyright 1991, 1993, 1994, 1999, 2000, 2002, 2007-2009, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd,
+   meaning the quotient size where that should happen, the quotient size
+   being how many udiv divisions will be done.
+
+   The default is to use preinv always, CPUs where this doesn't suit have
+   tuned thresholds.  Note in particular that preinv should certainly be
+   used if that's the only division available (USE_PREINV_ALWAYS).  */
+
+#ifndef MOD_1_NORM_THRESHOLD
+#define MOD_1_NORM_THRESHOLD  0
+#endif
+
+#ifndef MOD_1_UNNORM_THRESHOLD
+#define MOD_1_UNNORM_THRESHOLD  0
+#endif
+
+#ifndef MOD_1U_TO_MOD_1_1_THRESHOLD
+#define MOD_1U_TO_MOD_1_1_THRESHOLD  MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */
+#endif
+
+#ifndef MOD_1N_TO_MOD_1_1_THRESHOLD
+#define MOD_1N_TO_MOD_1_1_THRESHOLD  MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */
+#endif
+
+#ifndef MOD_1_1_TO_MOD_1_2_THRESHOLD
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD  10
+#endif
+
+#ifndef MOD_1_2_TO_MOD_1_4_THRESHOLD
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD  20
+#endif
+
+#if TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p
+/* Duplicates declarations in tune/speed.h */
+mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]);
+mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]);
+
+void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t);
+void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t);
+
+#undef mpn_mod_1_1p
+#define mpn_mod_1_1p(ap, n, b, pre)			     \
+  (mod_1_1p_method == 1 ? mpn_mod_1_1p_1 (ap, n, b, pre)     \
+   : (mod_1_1p_method == 2 ? mpn_mod_1_1p_2 (ap, n, b, pre)  \
+      : __gmpn_mod_1_1p (ap, n, b, pre)))
+
+#undef mpn_mod_1_1p_cps
+#define mpn_mod_1_1p_cps(pre, b)				\
+  (mod_1_1p_method == 1 ? mpn_mod_1_1p_cps_1 (pre, b)		\
+   : (mod_1_1p_method == 2 ? mpn_mod_1_1p_cps_2 (pre, b)	\
+      : __gmpn_mod_1_1p_cps (pre, b)))
+#endif /* TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p */
+
+
+/* The comments in mpn/generic/divrem_1.c apply here too.
+
+   As noted in the algorithms section of the manual, the shifts in the loop
+   for the unnorm case can be avoided by calculating r = a%(d*2^n), followed
+   by a final (r*2^n)%(d*2^n).  In fact if it happens that a%(d*2^n) can
+   skip a division where (a*2^n)%(d*2^n) can't then there's the same number
+   of divide steps, though how often that happens depends on the assumed
+   distributions of dividend and divisor.  In any case this idea is left to
+   CPU specific implementations to consider.  */
+
+static mp_limb_t
+mpn_mod_1_unnorm (mp_srcptr up, mp_size_t un, mp_limb_t d)
+{
+  mp_size_t  i;
+  mp_limb_t  n1, n0, r;
+  mp_limb_t  dummy;
+  int cnt;
+
+  ASSERT (un > 0);
+  ASSERT (d != 0);
+
+  d <<= GMP_NAIL_BITS;
+
+  /* Skip a division if high < divisor.  Having the test here before
+     normalizing will still skip as often as possible.  */
+  r = up[un - 1] << GMP_NAIL_BITS;
+  if (r < d)
+    {
+      r >>= GMP_NAIL_BITS;
+      un--;
+      if (un == 0)
+	return r;
+    }
+  else
+    r = 0;
+
+  /* If udiv_qrnnd doesn't need a normalized divisor, can use the simple
+     code above. */
+  if (! UDIV_NEEDS_NORMALIZATION
+      && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
+    {
+      for (i = un - 1; i >= 0; i--)
+	{
+	  n0 = up[i] << GMP_NAIL_BITS;
+	  udiv_qrnnd (dummy, r, r, n0, d);
+	  r >>= GMP_NAIL_BITS;
+	}
+      return r;
+    }
+
+  count_leading_zeros (cnt, d);
+  d <<= cnt;
+
+  n1 = up[un - 1] << GMP_NAIL_BITS;
+  r = (r << cnt) | (n1 >> (GMP_LIMB_BITS - cnt));
+
+  if (UDIV_NEEDS_NORMALIZATION
+      && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD))
+    {
+      mp_limb_t nshift;
+      for (i = un - 2; i >= 0; i--)
+	{
+	  n0 = up[i] << GMP_NAIL_BITS;
+	  nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
+	  udiv_qrnnd (dummy, r, r, nshift, d);
+	  r >>= GMP_NAIL_BITS;
+	  n1 = n0;
+	}
+      udiv_qrnnd (dummy, r, r, n1 << cnt, d);
+      r >>= GMP_NAIL_BITS;
+      return r >> cnt;
+    }
+  else
+    {
+      mp_limb_t inv, nshift;
+      invert_limb (inv, d);
+
+      for (i = un - 2; i >= 0; i--)
+	{
+	  n0 = up[i] << GMP_NAIL_BITS;
+	  nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt));
+	  udiv_rnnd_preinv (r, r, nshift, d, inv);
+	  r >>= GMP_NAIL_BITS;
+	  n1 = n0;
+	}
+      udiv_rnnd_preinv (r, r, n1 << cnt, d, inv);
+      r >>= GMP_NAIL_BITS;
+      return r >> cnt;
+    }
+}
+
+static mp_limb_t
+mpn_mod_1_norm (mp_srcptr up, mp_size_t un, mp_limb_t d)
+{
+  mp_size_t  i;
+  mp_limb_t  n0, r;
+  mp_limb_t  dummy;
+
+  ASSERT (un > 0);
+
+  d <<= GMP_NAIL_BITS;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+
+  /* High limb is initial remainder, possibly with one subtract of
+     d to get r<d.  */
+  r = up[un - 1] << GMP_NAIL_BITS;
+  if (r >= d)
+    r -= d;
+  r >>= GMP_NAIL_BITS;
+  un--;
+  if (un == 0)
+    return r;
+
+  if (BELOW_THRESHOLD (un, MOD_1_NORM_THRESHOLD))
+    {
+      for (i = un - 1; i >= 0; i--)
+	{
+	  n0 = up[i] << GMP_NAIL_BITS;
+	  udiv_qrnnd (dummy, r, r, n0, d);
+	  r >>= GMP_NAIL_BITS;
+	}
+      return r;
+    }
+  else
+    {
+      mp_limb_t  inv;
+      invert_limb (inv, d);
+      for (i = un - 1; i >= 0; i--)
+	{
+	  n0 = up[i] << GMP_NAIL_BITS;
+	  udiv_rnnd_preinv (r, r, n0, d, inv);
+	  r >>= GMP_NAIL_BITS;
+	}
+      return r;
+    }
+}
+
+mp_limb_t
+mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
+{
+  ASSERT (n >= 0);
+  ASSERT (b != 0);
+
+  /* Should this be handled at all?  Rely on callers?  Note n==0 is currently
+     required by mpz/fdiv_r_ui.c and possibly other places.  */
+  if (n == 0)
+    return 0;
+
+  if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
+    {
+      if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
+	{
+	  return mpn_mod_1_norm (ap, n, b);
+	}
+      else
+	{
+	  mp_limb_t pre[4];
+	  mpn_mod_1_1p_cps (pre, b);
+	  return mpn_mod_1_1p (ap, n, b, pre);
+	}
+    }
+  else
+    {
+      if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
+	{
+	  return mpn_mod_1_unnorm (ap, n, b);
+	}
+      else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
+	{
+	  mp_limb_t pre[4];
+	  mpn_mod_1_1p_cps (pre, b);
+	  return mpn_mod_1_1p (ap, n, b << pre[1], pre);
+	}
+      else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
+	{
+	  mp_limb_t pre[5];
+	  mpn_mod_1s_2p_cps (pre, b);
+	  return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
+	}
+      else
+	{
+	  mp_limb_t pre[7];
+	  mpn_mod_1s_4p_cps (pre, b);
+	  return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
+	}
+    }
+}
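
So mpn_mod_1 itself is purely a dispatcher: divisors with the high bit set
go to mpn_mod_1_norm or mpn_mod_1_1p, the rest to mpn_mod_1_unnorm,
mpn_mod_1_1p, mpn_mod_1s_2p or mpn_mod_1s_4p, as the tuned thresholds
direct.  From the outside it is simply the documented one-limb remainder:

#include <stdio.h>
#include <gmp.h>

int
main (void)
{
  /* value = 1*B^2 + 0*B + 5, where B = 2^GMP_NUMB_BITS */
  mp_limb_t a[3] = { 5, 0, 1 };

  /* the remainder depends on GMP_NUMB_BITS, since the value does */
  mp_limb_t r = mpn_mod_1 (a, 3, 7);
  printf ("remainder mod 7: %lu\n", (unsigned long) r);
  return 0;
}
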
diff --git a/third_party/gmp/mpn/generic/mod_1_1.c b/third_party/gmp/mpn/generic/mod_1_1.c
new file mode 100644
index 0000000..f6342d6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mod_1_1.c
@@ -0,0 +1,332 @@
+/* mpn_mod_1_1p (ap, n, b, cps)
+   Divide (ap,,n) by b.  Return the single-limb remainder.
+
+   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
+   Based on a suggestion by Peter L. Montgomery.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008-2011, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef MOD_1_1P_METHOD
+# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
+#endif
+
+/* Define some longlong.h-style macros, but for wider operations.
+ * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
+ * carry out, in the form of a mask. */
+
+#if defined (__GNUC__) && ! defined (NO_ASM)
+
+#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "add	%6, %k2\n\t"					\
+	     "adc	%4, %k1\n\t"					\
+	     "sbb	%k0, %k0"					\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
+	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "add	%6, %q2\n\t"					\
+	     "adc	%4, %q1\n\t"					\
+	     "sbb	%q0, %q0"					\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
+	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 32
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
+	     "addxcc	%r3, %4, %1\n\t"				\
+	     "subx	%%g0, %%g0, %0"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
+	 __CLOBBER_CC)
+#endif
+
+#if defined (__sparc__) && W_TYPE_SIZE == 64
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
+	     "addccc	%r7, %8, %%g0\n\t"				\
+	     "addccc	%r3, %4, %1\n\t"				\
+	     "clr	%0\n\t"						\
+	     "movcs	%%xcc, -1, %0"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
+	     "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
+	 __CLOBBER_CC)
+#if __VIS__ >= 0x300
+#undef add_mssaaaa
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
+	     "addxccc	%r3, %4, %1\n\t"				\
+	     "clr	%0\n\t"						\
+	     "movcs	%%xcc, -1, %0"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
+	 __CLOBBER_CC)
+#endif
+#endif
+
+#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
+/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
+   processor running in 32-bit mode, since the carry flag then gets the 32-bit
+   carry.  */
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "add%I6c	%2, %5, %6\n\t"					\
+	     "adde	%1, %3, %4\n\t"					\
+	     "subfe	%0, %0, %0\n\t"					\
+	     "nor	%0, %0, %0"					\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)			\
+	     __CLOBBER_CC)
+#endif
+
+#if defined (__s390x__) && W_TYPE_SIZE == 64
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  __asm__ (  "algr	%2, %6\n\t"					\
+	     "alcgr	%1, %4\n\t"					\
+	     "lghi	%0, 0\n\t"					\
+	     "alcgr	%0, %0\n\t"					\
+	     "lcgr	%0, %0"						\
+	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
+	   : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),			\
+	     "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
+#endif
+
+#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
+#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
+  __asm__ (  "adds	%2, %5, %6\n\t"					\
+	     "adcs	%1, %3, %4\n\t"					\
+	     "movcc	%0, #0\n\t"					\
+	     "movcs	%0, #-1"					\
+	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
+	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
+#endif
+#endif /* defined (__GNUC__) */
+
+#ifndef add_mssaaaa
+#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
+  do {									\
+    UWtype __s0, __s1, __c0, __c1;					\
+    __s0 = (a0) + (b0);							\
+    __s1 = (a1) + (b1);							\
+    __c0 = __s0 < (a0);							\
+    __c1 = __s1 < (a1);							\
+    (s0) = __s0;							\
+    __s1 = __s1 + __c0;							\
+    (s1) = __s1;							\
+    (m) = - (__c1 + (__s1 < __c0));					\
+  } while (0)
+#endif
+
+#if MOD_1_1P_METHOD == 1
+void
+mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B1modb, B2modb;
+  int cnt;
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  B1modb = -b;
+  if (LIKELY (cnt != 0))
+    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+  cps[2] = B1modb >> cnt;
+
+  /* In the normalized case, this can be simplified to
+   *
+   *   B2modb = - b * bi;
+   *   ASSERT (B2modb <= b);    // NB: equality iff b = B/2
+   */
+  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+  cps[3] = B2modb >> cnt;
+}
+
+mp_limb_t
+mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
+{
+  mp_limb_t rh, rl, bi, ph, pl, r;
+  mp_limb_t B1modb, B2modb;
+  mp_size_t i;
+  int cnt;
+  mp_limb_t mask;
+
+  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */
+
+  B1modb = bmodb[2];
+  B2modb = bmodb[3];
+
+  rl = ap[n - 1];
+  umul_ppmm (ph, pl, rl, B1modb);
+  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);
+
+  for (i = n - 3; i >= 0; i -= 1)
+    {
+      /* rr = ap[i]				< B
+	    + LO(rr)  * (B mod b)		<= (B-1)(b-1)
+	    + HI(rr)  * (B^2 mod b)		<= (B-1)(b-1)
+      */
+      umul_ppmm (ph, pl, rl, B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);
+
+      umul_ppmm (rh, rl, rh, B2modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+    }
+
+  cnt = bmodb[1];
+  bi = bmodb[0];
+
+  if (LIKELY (cnt != 0))
+    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
+
+  mask = -(mp_limb_t) (rh >= b);
+  rh -= mask & b;
+
+  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);
+
+  return r >> cnt;
+}
+#endif /* MOD_1_1P_METHOD == 1 */
+
+#if MOD_1_1P_METHOD == 2
+void
+mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B2modb;
+  int cnt;
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  if (LIKELY (cnt != 0))
+    {
+      mp_limb_t B1modb = -b;
+      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+      ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+      cps[2] = B1modb >> cnt;
+    }
+  B2modb = - b * bi;
+  ASSERT (B2modb <= b);    /* NB: equality iff b = B/2 */
+  cps[3] = B2modb;
+}
+
+mp_limb_t
+mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
+{
+  int cnt;
+  mp_limb_t bi, B1modb;
+  mp_limb_t r0, r1;
+  mp_limb_t r;
+
+  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */
+
+  r0 = ap[n-2];
+  r1 = ap[n-1];
+
+  if (n > 2)
+    {
+      mp_limb_t B2modb, B2mb;
+      mp_limb_t p0, p1;
+      mp_limb_t r2;
+      mp_size_t j;
+
+      B2modb = bmodb[3];
+      B2mb = B2modb - b;
+
+      umul_ppmm (p1, p0, r1, B2modb);
+      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);
+
+      for (j = n-4; j >= 0; j--)
+	{
+	  mp_limb_t cy;
+	  /* mp_limb_t t = r0 + B2mb; */
+	  umul_ppmm (p1, p0, r1, B2modb);
+
+	  ADDC_LIMB (cy, r0, r0, r2 & B2modb);
+	  /* Alternative, for cmov: if (cy) r0 = t; */
+	  r0 -= (-cy) & b;
+	  add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
+	}
+
+      r1 -= (r2 & b);
+    }
+
+  cnt = bmodb[1];
+
+  if (LIKELY (cnt != 0))
+    {
+      mp_limb_t t;
+      mp_limb_t B1modb = bmodb[2];
+
+      umul_ppmm (r1, t, r1, B1modb);
+      r0 += t;
+      r1 += (r0 < t);
+
+      /* Normalize */
+      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
+      r0 <<= cnt;
+
+      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */
+    }
+  else
+    {
+      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
+      r1 -= mask & b;
+    }
+
+  bi = bmodb[0];
+
+  udiv_rnnd_preinv (r, r1, r0, b, bi);
+  return r >> cnt;
+}
+#endif /* MOD_1_1P_METHOD == 2 */
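
Both methods rest on the same folding identity: with B = 2^GMP_NUMB_BITS,
the two-limb state (rh, rl) is replaced by rl*(B mod b) + rh*(B^2 mod b)
plus the next limb, which preserves the value mod b while consuming one
limb per step.  A scaled-down sketch of the idea using 16-bit "limbs"
inside 64-bit arithmetic; hypothetical demo code, assuming b < 2^16 so
the running sum cannot overflow:

#include <assert.h>
#include <stdint.h>

/* Reduce a number given as n base-2^16 digits (least significant first)
   modulo b, folding with B1 = B mod b and B2 = B^2 mod b, B = 2^16. */
static uint32_t
fold_mod_demo (const uint16_t *a, int n, uint32_t b)
{
  uint32_t B1 = (1u << 16) % b;
  uint32_t B2 = (uint32_t) (((uint64_t) B1 * B1) % b);
  uint64_t r = a[n - 1];

  for (int i = n - 2; i >= 0; i--)
    r = (r >> 16) * B2 + (r & 0xffff) * B1 + a[i];  /* r*B + a[i] (mod b) */

  return (uint32_t) (r % b);  /* one full reduction at the end, as above */
}

int
main (void)
{
  uint16_t a[3] = { 5, 0, 1 };                      /* 1*2^32 + 5 */
  uint64_t v = ((uint64_t) 1 << 32) + 5;
  assert (fold_mod_demo (a, 3, 65521) == (uint32_t) (v % 65521));
  return 0;
}
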
diff --git a/third_party/gmp/mpn/generic/mod_1_2.c b/third_party/gmp/mpn/generic/mod_1_2.c
new file mode 100644
index 0000000..b00d19e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mod_1_2.c
@@ -0,0 +1,148 @@
+/* mpn_mod_1s_2p (ap, n, b, cps)
+   Divide (ap,,n) by b.  Return the single-limb remainder.
+   Requires that b < B / 2.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Based on a suggestion by Peter L. Montgomery.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_mod_1s_2p_cps (mp_limb_t cps[5], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B1modb, B2modb, B3modb;
+  int cnt;
+
+  ASSERT (b <= (~(mp_limb_t) 0) / 2);
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+  cps[2] = B1modb >> cnt;
+
+  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+  cps[3] = B2modb >> cnt;
+
+  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
+  cps[4] = B3modb >> cnt;
+
+#if WANT_ASSERT
+  {
+    int i;
+    b = cps[2];
+    for (i = 3; i <= 4; i++)
+      {
+	b += cps[i];
+	ASSERT (b >= cps[i]);
+      }
+  }
+#endif
+}
+
+mp_limb_t
+mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[5])
+{
+  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+  mp_limb_t B1modb, B2modb, B3modb;
+  mp_size_t i;
+  int cnt;
+
+  ASSERT (n >= 1);
+
+  B1modb = cps[2];
+  B2modb = cps[3];
+  B3modb = cps[4];
+
+  if ((n & 1) != 0)
+    {
+      if (n == 1)
+	{
+	  rl = ap[n - 1];
+	  bi = cps[0];
+	  cnt = cps[1];
+	  udiv_rnnd_preinv (r, rl >> (GMP_LIMB_BITS - cnt),
+			     rl << cnt, b, bi);
+	  return r >> cnt;
+	}
+
+      umul_ppmm (ph, pl, ap[n - 2], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
+      umul_ppmm (rh, rl, ap[n - 1], B2modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+      n--;
+    }
+  else
+    {
+      rh = ap[n - 1];
+      rl = ap[n - 2];
+    }
+
+  for (i = n - 4; i >= 0; i -= 2)
+    {
+      /* rr = ap[i]				< B
+	    + ap[i+1] * (B mod b)		<= (B-1)(b-1)
+	    + LO(rr)  * (B^2 mod b)		<= (B-1)(b-1)
+	    + HI(rr)  * (B^3 mod b)		<= (B-1)(b-1)
+      */
+      umul_ppmm (ph, pl, ap[i + 1], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+
+      umul_ppmm (ch, cl, rl, B2modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+      umul_ppmm (rh, rl, rh, B3modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+    }
+
+  umul_ppmm (rh, cl, rh, B1modb);
+  add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
+
+  cnt = cps[1];
+  bi = cps[0];
+
+  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
+  udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+
+  return r >> cnt;
+}
diff --git a/third_party/gmp/mpn/generic/mod_1_3.c b/third_party/gmp/mpn/generic/mod_1_3.c
new file mode 100644
index 0000000..4d4be5d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mod_1_3.c
@@ -0,0 +1,156 @@
+/* mpn_mod_1s_3p (ap, n, b, cps)
+   Divide (ap,,n) by b.  Return the single-limb remainder.
+   Requires that b < B / 3.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Based on a suggestion by Peter L. Montgomery.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008-2010, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_mod_1s_3p_cps (mp_limb_t cps[6], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B1modb, B2modb, B3modb, B4modb;
+  int cnt;
+
+  ASSERT (b <= (~(mp_limb_t) 0) / 3);
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+  cps[2] = B1modb >> cnt;
+
+  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+  cps[3] = B2modb >> cnt;
+
+  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
+  cps[4] = B3modb >> cnt;
+
+  udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
+  cps[5] = B4modb >> cnt;
+
+#if WANT_ASSERT
+  {
+    int i;
+    b = cps[2];
+    for (i = 3; i <= 5; i++)
+      {
+	b += cps[i];
+	ASSERT (b >= cps[i]);
+      }
+  }
+#endif
+}
+
+mp_limb_t
+mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[6])
+{
+  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+  mp_limb_t B1modb, B2modb, B3modb, B4modb;
+  mp_size_t i;
+  int cnt;
+
+  ASSERT (n >= 1);
+
+  B1modb = cps[2];
+  B2modb = cps[3];
+  B3modb = cps[4];
+  B4modb = cps[5];
+
+  /* We compute n mod 3 in a tricky way, which works except for when n is so
+     close to the maximum size that we don't need to support it.  The final
+     cast to int is a workaround for HP cc.  */
+  switch ((int) ((mp_limb_t) n * MODLIMB_INVERSE_3 >> (GMP_NUMB_BITS - 2)))
+    {
+    case 0:
+      umul_ppmm (ph, pl, ap[n - 2], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
+      umul_ppmm (rh, rl, ap[n - 1], B2modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+      n -= 3;
+      break;
+    case 2:	/* n mod 3 = 1 */
+      rh = 0;
+      rl = ap[n - 1];
+      n -= 1;
+      break;
+    case 1:	/* n mod 3 = 2 */
+      rh = ap[n - 1];
+      rl = ap[n - 2];
+      n -= 2;
+      break;
+    }
+
+  for (i = n - 3; i >= 0; i -= 3)
+    {
+      /* rr = ap[i]				< B
+	    + ap[i+1] * (B mod b)		<= (B-1)(b-1)
+	    + ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
+	    + LO(rr)  * (B^3 mod b)		<= (B-1)(b-1)
+	    + HI(rr)  * (B^4 mod b)		<= (B-1)(b-1)
+      */
+      umul_ppmm (ph, pl, ap[i + 1], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+
+      umul_ppmm (ch, cl, ap[i + 2], B2modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+      umul_ppmm (ch, cl, rl, B3modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+      umul_ppmm (rh, rl, rh, B4modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+    }
+
+  umul_ppmm (rh, cl, rh, B1modb);
+  add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
+
+  cnt = cps[1];
+  bi = cps[0];
+
+  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
+  udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+
+  return r >> cnt;
+}
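
The "tricky" n mod 3 above works because multiplying by 3^{-1} mod
2^GMP_NUMB_BITS sends a small n with n = 0, 1, 2 (mod 3) to a value near
0, (2/3)*2^w and (1/3)*2^w respectively, so the top two bits decode the
residue as 0, 2, 1; the mapping only breaks for n within a small factor
of the word range, far beyond any real operand size.  A 32-bit check of
the decoding (demo only):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* 3^-1 mod 2^32: 3 * 0xAAAAAAAB = 0x200000001, i.e. 1 (mod 2^32) */
  const uint32_t inv3 = 0xAAAAAAABu;

  for (uint32_t n = 1; n < 100000; n++)
    {
      uint32_t top = (uint32_t) (n * inv3) >> 30;
      uint32_t r = (top == 0) ? 0 : (top == 2) ? 1 : 2;
      assert (r == n % 3);
    }
  return 0;
}
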
diff --git a/third_party/gmp/mpn/generic/mod_1_4.c b/third_party/gmp/mpn/generic/mod_1_4.c
new file mode 100644
index 0000000..80b42ba
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mod_1_4.c
@@ -0,0 +1,170 @@
+/* mpn_mod_1s_4p (ap, n, b, cps)
+   Divide (ap,,n) by b.  Return the single-limb remainder.
+   Requires that b < B / 4.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Based on a suggestion by Peter L. Montgomery.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
+  int cnt;
+
+  ASSERT (b <= (~(mp_limb_t) 0) / 4);
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+  cps[2] = B1modb >> cnt;
+
+  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+  cps[3] = B2modb >> cnt;
+
+  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
+  cps[4] = B3modb >> cnt;
+
+  udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
+  cps[5] = B4modb >> cnt;
+
+  udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi);
+  cps[6] = B5modb >> cnt;
+
+#if WANT_ASSERT
+  {
+    int i;
+    b = cps[2];
+    for (i = 3; i <= 6; i++)
+      {
+	b += cps[i];
+	ASSERT (b >= cps[i]);
+      }
+  }
+#endif
+}
+
+mp_limb_t
+mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
+{
+  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
+  mp_size_t i;
+  int cnt;
+
+  ASSERT (n >= 1);
+
+  B1modb = cps[2];
+  B2modb = cps[3];
+  B3modb = cps[4];
+  B4modb = cps[5];
+  B5modb = cps[6];
+
+  switch (n & 3)
+    {
+    case 0:
+      umul_ppmm (ph, pl, ap[n - 3], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]);
+      umul_ppmm (ch, cl, ap[n - 2], B2modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+      umul_ppmm (rh, rl, ap[n - 1], B3modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+      n -= 4;
+      break;
+    case 1:
+      rh = 0;
+      rl = ap[n - 1];
+      n -= 1;
+      break;
+    case 2:
+      rh = ap[n - 1];
+      rl = ap[n - 2];
+      n -= 2;
+      break;
+    case 3:
+      umul_ppmm (ph, pl, ap[n - 2], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
+      umul_ppmm (rh, rl, ap[n - 1], B2modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+      n -= 3;
+      break;
+    }
+
+  for (i = n - 4; i >= 0; i -= 4)
+    {
+      /* rr = ap[i]				< B
+	    + ap[i+1] * (B mod b)		<= (B-1)(b-1)
+	    + ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
+	    + ap[i+3] * (B^3 mod b)		<= (B-1)(b-1)
+	    + LO(rr)  * (B^4 mod b)		<= (B-1)(b-1)
+	    + HI(rr)  * (B^5 mod b)		<= (B-1)(b-1)
+      */
+      umul_ppmm (ph, pl, ap[i + 1], B1modb);
+      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+
+      umul_ppmm (ch, cl, ap[i + 2], B2modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+      umul_ppmm (ch, cl, ap[i + 3], B3modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+      umul_ppmm (ch, cl, rl, B4modb);
+      add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+      umul_ppmm (rh, rl, rh, B5modb);
+      add_ssaaaa (rh, rl, rh, rl, ph, pl);
+    }
+
+  umul_ppmm (rh, cl, rh, B1modb);
+  add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
+
+  cnt = cps[1];
+  bi = cps[0];
+
+  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
+  udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+
+  return r >> cnt;
+}
diff --git a/third_party/gmp/mpn/generic/mod_34lsub1.c b/third_party/gmp/mpn/generic/mod_34lsub1.c
new file mode 100644
index 0000000..af9c6c6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mod_34lsub1.c
@@ -0,0 +1,128 @@
+/* mpn_mod_34lsub1 -- remainder modulo 2^(GMP_NUMB_BITS*3/4)-1.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+/* Calculate a remainder from {p,n} divided by 2^(GMP_NUMB_BITS*3/4)-1.
+   The remainder is not fully reduced, it's any limb value congruent to
+   {p,n} modulo that divisor.
+
+   This implementation is only correct when GMP_NUMB_BITS is a multiple of
+   4.
+
+   FIXME: If GMP_NAIL_BITS is some silly big value during development then
+   it's possible the carry accumulators c0,c1,c2 could overflow.
+
+   General notes:
+
+   The basic idea is to use a set of N accumulators (N=3 in this case) to
+   effectively get a remainder mod 2^(GMP_NUMB_BITS*N)-1 followed at the end
+   by a reduction to GMP_NUMB_BITS*N/M bits (M=4 in this case) for a
+   remainder mod 2^(GMP_NUMB_BITS*N/M)-1.  N and M are chosen to give a good
+   set of small prime factors in 2^(GMP_NUMB_BITS*N/M)-1.
+
+   N=3 M=4 suits GMP_NUMB_BITS==32 and GMP_NUMB_BITS==64 quite well, giving
+   a few more primes than a single accumulator N=1 does, and for no extra
+   cost (assuming the processor has a decent number of registers).
+
+   For strange nailified values of GMP_NUMB_BITS the idea would be to look
+   for what N and M give good primes.  With GMP_NUMB_BITS not a power of 2
+   the choices for M may be opened up a bit.  But such things are probably
+   best done in separate code, not grafted on here.  */
+
+#if GMP_NUMB_BITS % 4 == 0
+
+#define B1  (GMP_NUMB_BITS / 4)
+#define B2  (B1 * 2)
+#define B3  (B1 * 3)
+
+#define M1  ((CNST_LIMB(1) << B1) - 1)
+#define M2  ((CNST_LIMB(1) << B2) - 1)
+#define M3  ((CNST_LIMB(1) << B3) - 1)
+
+#define LOW0(n)      ((n) & M3)
+#define HIGH0(n)     ((n) >> B3)
+
+#define LOW1(n)      (((n) & M2) << B1)
+#define HIGH1(n)     ((n) >> B2)
+
+#define LOW2(n)      (((n) & M1) << B2)
+#define HIGH2(n)     ((n) >> B1)
+
+#define PARTS0(n)    (LOW0(n) + HIGH0(n))
+#define PARTS1(n)    (LOW1(n) + HIGH1(n))
+#define PARTS2(n)    (LOW2(n) + HIGH2(n))
+
+#define ADD(c,a,val)                    \
+  do {                                  \
+    mp_limb_t  new_c;                   \
+    ADDC_LIMB (new_c, a, a, val);       \
+    (c) += new_c;                       \
+  } while (0)
+
+mp_limb_t
+mpn_mod_34lsub1 (mp_srcptr p, mp_size_t n)
+{
+  mp_limb_t  c0, c1, c2;
+  mp_limb_t  a0, a1, a2;
+
+  ASSERT (n >= 1);
+  ASSERT (n/3 < GMP_NUMB_MAX);
+
+  a0 = a1 = a2 = 0;
+  c0 = c1 = c2 = 0;
+
+  while ((n -= 3) >= 0)
+    {
+      ADD (c0, a0, p[0]);
+      ADD (c1, a1, p[1]);
+      ADD (c2, a2, p[2]);
+      p += 3;
+    }
+
+  if (n != -3)
+    {
+      ADD (c0, a0, p[0]);
+      if (n != -2)
+	ADD (c1, a1, p[1]);
+    }
+
+  return
+    PARTS0 (a0) + PARTS1 (a1) + PARTS2 (a2)
+    + PARTS1 (c0) + PARTS2 (c1) + PARTS0 (c2);
+}
+
+#endif
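
Stripped of the accumulator machinery, the whole file is one application of
2^k == 1 (mod 2^k - 1): a value can be cut into k-bit chunks and the chunks
summed without changing the residue.  A one-word illustration with k = 24,
the B3 of the GMP_NUMB_BITS == 32 case (hypothetical demo):

#include <assert.h>
#include <stdint.h>

/* Residue of x mod 2^24 - 1 by summing 24-bit chunks.  The chunk sum is
   merely congruent; this demo finishes with one %, where the mpn
   function deliberately stops at "congruent, not fully reduced". */
static uint32_t
fold24_demo (uint64_t x)
{
  uint64_t m = ((uint64_t) 1 << 24) - 1;
  uint64_t r = (x & m) + ((x >> 24) & m) + (x >> 48);
  return (uint32_t) (r % m);
}

int
main (void)
{
  uint64_t x = 0x0123456789abcdefULL;
  assert (fold24_demo (x) == x % (((uint64_t) 1 << 24) - 1));
  return 0;
}
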
diff --git a/third_party/gmp/mpn/generic/mode1o.c b/third_party/gmp/mpn/generic/mode1o.c
new file mode 100644
index 0000000..9ba0ae1
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mode1o.c
@@ -0,0 +1,235 @@
+/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Calculate an r satisfying
+
+           r*B^k + a - c == q*d
+
+   where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
+   (the caller won't know which), and q is the quotient (discarded).  d must
+   be odd, c can be any limb value.
+
+   If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
+
+   This slightly strange function suits the initial Nx1 reduction for GCDs
+   or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
+   -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
+   ignored, or for the Jacobi symbol it can be accounted for.  The function
+   also suits divisibility and congruence testing since if r=0 (or r=d) is
+   obtained then a==c mod d.
+
+
+   r is a bit like the remainder returned by mpn_divexact_by3c, and is the
+   sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
+   represents a borrow, since effectively quotient limbs are chosen so that
+   subtracting that multiple of d from src at each step will produce a zero
+   limb.
+
+   A long calculation can be done piece by piece from low to high by passing
+   the return value from one part as the carry parameter to the next part.
+   The effective final k becomes anything between size and size-n, if n
+   pieces are used.
+
+
+   A similar sort of routine could be constructed based on adding multiples
+   of d at each limb, much like redc in mpz_powm does.  Subtracting however
+   has a small advantage that when subtracting to cancel out l there's never
+   a borrow into h, whereas using an addition would put a carry into h
+   depending whether l==0 or l!=0.
+
+
+   In terms of efficiency, this function is similar to a mul-by-inverse
+   mpn_mod_1.  Both are essentially two multiplies and are best suited to
+   CPUs with low latency multipliers (in comparison to a divide instruction
+   at least.)  But modexact has a few less supplementary operations, only
+   needs low part and high part multiplies, and has fewer working quantities
+   (helping CPUs with few registers).
+
+
+   In the main loop it will be noted that the new carry (call it r) is the
+   sum of the high product h and any borrow from l=s-c.  If c<d then we will
+   have r<d too, for the following reasons.  Let q=l*inverse be the quotient
+   limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then
+
+       l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
+
+   But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
+   never have h=d-1 and so r=h+borrow <= d-1.
+
+   When c>=d, on the other hand, h=d-1 can certainly occur together with a
+   borrow, thereby giving only r<=d, as per the function definition above.
+
+   As a design decision it's left to the caller to check for r=d if it might
+   be passing c>=d.  Several applications have c<d initially so the extra
+   test is often unnecessary, for example the GCDs or a plain divisibility
+   d|a test will pass c=0.
+
+
+   The special case for size==1 is so that it can be assumed c<=d in the
+   high<=divisor test at the end.  c<=d is only guaranteed after at least
+   one iteration of the main loop.  There's also a decent chance one % is
+   faster than a binvert_limb, though that will depend on the processor.
+
+   A CPU specific implementation might want to omit the size==1 code or the
+   high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
+   useful.  */
+
+
+mp_limb_t
+mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
+                     mp_limb_t orig_c)
+{
+  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
+  mp_limb_t  c = orig_c;
+  mp_size_t  i;
+
+  ASSERT (size >= 1);
+  ASSERT (d & 1);
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (d);
+  ASSERT_LIMB (c);
+
+  if (size == 1)
+    {
+      s = src[0];
+      if (s > c)
+	{
+	  l = s-c;
+	  h = l % d;
+	  if (h != 0)
+	    h = d - h;
+	}
+      else
+	{
+	  l = c-s;
+	  h = l % d;
+	}
+      return h;
+    }
+
+
+  binvert_limb (inverse, d);
+  dmul = d << GMP_NAIL_BITS;
+
+  i = 0;
+  do
+    {
+      s = src[i];
+      SUBC_LIMB (c, l, s, c);
+      l = (l * inverse) & GMP_NUMB_MASK;
+      umul_ppmm (h, dummy, l, dmul);
+      c += h;
+    }
+  while (++i < size-1);
+
+
+  s = src[i];
+  if (s <= d)
+    {
+      /* With high<=d the final step can be a subtract and addback.  If c==0
+	 then the addback will restore to l>=0.  If c==d then will get l==d
+	 if s==0, but that's ok per the function definition.  */
+
+      l = c - s;
+      if (c < s)
+	l += d;
+
+      ret = l;
+    }
+  else
+    {
+      /* Can't skip a divide, just do the loop code once more. */
+
+      SUBC_LIMB (c, l, s, c);
+      l = (l * inverse) & GMP_NUMB_MASK;
+      umul_ppmm (h, dummy, l, dmul);
+      c += h;
+      ret = c;
+    }
+
+  ASSERT (orig_c < d ? ret < d : ret <= d);
+  return ret;
+}
+
+
+
+#if 0
+
+/* The following is an alternate form that might shave one cycle on a
+   superscalar processor since it takes c+=h off the dependent chain,
+   leaving just a low product, high product, and a subtract.
+
+   This is for CPU specific implementations to consider.  A special case for
+   high<divisor and/or size==1 can be added if desired.
+
+   Notice that c is only ever 0 or 1, since if s-c produces a borrow then
+   x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
+   c=(x==0xFF..FF) too, if that helped.  */
+
+mp_limb_t
+mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
+{
+  mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
+  mp_limb_t  c = 0;
+  mp_size_t  i;
+
+  ASSERT (size >= 1);
+  ASSERT (d & 1);
+
+  binvert_limb (inverse, d);
+  dmul = d << GMP_NAIL_BITS;
+
+  for (i = 0; i < size; i++)
+    {
+      ASSERT (c==0 || c==1);
+
+      s = src[i];
+      SUBC_LIMB (c1, x, s, c);
+
+      SUBC_LIMB (c2, y, x, h);
+      c = c1 + c2;
+
+      y = (y * inverse) & GMP_NUMB_MASK;
+      umul_ppmm (h, dummy, y, dmul);
+    }
+
+  h += c;
+  return h;
+}
+
+#endif
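+
+/* Editorial sketch, not part of the GMP sources: the r==0 case of the
+   definition above gives a divisibility test for odd d, since r*B^k == a-c
+   (mod d) and gcd(B,d)=1 imply r==0 exactly when d divides a-c.  A minimal
+   wrapper, assuming gmp-impl.h is available (the helper name is
+   hypothetical):  */
+#if 0
+static int
+divisible_by_odd_limb (mp_srcptr ap, mp_size_t n, mp_limb_t d)
+{
+  ASSERT (d & 1);
+  /* c=0 < d, so the result is canonical and zero iff d divides {ap,n}.  */
+  return mpn_modexact_1c_odd (ap, n, d, 0) == 0;
+}
+#endif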
diff --git a/third_party/gmp/mpn/generic/mu_bdiv_q.c b/third_party/gmp/mpn/generic/mu_bdiv_q.c
new file mode 100644
index 0000000..0ef3bd8
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mu_bdiv_q.c
@@ -0,0 +1,281 @@
+/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn,
+   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
+   overlap disallowed.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+#include "gmp-impl.h"
+
+
+/* N = {np,nn}
+   D = {dp,dn}
+
+   Requirements: N >= D
+		 D >= 1
+		 D odd
+		 dn >= 2
+		 nn >= 2
+		 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
+
+   Write quotient to Q = {qp,nn}.
+
+   FIXME: When iterating, perhaps do the small step before loop, not after.
+   FIXME: Try to avoid the scalar divisions when computing inverse size.
+   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
+	  particular, when dn==in, tp and rp could use the same space.
+   FIXME: Trim final quotient calculation to qn limbs of precision.
+*/
+static void
+mpn_mu_bdiv_q_old (mp_ptr qp,
+	       mp_srcptr np, mp_size_t nn,
+	       mp_srcptr dp, mp_size_t dn,
+	       mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_size_t in;
+  int cy, c0;
+  mp_size_t tn, wn;
+
+  qn = nn;
+
+  ASSERT (dn >= 2);
+  ASSERT (qn >= 2);
+
+  if (qn > dn)
+    {
+      mp_size_t b;
+
+      /* |_______________________|   dividend
+			|________|   divisor  */
+
+#define ip           scratch			/* in */
+#define rp           (scratch + in)		/* dn or rest >= binvert_itch(in) */
+#define tp           (scratch + in + dn)	/* dn+in or next_size(dn) */
+#define scratch_out  (scratch + in + dn + tn)	/* mulmod_bnm1_itch(next_size(dn)) */
+
+      /* Compute an inverse size that is a nice partition of the quotient.  */
+      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+
+      /* Some notes on allocation:
+
+	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
+	 limbs of R die at that point.  We could save memory by letting T live
+	 just under R and letting the upper part of T expand into R.  These
+	 changes should reduce the itch to perhaps 3dn.
+       */
+
+      mpn_binvert (ip, dp, in, rp);
+
+      cy = 0;
+
+      MPN_COPY (rp, np, dn);
+      np += dn;
+      mpn_mullo_n (qp, rp, ip, in);
+      qn -= in;
+
+      while (qn > in)
+	{
+	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
+	  else
+	    {
+	      tn = mpn_mulmod_bnm1_next_size (dn);
+	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	      wn = dn + in - tn;		/* number of wrapped limbs */
+	      if (wn > 0)
+		{
+		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
+		  mpn_decr_u (tp + wn, c0);
+		}
+	    }
+
+	  qp += in;
+	  if (dn != in)
+	    {
+	      /* Subtract tp[dn-1...in] from partial remainder.  */
+	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
+	      if (cy == 2)
+		{
+		  mpn_incr_u (tp + dn, 1);
+		  cy = 1;
+		}
+	    }
+	  /* Subtract tp[dn+in-1...dn] from dividend.  */
+	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
+	  np += in;
+	  mpn_mullo_n (qp, rp, ip, in);
+	  qn -= in;
+	}
+
+      /* Generate last qn limbs.
+	 FIXME: It should be possible to limit precision here, since qn is
+	 typically somewhat smaller than dn.  No big gains expected.  */
+
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, in);		/* mulhi, need tp[qn+in-1...in] */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	  wn = dn + in - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
+	      mpn_decr_u (tp + wn, c0);
+	    }
+	}
+
+      qp += in;
+      if (dn != in)
+	{
+	  cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
+	  if (cy == 2)
+	    {
+	      mpn_incr_u (tp + dn, 1);
+	      cy = 1;
+	    }
+	}
+
+      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
+      mpn_mullo_n (qp, rp, ip, qn);
+
+#undef ip
+#undef rp
+#undef tp
+#undef scratch_out
+   }
+  else
+    {
+      /* |_______________________|   dividend
+		|________________|   divisor  */
+
+#define ip           scratch		/* in */
+#define tp           (scratch + in)	/* qn+in or next_size(qn) or rest >= binvert_itch(in) */
+#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */
+
+      /* Compute half-sized inverse.  */
+      in = qn - (qn >> 1);
+
+      mpn_binvert (ip, dp, in, tp);
+
+      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */
+
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, qn, qp, in);		/* mulhigh */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (qn);
+	  mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
+	  wn = qn + in - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      c0 = mpn_cmp (tp, np, wn) < 0;
+	      mpn_decr_u (tp + wn, c0);
+	    }
+	}
+
+      mpn_sub_n (tp, np + in, tp + in, qn - in);
+      mpn_mullo_n (qp + in, tp, ip, qn - in);	/* high qn-in quotient limbs */
+
+#undef ip
+#undef tp
+#undef scratch_out
+    }
+}
+
+void
+mpn_mu_bdiv_q (mp_ptr qp,
+	       mp_srcptr np, mp_size_t nn,
+	       mp_srcptr dp, mp_size_t dn,
+	       mp_ptr scratch)
+{
+  mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch);
+  mpn_neg (qp, qp, nn);
+}
+
+mp_size_t
+mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
+{
+  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
+  mp_size_t b;
+
+  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
+
+  qn = nn;
+
+  if (qn > dn)
+    {
+      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	{
+	  tn = dn + in;
+	  itch_out = 0;
+	}
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn);
+	  itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
+	}
+      itches = dn + tn + itch_out;
+    }
+  else
+    {
+      in = qn - (qn >> 1);
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	{
+	  tn = qn + in;
+	  itch_out = 0;
+	}
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (qn);
+	  itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
+	}
+      itches = tn + itch_out;
+    }
+
+  itch_binvert = mpn_binvert_itch (in);
+  return in + MAX (itches, itch_binvert);
+}
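+
+/* Editorial usage sketch, not part of the GMP sources: the itch/scratch
+   protocol for the internal entry point above.  Given the mpn_neg in the
+   wrapper, the returned Q appears to satisfy N + Q*D == 0 (mod B^nn).
+   Requirements per the header: D odd, dn >= 2, nn >= 2.  The helper name is
+   hypothetical and assumes gmp-impl.h.  */
+#if 0
+static void
+bdiv_q_example (mp_ptr qp, mp_srcptr np, mp_size_t nn,
+		mp_srcptr dp, mp_size_t dn)
+{
+  mp_ptr scratch;
+  TMP_DECL;
+  TMP_MARK;
+  scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
+  mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);
+  TMP_FREE;
+}
+#endif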
diff --git a/third_party/gmp/mpn/generic/mu_bdiv_qr.c b/third_party/gmp/mpn/generic/mu_bdiv_qr.c
new file mode 100644
index 0000000..540ad73
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mu_bdiv_qr.c
@@ -0,0 +1,312 @@
+/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn,
+   where qn = nn-dn, storing the result in {qp,qn}.  Overlap allowed between Q
+   and N; all other overlap disallowed.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+#include "gmp-impl.h"
+
+
+/* N = {np,nn}
+   D = {dp,dn}
+
+   Requirements: N >= D
+		 D >= 1
+		 D odd
+		 dn >= 2
+		 nn >= 2
+		 scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn).
+
+   Write quotient to Q = {qp,nn-dn}.
+
+   FIXME: When iterating, perhaps do the small step before loop, not after.
+   FIXME: Try to avoid the scalar divisions when computing inverse size.
+   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
+	  particular, when dn==in, tp and rp could use the same space.
+*/
+static mp_limb_t
+mpn_mu_bdiv_qr_old (mp_ptr qp,
+		    mp_ptr rp,
+		    mp_srcptr np, mp_size_t nn,
+		    mp_srcptr dp, mp_size_t dn,
+		    mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_size_t in;
+  mp_limb_t cy, c0;
+  mp_size_t tn, wn;
+
+  qn = nn - dn;
+
+  ASSERT (dn >= 2);
+  ASSERT (qn >= 2);
+
+  if (qn > dn)
+    {
+      mp_size_t b;
+
+      /* |_______________________|   dividend
+			|________|   divisor  */
+
+#define ip           scratch		/* in */
+#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
+#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
+
+      /* Compute an inverse size that is a nice partition of the quotient.  */
+      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+
+      /* Some notes on allocation:
+
+	 When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
+	 limbs of R die at that point.  We could save memory by letting T live
+	 just under R and letting the upper part of T expand into R.  These
+	 changes should reduce the itch to perhaps 3dn.
+       */
+
+      mpn_binvert (ip, dp, in, tp);
+
+      MPN_COPY (rp, np, dn);
+      np += dn;
+      cy = 0;
+
+      while (qn > in)
+	{
+	  mpn_mullo_n (qp, rp, ip, in);
+
+	  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	    mpn_mul (tp, dp, dn, qp, in);	/* mulhi, need tp[dn+in-1...in] */
+	  else
+	    {
+	      tn = mpn_mulmod_bnm1_next_size (dn);
+	      mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	      wn = dn + in - tn;		/* number of wrapped limbs */
+	      if (wn > 0)
+		{
+		  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
+		  mpn_decr_u (tp + wn, c0);
+		}
+	    }
+
+	  qp += in;
+	  qn -= in;
+
+	  if (dn != in)
+	    {
+	      /* Subtract tp[dn-1...in] from partial remainder.  */
+	      cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
+	      if (cy == 2)
+		{
+		  mpn_incr_u (tp + dn, 1);
+		  cy = 1;
+		}
+	    }
+	  /* Subtract tp[dn+in-1...dn] from dividend.  */
+	  cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
+	  np += in;
+	}
+
+      /* Generate last qn limbs.  */
+      mpn_mullo_n (qp, rp, ip, qn);
+
+      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, qn);		/* mulhi, need tp[qn+in-1...in] */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
+	  wn = dn + qn - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
+	      mpn_decr_u (tp + wn, c0);
+	    }
+	}
+
+      if (dn != qn)
+	{
+	  cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
+	  if (cy == 2)
+	    {
+	      mpn_incr_u (tp + dn, 1);
+	      cy = 1;
+	    }
+	}
+      return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy);
+
+#undef ip
+#undef tp
+#undef scratch_out
+    }
+  else
+    {
+      /* |_______________________|   dividend
+		|________________|   divisor  */
+
+#define ip           scratch		/* in */
+#define tp           (scratch + in)	/* dn+in or next_size(dn) or rest >= binvert_itch(in) */
+#define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */
+
+      /* Compute half-sized inverse.  */
+      in = qn - (qn >> 1);
+
+      mpn_binvert (ip, dp, in, tp);
+
+      mpn_mullo_n (qp, np, ip, in);		/* low `in' quotient limbs */
+
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, in);		/* mulhigh */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	  wn = dn + in - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      c0 = mpn_sub_n (tp + tn, tp, np, wn);
+	      mpn_decr_u (tp + wn, c0);
+	    }
+	}
+
+      qp += in;
+      qn -= in;
+
+      cy = mpn_sub_n (rp, np + in, tp + in, dn);
+      mpn_mullo_n (qp, rp, ip, qn);		/* high qn quotient limbs */
+
+      if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, qn);		/* mulhigh */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out);
+	  wn = dn + qn - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      c0 = mpn_sub_n (tp + tn, tp, rp, wn);
+	      mpn_decr_u (tp + wn, c0);
+	    }
+	}
+
+      cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn);
+      if (cy == 2)
+	{
+	  mpn_incr_u (tp + dn, 1);
+	  cy = 1;
+	}
+      return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy);
+
+#undef ip
+#undef tp
+#undef scratch_out
+    }
+}
+
+mp_limb_t
+mpn_mu_bdiv_qr (mp_ptr qp,
+		mp_ptr rp,
+		mp_srcptr np, mp_size_t nn,
+		mp_srcptr dp, mp_size_t dn,
+		mp_ptr scratch)
+{
+  mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch);
+
+  /* R' B^{qn} = U - Q' D
+   *
+   * Q = B^{qn} - Q' (assuming Q' != 0)
+   *
+   * R B^{qn} = U + Q D = U + B^{qn} D - Q' D
+   *          = B^{qn} D + R'
+   */
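+
+  /* Toy decimal check of the identity (editorial note): with B=10, qn=1,
+     U=47, D=3 the old convention gives Q'=9, R'=2 (47 - 9*3 = 20 = 2*10);
+     negating gives Q = 10-9 = 1 and R = R'+D = 5, and indeed
+     47 + 1*3 = 50 = 5*10.  */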
+
+  if (UNLIKELY (!mpn_neg (qp, qp, nn - dn)))
+    {
+      /* Zero quotient. */
+      ASSERT (cy == 0);
+      return 0;
+    }
+  else
+    {
+      mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn);
+      ASSERT (cy2 >= cy);
+
+      return cy2 - cy;
+    }
+}
+
+
+mp_size_t
+mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn)
+{
+  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
+  mp_size_t b;
+
+  ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD);
+
+  qn = nn - dn;
+
+  if (qn > dn)
+    {
+      b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+      in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+    }
+  else
+    {
+      in = qn - (qn >> 1);
+    }
+
+  if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+    {
+      tn = dn + in;
+      itch_out = 0;
+    }
+  else
+    {
+      tn = mpn_mulmod_bnm1_next_size (dn);
+      itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
+    }
+
+  itch_binvert = mpn_binvert_itch (in);
+  itches = tn + itch_out;
+  return in + MAX (itches, itch_binvert);
+}
diff --git a/third_party/gmp/mpn/generic/mu_div_q.c b/third_party/gmp/mpn/generic/mu_div_q.c
new file mode 100644
index 0000000..44cfb40
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mu_div_q.c
@@ -0,0 +1,184 @@
+/* mpn_mu_div_q.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+/*
+  Things to work on:
+
+  1. This is a rudimentary implementation of mpn_mu_div_q.  The algorithm is
+     probably close to optimal, except when mpn_mu_divappr_q fails.
+
+  2. We used to fall back to mpn_mu_div_qr when we detected a possible
+     mpn_mu_divappr_q rounding problem; now we multiply and compare.
+     Unfortunately, since mpn_mu_divappr_q does not return the partial
+     remainder, this is still not optimal.  An mpn_mu_divappr_qr could
+     solve that.
+
+  3. The allocations done here should be made from the scratch area, which
+     would then need to be amended.
+*/
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+
+mp_limb_t
+mpn_mu_div_q (mp_ptr qp,
+	      mp_srcptr np, mp_size_t nn,
+	      mp_srcptr dp, mp_size_t dn,
+	      mp_ptr scratch)
+{
+  mp_ptr tp, rp;
+  mp_size_t qn;
+  mp_limb_t cy, qh;
+  TMP_DECL;
+
+  TMP_MARK;
+
+  qn = nn - dn;
+
+  tp = TMP_BALLOC_LIMBS (qn + 1);
+
+  if (qn >= dn)			/* nn >= 2*dn */
+    {
+       /* |_______________________|   dividend
+			 |________|   divisor  */
+
+      rp = TMP_BALLOC_LIMBS (nn + 1);
+      MPN_COPY (rp + 1, np, nn);
+      rp[0] = 0;
+
+      qh = mpn_cmp (rp + 1 + nn - dn, dp, dn) >= 0;
+      if (qh != 0)
+	mpn_sub_n (rp + 1 + nn - dn, rp + 1 + nn - dn, dp, dn);
+
+      cy = mpn_mu_divappr_q (tp, rp, nn + 1, dp, dn, scratch);
+
+      if (UNLIKELY (cy != 0))
+	{
+	  /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was
+	     canonically reduced, replace the returned value of B^(qn-dn)+eps
+	     by the largest possible value.  */
+	  mp_size_t i;
+	  for (i = 0; i < qn + 1; i++)
+	    tp[i] = GMP_NUMB_MAX;
+	}
+
+      /* The max error of mpn_mu_divappr_q is +4.  If the low quotient limb is
+	 smaller than the max error, we cannot trust the quotient.  */
+      if (tp[0] > 4)
+	{
+	  MPN_COPY (qp, tp + 1, qn);
+	}
+      else
+	{
+	  mp_limb_t cy;
+	  mp_ptr pp;
+
+	  pp = rp;
+	  mpn_mul (pp, tp + 1, qn, dp, dn);
+
+	  cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0;
+
+	  if (cy || mpn_cmp (pp, np, nn) > 0) /* At most off by one, no loop needed. */
+	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
+	  else /* Same as above */
+	    MPN_COPY (qp, tp + 1, qn);
+	}
+    }
+  else
+    {
+       /* |_______________________|   dividend
+		 |________________|   divisor  */
+
+      /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed
+	 here becomes 2dn, i.e., more than nn.  This shouldn't hurt, since only
+	 the most significant dn-1 limbs will actually be read, but it is not
+	 pretty.  */
+
+      qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2,
+			     dp + dn - (qn + 1), qn + 1, scratch);
+
+      /* The max error of mpn_mu_divappr_q is +4, but we get an additional
+         error from the divisor truncation.  */
+      if (tp[0] > 6)
+	{
+	  MPN_COPY (qp, tp + 1, qn);
+	}
+      else
+	{
+	  mp_limb_t cy;
+
+	  /* FIXME: a shorter product should be enough; we may use already
+	     allocated space... */
+	  rp = TMP_BALLOC_LIMBS (nn);
+	  mpn_mul (rp, dp, dn, tp + 1, qn);
+
+	  cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0;
+
+	  if (cy || mpn_cmp (rp, np, nn) > 0) /* At most off by one, no loop needed. */
+	    qh -= mpn_sub_1 (qp, tp + 1, qn, 1);
+	  else /* Same as above */
+	    MPN_COPY (qp, tp + 1, qn);
+	}
+    }
+
+  TMP_FREE;
+  return qh;
+}
+
+mp_size_t
+mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
+{
+  mp_size_t qn;
+
+  qn = nn - dn;
+  if (qn >= dn)
+    {
+      return mpn_mu_divappr_q_itch (nn + 1, dn, mua_k);
+    }
+  else
+    {
+      return mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k);
+    }
+}
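+
+/* Editorial usage sketch, not part of the GMP sources: driving mpn_mu_div_q
+   through its itch function.  D is presumably normalized (most significant
+   bit of its high limb set), as required by the mpn_mu_divappr_q it calls,
+   and nn > dn.  The helper name is hypothetical and assumes gmp-impl.h.  */
+#if 0
+static mp_limb_t
+div_q_example (mp_ptr qp, mp_srcptr np, mp_size_t nn,
+	       mp_srcptr dp, mp_size_t dn)
+{
+  mp_ptr scratch;
+  mp_limb_t qh;
+  TMP_DECL;
+  TMP_MARK;
+  scratch = TMP_ALLOC_LIMBS (mpn_mu_div_q_itch (nn, dn, 0));
+  qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch);  /* {qp,nn-dn} plus qh */
+  TMP_FREE;
+  return qh;
+}
+#endif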
diff --git a/third_party/gmp/mpn/generic/mu_div_qr.c b/third_party/gmp/mpn/generic/mu_div_qr.c
new file mode 100644
index 0000000..8b9c702
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mu_div_qr.c
@@ -0,0 +1,417 @@
+/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.
+
+   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
+   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
+   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
+   let N be unmodified during the operation.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync.
+
+ Things to work on:
+
+  * This isn't optimal when the quotient isn't needed, as it might take a lot
+    of space.  The computation is always needed, though, so there is no time to
+    save with special code.
+
+  * The itch/scratch scheme is perhaps not as good an idea as it once seemed,
+    demonstrated by the fact that the mpn_invertappr function's scratch needs
+    mean that we need to keep a large allocation long after it is needed.
+    Things are worse as mpn_mul_fft does not accept any scratch parameter,
+    which means we'll have a large memory hole while in mpn_mul_fft.  In
+    general, a peak scratch need in the beginning of a function isn't
+    well-handled by the itch/scratch scheme.
+*/
+
+#ifdef STAT
+#undef STAT
+#define STAT(x) x
+#else
+#define STAT(x)
+#endif
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+
+/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly.  It gives a
+   speedup according to old measurements, but does the decision mechanism
+   really make sense?  It seems like the ratio between dn and qn might be
+   what we really should be checking.  */
+#ifndef MU_DIV_QR_SKEW_THRESHOLD
+#define MU_DIV_QR_SKEW_THRESHOLD 100
+#endif
+
+#ifdef CHECK				/* FIXME: Enable in minithres */
+#undef  MU_DIV_QR_SKEW_THRESHOLD
+#define MU_DIV_QR_SKEW_THRESHOLD 1
+#endif
+
+
+static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
+static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int);
+
+
+mp_limb_t
+mpn_mu_div_qr (mp_ptr qp,
+	       mp_ptr rp,
+	       mp_srcptr np,
+	       mp_size_t nn,
+	       mp_srcptr dp,
+	       mp_size_t dn,
+	       mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy, qh;
+
+  qn = nn - dn;
+  if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn)
+    {
+      /* |______________|_ign_first__|   dividend			  nn
+		|_______|_ign_first__|   divisor			  dn
+
+		|______|	     quotient (prel)			  qn
+
+		 |___________________|   quotient * ignored-divisor-part  dn-1
+      */
+
+      /* Compute a preliminary quotient and a partial remainder by dividing the
+	 most significant limbs of each operand.  */
+      qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
+			   np + nn - (2 * qn + 1), 2 * qn + 1,
+			   dp + dn - (qn + 1), qn + 1,
+			   scratch);
+
+      /* Multiply the quotient by the divisor limbs ignored above.  */
+      if (dn - (qn + 1) > qn)
+	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
+      else
+	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */
+
+      if (qh)
+	cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1));
+      else
+	cy = 0;
+      scratch[dn - 1] = cy;
+
+      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
+      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
+		       rp + nn - (2 * qn + 1),
+		       scratch + nn - (2 * qn + 1),
+		       qn + 1, cy);
+      if (cy)
+	{
+	  qh -= mpn_sub_1 (qp, qp, qn, 1);
+	  mpn_add_n (rp, rp, dp, dn);
+	}
+    }
+  else
+    {
+      qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
+    }
+
+  return qh;
+}
+
+static mp_limb_t
+mpn_mu_div_qr2 (mp_ptr qp,
+		mp_ptr rp,
+		mp_srcptr np,
+		mp_size_t nn,
+		mp_srcptr dp,
+		mp_size_t dn,
+		mp_ptr scratch)
+{
+  mp_size_t qn, in;
+  mp_limb_t cy, qh;
+  mp_ptr ip, tp;
+
+  ASSERT (dn > 1);
+
+  qn = nn - dn;
+
+  /* Compute the inverse size.  */
+  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
+  ASSERT (in <= dn);
+
+#if 1
+  /* This alternative inverse computation method gets slightly more accurate
+     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
+     not adapted (3) mpn_invertappr scratch needs not met.  */
+  ip = scratch;
+  tp = scratch + in + 1;
+
+  /* compute an approximate inverse on (in+1) limbs */
+  if (dn == in)
+    {
+      MPN_COPY (tp + 1, dp, in);
+      tp[0] = 1;
+      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+      MPN_COPY_INCR (ip, ip + 1, in);
+    }
+  else
+    {
+      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
+      if (UNLIKELY (cy != 0))
+	MPN_ZERO (ip, in);
+      else
+	{
+	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+	  MPN_COPY_INCR (ip, ip + 1, in);
+	}
+    }
+#else
+  /* This older inverse computation method gets slightly worse results than the
+     one above.  */
+  ip = scratch;
+  tp = scratch + in;
+
+  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
+     inversion function should do this automatically.  */
+  if (dn == in)
+    {
+      tp[in + 1] = 0;
+      MPN_COPY (tp + in + 2, dp, in);
+      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
+    }
+  else
+    {
+      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
+    }
+  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
+  if (UNLIKELY (cy != 0))
+    MPN_ZERO (tp + 1, in);
+  MPN_COPY (ip, tp + 1, in);
+#endif
+
+  qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);
+
+  return qh;
+}
+
+mp_limb_t
+mpn_preinv_mu_div_qr (mp_ptr qp,
+		      mp_ptr rp,
+		      mp_srcptr np,
+		      mp_size_t nn,
+		      mp_srcptr dp,
+		      mp_size_t dn,
+		      mp_srcptr ip,
+		      mp_size_t in,
+		      mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy, cx, qh;
+  mp_limb_t r;
+  mp_size_t tn, wn;
+
+#define tp           scratch
+#define scratch_out  (scratch + tn)
+
+  qn = nn - dn;
+
+  np += qn;
+  qp += qn;
+
+  qh = mpn_cmp (np, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (rp, np, dp, dn);
+  else
+    MPN_COPY_INCR (rp, np, dn);
+
+  /* if (qn == 0) */			/* The while below handles this case */
+  /*   return qh; */			/* Degenerate use.  Should we allow this? */
+
+  while (qn > 0)
+    {
+      if (qn < in)
+	{
+	  ip += in - qn;
+	  in = qn;
+	}
+      np -= in;
+      qp -= in;
+
+      /* Compute the next block of quotient limbs by multiplying the inverse I
+	 by the upper part of the partial remainder R.  */
+      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
+      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
+      ASSERT_ALWAYS (cy == 0);
+
+      qn -= in;
+
+      /* Compute the product of the quotient block and the divisor D, to be
+	 subtracted from the partial remainder combined with new limbs from the
+	 dividend N.  We only really need the low dn+1 limbs.  */
+
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	  wn = dn + in - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
+	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
+	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
+	      ASSERT_ALWAYS (cx >= cy);
+	      mpn_incr_u (tp, cx - cy);
+	    }
+	}
+
+      r = rp[dn - in] - tp[dn];
+
+      /* Subtract the product from the partial remainder combined with new
+	 limbs from the dividend N, generating a new partial remainder R.  */
+      if (dn != in)
+	{
+	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
+	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
+	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
+	}
+      else
+	{
+	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
+	}
+
+      STAT (int i; int err = 0;
+	    static int errarr[5]; static int err_rec; static int tot);
+
+      /* Check the remainder R and adjust the quotient as needed.  */
+      r -= cy;
+      while (r != 0)
+	{
+	  /* We loop 0 times with about 69% probability, 1 time with about 31%
+	     probability, 2 times with about 0.6% probability, if inverse is
+	     computed as recommended.  */
+	  mpn_incr_u (qp, 1);
+	  cy = mpn_sub_n (rp, rp, dp, dn);
+	  r -= cy;
+	  STAT (err++);
+	}
+      if (mpn_cmp (rp, dp, dn) >= 0)
+	{
+	  /* This is executed with about 76% probability.  */
+	  mpn_incr_u (qp, 1);
+	  cy = mpn_sub_n (rp, rp, dp, dn);
+	  STAT (err++);
+	}
+
+      STAT (
+	    tot++;
+	    errarr[err]++;
+	    if (err > err_rec)
+	      err_rec = err;
+	    if (tot % 0x10000 == 0)
+	      {
+		for (i = 0; i <= err_rec; i++)
+		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
+		printf ("\n");
+	      }
+	    );
+    }
+
+  return qh;
+}
+
+/* In case k=0 (automatic choice), we distinguish 3 cases:
+   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
+   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
+   (c) qn < dn/3:       in = qn
+   In all cases we have in <= dn.
+ */
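+
+/* Worked instances of the three cases (editorial note): qn=10, dn=4 gives
+   b = ceil(10/4) = 3 and in = ceil(10/3) = 4; qn=4, dn=6 (case b) gives
+   in = ceil(4/2) = 2; qn=1, dn=6 (case c) gives in = qn = 1.  */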
+static mp_size_t
+mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
+{
+  mp_size_t in;
+
+  if (k == 0)
+    {
+      mp_size_t b;
+      if (qn > dn)
+	{
+	  /* Compute an inverse size that is a nice partition of the quotient.  */
+	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+	}
+      else if (3 * qn > dn)
+	{
+	  in = (qn - 1) / 2 + 1;	/* b = 2 */
+	}
+      else
+	{
+	  in = (qn - 1) / 1 + 1;	/* b = 1 */
+	}
+    }
+  else
+    {
+      mp_size_t xn;
+      xn = MIN (dn, qn);
+      in = (xn - 1) / k + 1;
+    }
+
+  return in;
+}
+
+mp_size_t
+mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
+{
+  mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k);
+  mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in);
+  mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
+
+  ASSERT (itch_preinv >= itch_invapp);
+  return in + MAX (itch_invapp, itch_preinv);
+}
+
+mp_size_t
+mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in)
+{
+  mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
+  mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
+
+  return itch_local + itch_out;
+}
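+
+/* Editorial usage sketch, not part of the GMP sources: mpn_mu_div_qr
+   develops Q = floor(N/D) in {qp,nn-dn} plus the returned high limb qh, and
+   R = N - Q*D in {rp,dn}, per the header above.  The helper name is
+   hypothetical and assumes gmp-impl.h.  */
+#if 0
+static mp_limb_t
+div_qr_example (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
+		mp_srcptr dp, mp_size_t dn)
+{
+  mp_ptr scratch;
+  mp_limb_t qh;
+  TMP_DECL;
+  TMP_MARK;
+  scratch = TMP_ALLOC_LIMBS (mpn_mu_div_qr_itch (nn, dn, 0));
+  qh = mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);
+  TMP_FREE;
+  return qh;
+}
+#endif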
diff --git a/third_party/gmp/mpn/generic/mu_divappr_q.c b/third_party/gmp/mpn/generic/mu_divappr_q.c
new file mode 100644
index 0000000..c022b4f
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mu_divappr_q.c
@@ -0,0 +1,368 @@
+/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
+
+   Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
+   normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
+   is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
+   to let N be unmodified during the operation.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+   The idea of the algorithm used herein is to compute a smaller inverted value
+   than used in the standard Barrett algorithm, and thus save time in the
+   Newton iterations, and pay just a small price when using the inverted value
+   for developing quotient bits.  This algorithm was presented at ICMS 2006.
+*/
+
+/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
+
+ Things to work on:
+
+  * The itch/scratch scheme is perhaps not as good an idea as it once seemed,
+    demonstrated by the fact that the mpn_invertappr function's scratch needs
+    mean that we need to keep a large allocation long after it is needed.
+    Things are worse as mpn_mul_fft does not accept any scratch parameter,
+    which means we'll have a large memory hole while in mpn_mul_fft.  In
+    general, a peak scratch need in the beginning of a function isn't
+    well-handled by the itch/scratch scheme.
+*/
+
+#ifdef STAT
+#undef STAT
+#define STAT(x) x
+#else
+#define STAT(x)
+#endif
+
+#include <stdlib.h>		/* for NULL */
+#include "gmp-impl.h"
+
+static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
+			 mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
+static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);
+
+mp_limb_t
+mpn_mu_divappr_q (mp_ptr qp,
+		  mp_srcptr np,
+		  mp_size_t nn,
+		  mp_srcptr dp,
+		  mp_size_t dn,
+		  mp_ptr scratch)
+{
+  mp_size_t qn, in;
+  mp_limb_t cy, qh;
+  mp_ptr ip, tp;
+
+  ASSERT (dn > 1);
+
+  qn = nn - dn;
+
+  /* If Q is smaller than D, truncate operands. */
+  if (qn + 1 < dn)
+    {
+      np += dn - (qn + 1);
+      nn -= dn - (qn + 1);
+      dp += dn - (qn + 1);
+      dn = qn + 1;
+    }
+
+  /* Compute the inverse size.  */
+  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
+  ASSERT (in <= dn);
+
+#if 1
+  /* This alternative inverse computation method gets slightly more accurate
+     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
+     not adapted (3) mpn_invertappr scratch needs not met.  */
+  ip = scratch;
+  tp = scratch + in + 1;
+
+  /* compute an approximate inverse on (in+1) limbs */
+  if (dn == in)
+    {
+      MPN_COPY (tp + 1, dp, in);
+      tp[0] = 1;
+      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+      MPN_COPY_INCR (ip, ip + 1, in);
+    }
+  else
+    {
+      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
+      if (UNLIKELY (cy != 0))
+	MPN_ZERO (ip, in);
+      else
+	{
+	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
+	  MPN_COPY_INCR (ip, ip + 1, in);
+	}
+    }
+#else
+  /* This older inverse computation method gets slightly worse results than the
+     one above.  */
+  ip = scratch;
+  tp = scratch + in;
+
+  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
+     inversion function should do this automatically.  */
+  if (dn == in)
+    {
+      tp[in + 1] = 0;
+      MPN_COPY (tp + in + 2, dp, in);
+      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
+    }
+  else
+    {
+      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
+    }
+  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
+  if (UNLIKELY (cy != 0))
+    MPN_ZERO (tp + 1, in);
+  MPN_COPY (ip, tp + 1, in);
+#endif
+
+  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);
+
+  return qh;
+}
+
+static mp_limb_t
+mpn_preinv_mu_divappr_q (mp_ptr qp,
+			 mp_srcptr np,
+			 mp_size_t nn,
+			 mp_srcptr dp,
+			 mp_size_t dn,
+			 mp_srcptr ip,
+			 mp_size_t in,
+			 mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy, cx, qh;
+  mp_limb_t r;
+  mp_size_t tn, wn;
+
+#define rp           scratch
+#define tp           (scratch + dn)
+#define scratch_out  (scratch + dn + tn)
+
+  qn = nn - dn;
+
+  np += qn;
+  qp += qn;
+
+  qh = mpn_cmp (np, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (rp, np, dp, dn);
+  else
+    MPN_COPY (rp, np, dn);
+
+  if (qn == 0)
+    return qh;			/* Degenerate use.  Should we allow this? */
+
+  while (qn > 0)
+    {
+      if (qn < in)
+	{
+	  ip += in - qn;
+	  in = qn;
+	}
+      np -= in;
+      qp -= in;
+
+      /* Compute the next block of quotient limbs by multiplying the inverse I
+	 by the upper part of the partial remainder R.  */
+      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
+      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
+      ASSERT_ALWAYS (cy == 0);
+
+      qn -= in;
+      if (qn == 0)
+	break;
+
+      /* Compute the product of the quotient block and the divisor D, to be
+	 subtracted from the partial remainder combined with new limbs from the
+	 dividend N.  We only really need the low dn limbs.  */
+
+      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
+	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
+      else
+	{
+	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
+	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
+	  wn = dn + in - tn;			/* number of wrapped limbs */
+	  if (wn > 0)
+	    {
+	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
+	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
+	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
+	      ASSERT_ALWAYS (cx >= cy);
+	      mpn_incr_u (tp, cx - cy);
+	    }
+	}
+
+      r = rp[dn - in] - tp[dn];
+
+      /* Subtract the product from the partial remainder combined with new
+	 limbs from the dividend N, generating a new partial remainder R.  */
+      if (dn != in)
+	{
+	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
+	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
+	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
+	}
+      else
+	{
+	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
+	}
+
+      STAT (int i; int err = 0;
+	    static int errarr[5]; static int err_rec; static int tot);
+
+      /* Check the remainder R and adjust the quotient as needed.  */
+      r -= cy;
+      while (r != 0)
+	{
+	  /* We loop 0 times with about 69% probability, 1 time with about 31%
+	     probability, 2 times with about 0.6% probability, if inverse is
+	     computed as recommended.  */
+	  mpn_incr_u (qp, 1);
+	  cy = mpn_sub_n (rp, rp, dp, dn);
+	  r -= cy;
+	  STAT (err++);
+	}
+      if (mpn_cmp (rp, dp, dn) >= 0)
+	{
+	  /* This is executed with about 76% probability.  */
+	  mpn_incr_u (qp, 1);
+	  cy = mpn_sub_n (rp, rp, dp, dn);
+	  STAT (err++);
+	}
+
+      STAT (
+	    tot++;
+	    errarr[err]++;
+	    if (err > err_rec)
+	      err_rec = err;
+	    if (tot % 0x10000 == 0)
+	      {
+		for (i = 0; i <= err_rec; i++)
+		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
+		printf ("\n");
+	      }
+	    );
+    }
+
+  /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
+     quotient.  For now, just make sure the returned quotient is >= the real
+     quotient; add 3 with saturating arithmetic.  */
+  qn = nn - dn;
+  cy += mpn_add_1 (qp, qp, qn, 3);
+  if (cy != 0)
+    {
+      if (qh != 0)
+	{
+	  /* Return a quotient of just 1-bits, with qh set.  */
+	  mp_size_t i;
+	  for (i = 0; i < qn; i++)
+	    qp[i] = GMP_NUMB_MAX;
+	}
+      else
+	{
+	  /* Propagate carry into qh.  */
+	  qh = 1;
+	}
+    }
+
+  return qh;
+}
+
+/* In case k=0 (automatic choice), we distinguish 3 cases:
+   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
+   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
+   (c) qn < dn/3:       in = qn
+   In all cases we have in <= dn.
+ */
+static mp_size_t
+mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
+{
+  mp_size_t in;
+
+  if (k == 0)
+    {
+      mp_size_t b;
+      if (qn > dn)
+	{
+	  /* Compute an inverse size that is a nice partition of the quotient.  */
+	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
+	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+	}
+      else if (3 * qn > dn)
+	{
+	  in = (qn - 1) / 2 + 1;	/* b = 2 */
+	}
+      else
+	{
+	  in = (qn - 1) / 1 + 1;	/* b = 1 */
+	}
+    }
+  else
+    {
+      mp_size_t xn;
+      xn = MIN (dn, qn);
+      in = (xn - 1) / k + 1;
+    }
+
+  return in;
+}
+
+mp_size_t
+mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
+{
+  mp_size_t qn, in, itch_local, itch_out, itch_invapp;
+
+  qn = nn - dn;
+  if (qn + 1 < dn)
+    {
+      dn = qn + 1;
+    }
+  in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
+
+  itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
+  itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
+  itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
+
+  ASSERT (dn + itch_local + itch_out >= itch_invapp);
+  return in + MAX (dn + itch_local + itch_out, itch_invapp);
+}
diff --git a/third_party/gmp/mpn/generic/mul.c b/third_party/gmp/mpn/generic/mul.c
new file mode 100644
index 0000000..37444e9
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mul.c
@@ -0,0 +1,441 @@
+/* mpn_mul -- Multiply two natural numbers.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
+2014, 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+#ifndef MUL_BASECASE_MAX_UN
+#define MUL_BASECASE_MAX_UN 500
+#endif
+
+/* Areas where the different toom algorithms can be called (extracted
+   from the t-toom*.c files, and ignoring small constant offsets):
+
+   1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
+                                        4/7              6/7
+				       6/11
+                                       |--------------------| toom22 (small)
+                                                           || toom22 (large)
+                                                       |xxxx| toom22 called
+                      |-------------------------------------| toom32
+                                         |xxxxxxxxxxxxxxxx| | toom32 called
+                                               |------------| toom33
+                                                          |x| toom33 called
+             |---------------------------------|            | toom42
+	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
+                                       |--------------------| toom43
+                                               |xxxxxxxxxx|   toom43 called
+         |-----------------------------|                      toom52 (unused)
+                                                   |--------| toom44
+						   |xxxxxxxx| toom44 called
+                              |--------------------|        | toom53
+                                        |xxxxxx|              toom53 called
+    |-------------------------|                               toom62 (unused)
+                                           |----------------| toom54 (unused)
+                      |--------------------|                  toom63
+	                      |xxxxxxxxx|                   | toom63 called
+                          |---------------------------------| toom6h
+						   |xxxxxxxx| toom6h called
+                                  |-------------------------| toom8h (32 bit)
+                 |------------------------------------------| toom8h (64 bit)
+						   |xxxxxxxx| toom8h called
+*/
+
+#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
+#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
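+
+/* Editorial note: TOOM33_OK is the balance test 6 + 2*an < 3*bn, i.e.
+   roughly an/bn < 3/2; for instance an=100, bn=70 passes (206 < 210) while
+   an=100, bn=65 fails (206 >= 195).  TOOM44_OK is the analogous
+   an/bn < 4/3 test.  */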
+
+/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
+   (pointed to by VP, with VN limbs), and store the result at PRODP.  The
+   result is UN + VN limbs.  Return the most significant limb of the result.
+
+   NOTE: The space pointed to by PRODP is overwritten before finished with U
+   and V, so overlap is an error.
+
+   Argument constraints:
+   1. UN >= VN.
+   2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
+      the multiplier and the multiplicand.  */
+
+/*
+  * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
+    ideal lines of the surrounding algorithms.  Is that optimal?
+
+  * The toomX3 code now uses a structure similar to the one of toomX2, except
+    that it loops longer in the unbalanced case.  The result is that the
+    remaining area might have un < vn.  Should we fix the toomX2 code in a
+    similar way?
+
+  * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
+    therefore calls mpn_mul recursively for certain cases.
+
+  * Allocate static temp space using THRESHOLD variables (except for toom44
+    when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
+
+  * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
+    have the same vn threshold.  This is not true; we should actually use
+    mul_basecase for slightly larger operands for toom32 than for toom22, and
+    even larger for toom42.
+
+  * That problem is even more prevalent for toomX3.  We therefore use special
+    THRESHOLD variables there.
+*/
+
+mp_limb_t
+mpn_mul (mp_ptr prodp,
+	 mp_srcptr up, mp_size_t un,
+	 mp_srcptr vp, mp_size_t vn)
+{
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
+  ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
+
+  if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
+    {
+      /* When un (and thus vn) is below the toom22 range, do mul_basecase.
+	 Test un and not vn here so as not to thwart the un >> vn code below.
+	 This special case is not necessary, but cuts the overhead for the
+	 smallest operands. */
+      mpn_mul_basecase (prodp, up, un, vp, vn);
+    }
+  else if (un == vn)
+    {
+      mpn_mul_n (prodp, up, vp, un);
+    }
+  else if (vn < MUL_TOOM22_THRESHOLD)
+    { /* plain schoolbook multiplication */
+
+      /* Unless un is very large, or we have an applicable mpn_mul_N,
+	 perform basecase multiply directly.  */
+      if (un <= MUL_BASECASE_MAX_UN
+#if HAVE_NATIVE_mpn_mul_2
+	  || vn <= 2
+#else
+	  || vn == 1
+#endif
+	  )
+	mpn_mul_basecase (prodp, up, un, vp, vn);
+      else
+	{
+	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
+	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
+	     these pieces with the vp[] operand.  After each such partial
+	     multiplication (but the last) we copy the most significant vn
+	     limbs into a temporary buffer since that part would otherwise be
+	     overwritten by the next multiplication.  After the next
+	     multiplication, we add it back.  This illustrates the situation:
+
+                                                    -->vn<--
+                                                      |  |<------- un ------->|
+                                                         _____________________|
+                                                        X                    /|
+                                                      /XX__________________/  |
+                                    _____________________                     |
+                                   X                    /                     |
+                                 /XX__________________/                       |
+               _____________________                                          |
+              /                    /                                          |
+            /____________________/                                            |
+	    ==================================================================
+
+	    The parts marked with X are the parts whose sums are copied into
+	    the temporary buffer.  */
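+
+	  /* Editorial example: with MUL_BASECASE_MAX_UN = 500, un = 1300 and
+	     vn = 100, the code below does 500-, 500- and 300-limb basecase
+	     chunks, each time saving and adding back a 100-limb overlap.  */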
+
+	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
+	  mp_limb_t cy;
+	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
+
+	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
+	  prodp += MUL_BASECASE_MAX_UN;
+	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
+	  up += MUL_BASECASE_MAX_UN;
+	  un -= MUL_BASECASE_MAX_UN;
+	  while (un > MUL_BASECASE_MAX_UN)
+	    {
+	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
+	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
+	      mpn_incr_u (prodp + vn, cy);
+	      prodp += MUL_BASECASE_MAX_UN;
+	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
+	      up += MUL_BASECASE_MAX_UN;
+	      un -= MUL_BASECASE_MAX_UN;
+	    }
+	  if (un > vn)
+	    {
+	      mpn_mul_basecase (prodp, up, un, vp, vn);
+	    }
+	  else
+	    {
+	      ASSERT (un > 0);
+	      mpn_mul_basecase (prodp, vp, vn, up, un);
+	    }
+	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
+	  mpn_incr_u (prodp + vn, cy);
+	}
+    }
+  else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
+    {
+      /* Use ToomX2 variants */
+      mp_ptr scratch;
+      TMP_SDECL; TMP_SMARK;
+
+#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
+      scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
+      ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
+      ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
+      ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
+#undef ITCH_TOOMX2
+
+      /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
+	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
+	 wise; we would get better balance by slightly moving the bound.  We
+	 will sometimes end up with un < vn, like in the X3 arm below.  */
+      if (un >= 3 * vn)
+	{
+	  mp_limb_t cy;
+	  mp_ptr ws;
+
+	  /* The maximum ws usage is for the mpn_mul result.  */
+	  ws = TMP_SALLOC_LIMBS (4 * vn);
+
+	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
+	  un -= 2 * vn;
+	  up += 2 * vn;
+	  prodp += 2 * vn;
+
+	  while (un >= 3 * vn)
+	    {
+	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
+	      un -= 2 * vn;
+	      up += 2 * vn;
+	      cy = mpn_add_n (prodp, prodp, ws, vn);
+	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
+	      mpn_incr_u (prodp + vn, cy);
+	      prodp += 2 * vn;
+	    }
+
+	  /* vn <= un < 3vn */
+
+	  if (4 * un < 5 * vn)
+	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
+	  else if (4 * un < 7 * vn)
+	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
+	  else
+	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
+
+	  cy = mpn_add_n (prodp, prodp, ws, vn);
+	  MPN_COPY (prodp + vn, ws + vn, un);
+	  mpn_incr_u (prodp + vn, cy);
+	}
+      else
+	{
+	  if (4 * un < 5 * vn)
+	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
+	  else if (4 * un < 7 * vn)
+	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+	  else
+	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+	}
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
+	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
+    {
+      /* Handle the largest operands that are not in the FFT range.  The 2nd
+	 condition makes very unbalanced operands avoid the FFT code (except
+	 perhaps as coefficient products of the Toom code).  */
+
+      if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
+	{
+	  /* Use ToomX3 variants */
+	  mp_ptr scratch;
+	  TMP_DECL; TMP_MARK;
+
+#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
+	  scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
+	  ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
+	  ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
+	  ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
+	  ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
+	  ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
+	  ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
+#undef ITCH_TOOMX3
+
+	  if (2 * un >= 5 * vn)
+	    {
+	      mp_limb_t cy;
+	      mp_ptr ws;
+
+	      /* The maximum ws usage is for the mpn_mul result.  */
+	      ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
+
+	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
+	      else
+		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
+	      un -= 2 * vn;
+	      up += 2 * vn;
+	      prodp += 2 * vn;
+
+	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
+		{
+		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
+		  else
+		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
+		  un -= 2 * vn;
+		  up += 2 * vn;
+		  cy = mpn_add_n (prodp, prodp, ws, vn);
+		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
+		  mpn_incr_u (prodp + vn, cy);
+		  prodp += 2 * vn;
+		}
+
+	      /* vn / 2 <= un < 2.5vn */
+
+	      if (un < vn)
+		mpn_mul (ws, vp, vn, up, un);
+	      else
+		mpn_mul (ws, up, un, vp, vn);
+
+	      cy = mpn_add_n (prodp, prodp, ws, vn);
+	      MPN_COPY (prodp + vn, ws + vn, un);
+	      mpn_incr_u (prodp + vn, cy);
+	    }
+	  else
+	    {
+	      if (6 * un < 7 * vn)
+		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
+	      else if (2 * un < 3 * vn)
+		{
+		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
+		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+		  else
+		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
+		}
+	      else if (6 * un < 11 * vn)
+		{
+		  if (4 * un < 7 * vn)
+		    {
+		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
+			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
+		      else
+			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
+		    }
+		  else
+		    {
+		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
+			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+		      else
+			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
+		    }
+		}
+	      else
+		{
+		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
+		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
+		  else
+		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
+		}
+	    }
+	  TMP_FREE;
+	}
+      else
+	{
+	  mp_ptr scratch;
+	  TMP_DECL; TMP_MARK;
+
+	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
+	    {
+	      scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
+	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
+	    }
+	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
+	    {
+	      scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
+	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
+	    }
+	  else
+	    {
+	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
+	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
+	    }
+	  TMP_FREE;
+	}
+    }
+  else
+    {
+      if (un >= 8 * vn)
+	{
+	  mp_limb_t cy;
+	  mp_ptr ws;
+	  TMP_DECL; TMP_MARK;
+
+	  /* The maximum ws usage is for the mpn_mul result.  */
+	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
+
+	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
+	  un -= 3 * vn;
+	  up += 3 * vn;
+	  prodp += 3 * vn;
+
+	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
+	    {
+	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
+	      un -= 3 * vn;
+	      up += 3 * vn;
+	      cy = mpn_add_n (prodp, prodp, ws, vn);
+	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
+	      mpn_incr_u (prodp + vn, cy);
+	      prodp += 3 * vn;
+	    }
+
+	  /* vn / 2 <= un < 3.5vn */
+
+	  if (un < vn)
+	    mpn_mul (ws, vp, vn, up, un);
+	  else
+	    mpn_mul (ws, up, un, vp, vn);
+
+	  cy = mpn_add_n (prodp, prodp, ws, vn);
+	  MPN_COPY (prodp + vn, ws + vn, un);
+	  mpn_incr_u (prodp + vn, cy);
+
+	  TMP_FREE;
+	}
+      else
+	mpn_fft_mul (prodp, up, un, vp, vn);
+    }
+
+  return prodp[un + vn - 1];	/* historic */
+}
diff --git a/third_party/gmp/mpn/generic/mul_1.c b/third_party/gmp/mpn/generic/mul_1.c
new file mode 100644
index 0000000..52d46da
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mul_1.c
@@ -0,0 +1,96 @@
+/* mpn_mul_1 -- Multiply a limb vector with a single limb and store the
+   product in a second limb vector.
+
+Copyright 1991-1994, 1996, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+{
+  mp_limb_t ul, cl, hpl, lpl;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+
+  cl = 0;
+  do
+    {
+      ul = *up++;
+      umul_ppmm (hpl, lpl, ul, vl);
+
+      lpl += cl;
+      cl = (lpl < cl) + hpl;
+
+      *rp++ = lpl;
+    }
+  while (--n != 0);
+
+  return cl;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 1
+
+mp_limb_t
+mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
+{
+  mp_limb_t shifted_vl, ul, lpl, hpl, prev_hpl, xw, cl, xl;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (vl);
+
+  shifted_vl = vl << GMP_NAIL_BITS;
+  cl = 0;
+  prev_hpl = 0;
+  do
+    {
+      ul = *up++;
+
+      umul_ppmm (hpl, lpl, ul, shifted_vl);
+      lpl >>= GMP_NAIL_BITS;
+      xw = prev_hpl + lpl + cl;
+      cl = xw >> GMP_NUMB_BITS;
+      xl = xw & GMP_NUMB_MASK;
+      *rp++ = xl;
+      prev_hpl = hpl;
+    }
+  while (--n != 0);
+
+  return prev_hpl + cl;
+}
+
+#endif
diff --git a/third_party/gmp/mpn/generic/mul_basecase.c b/third_party/gmp/mpn/generic/mul_basecase.c
new file mode 100644
index 0000000..2487fba
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mul_basecase.c
@@ -0,0 +1,165 @@
+/* mpn_mul_basecase -- Internal routine to multiply two natural numbers
+   of length m and n.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 1991-1994, 1996, 1997, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Multiply {up,usize} by {vp,vsize} and write the result to
+   {prodp,usize+vsize}.  Must have usize>=vsize.
+
+   Note that prodp gets usize+vsize limbs stored, even if the actual result
+   only needs usize+vsize-1.
+
+   There's no good reason to call here with vsize>=MUL_TOOM22_THRESHOLD.
+   Currently this is allowed, but it might not be in the future.
+
+   This is the most critical code for multiplication.  All multiplies rely
+   on this, both small and huge.  Small ones arrive here immediately, huge
+   ones arrive here as this is the base case for Karatsuba's recursive
+   algorithm.  */
+
+void
+mpn_mul_basecase (mp_ptr rp,
+		  mp_srcptr up, mp_size_t un,
+		  mp_srcptr vp, mp_size_t vn)
+{
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un));
+  ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn));
+
+  /* We first multiply by the low order limb (or, depending on optional
+     function availability, limbs).  This result can be stored, rather than
+     added, to rp, which also saves an initial zeroing loop.  */
+
+#if HAVE_NATIVE_mpn_mul_2
+  if (vn >= 2)
+    {
+      rp[un + 1] = mpn_mul_2 (rp, up, un, vp);
+      rp += 2, vp += 2, vn -= 2;
+    }
+  else
+    {
+      rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+      return;
+    }
+#else
+  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
+  rp += 1, vp += 1, vn -= 1;
+#endif
+
+  /* Now accumulate the product of up[] and the next higher limb (or depending
+     on optional function availability, limbs) from vp[].  */
+
+#define MAX_LEFT MP_SIZE_T_MAX	/* Used to simplify loops into if statements */
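+/* After each addmul_K block below, MAX_LEFT is redefined to K-1.  It bounds
+   how many vp[] limbs can still remain at that point, which lets the
+   compiler turn the subsequent while loops into plain ifs.  */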
+
+
+#if HAVE_NATIVE_mpn_addmul_6
+  while (vn >= 6)
+    {
+      rp[un + 6 - 1] = mpn_addmul_6 (rp, up, un, vp);
+      if (MAX_LEFT == 6)
+	return;
+      rp += 6, vp += 6, vn -= 6;
+      if (MAX_LEFT < 2 * 6)
+	break;
+    }
+#undef MAX_LEFT
+#define MAX_LEFT (6 - 1)
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_5
+  while (vn >= 5)
+    {
+      rp[un + 5 - 1] = mpn_addmul_5 (rp, up, un, vp);
+      if (MAX_LEFT == 5)
+	return;
+      rp += 5, vp += 5, vn -= 5;
+      if (MAX_LEFT < 2 * 5)
+	break;
+    }
+#undef MAX_LEFT
+#define MAX_LEFT (5 - 1)
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_4
+  while (vn >= 4)
+    {
+      rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp);
+      if (MAX_LEFT == 4)
+	return;
+      rp += 4, vp += 4, vn -= 4;
+      if (MAX_LEFT < 2 * 4)
+	break;
+    }
+#undef MAX_LEFT
+#define MAX_LEFT (4 - 1)
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_3
+  while (vn >= 3)
+    {
+      rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp);
+      if (MAX_LEFT == 3)
+	return;
+      rp += 3, vp += 3, vn -= 3;
+      if (MAX_LEFT < 2 * 3)
+	break;
+    }
+#undef MAX_LEFT
+#define MAX_LEFT (3 - 1)
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_2
+  while (vn >= 2)
+    {
+      rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp);
+      if (MAX_LEFT == 2)
+	return;
+      rp += 2, vp += 2, vn -= 2;
+      if (MAX_LEFT < 2 * 2)
+	break;
+    }
+#undef MAX_LEFT
+#define MAX_LEFT (2 - 1)
+#endif
+
+  while (vn >= 1)
+    {
+      rp[un] = mpn_addmul_1 (rp, up, un, vp[0]);
+      if (MAX_LEFT == 1)
+	return;
+      rp += 1, vp += 1, vn -= 1;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/mul_fft.c b/third_party/gmp/mpn/generic/mul_fft.c
new file mode 100644
index 0000000..df8ee63
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mul_fft.c
@@ -0,0 +1,1041 @@
+/* Schoenhage's fast multiplication modulo 2^N+1.
+
+   Contributed by Paul Zimmermann.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 1998-2010, 2012, 2013, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* References:
+
+   Schnelle Multiplikation grosser Zahlen, by Arnold Schoenhage and Volker
+   Strassen, Computing 7, p. 281-292, 1971.
+
+   Asymptotically fast algorithms for the numerical multiplication and division
+   of polynomials with complex coefficients, by Arnold Schoenhage, Computer
+   Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982.
+
+   Tapes versus Pointers, a study in implementing fast algorithms, by Arnold
+   Schoenhage, Bulletin of the EATCS, 30, p. 23-32, 1986.
+
+   TODO:
+
+   Implement some of the tricks published at ISSAC'2007 by Gaudry, Kruppa, and
+   Zimmermann.
+
+   It might be possible to avoid a small number of MPN_COPYs by using a
+   rotating temporary or two.
+
+   Cleanup and simplify the code!
+*/
+
+#ifdef TRACE
+#undef TRACE
+#define TRACE(x) x
+#include <stdio.h>
+#else
+#define TRACE(x)
+#endif
+
+#include "gmp-impl.h"
+
+#ifdef WANT_ADDSUB
+#include "generic/add_n_sub_n.c"
+#define HAVE_NATIVE_mpn_add_n_sub_n 1
+#endif
+
+static mp_limb_t mpn_mul_fft_internal (mp_ptr, mp_size_t, int, mp_ptr *,
+				       mp_ptr *, mp_ptr, mp_ptr, mp_size_t,
+				       mp_size_t, mp_size_t, int **, mp_ptr, int);
+static void mpn_mul_fft_decompose (mp_ptr, mp_ptr *, mp_size_t, mp_size_t, mp_srcptr,
+				   mp_size_t, mp_size_t, mp_size_t, mp_ptr);
+
+
+/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n.
+   We have sqr=0 for a multiply, sqr=1 for a square.
+   There are three generations of this code; we keep the old ones as long as
+   some gmp-mparam.h files have not been updated.  */
+
+
+/*****************************************************************************/
+
+#if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3))
+
+#ifndef FFT_TABLE3_SIZE		/* When tuning this is defined in gmp-impl.h */
+#if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE)
+#if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE
+#define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE
+#else
+#define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE
+#endif
+#endif
+#endif
+
+#ifndef FFT_TABLE3_SIZE
+#define FFT_TABLE3_SIZE 200
+#endif
+
+FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] =
+{
+  MUL_FFT_TABLE3,
+  SQR_FFT_TABLE3
+};
+
+int
+mpn_fft_best_k (mp_size_t n, int sqr)
+{
+  const struct fft_table_nk *fft_tab, *tab;
+  mp_size_t tab_n, thres;
+  int last_k;
+
+  fft_tab = mpn_fft_table3[sqr];
+  last_k = fft_tab->k;
+  for (tab = fft_tab + 1; ; tab++)
+    {
+      tab_n = tab->n;
+      thres = tab_n << last_k;
+      if (n <= thres)
+	break;
+      last_k = tab->k;
+    }
+  return last_k;
+}
+
+#define MPN_FFT_BEST_READY 1
+#endif
+
+/*****************************************************************************/
+
+#if ! defined (MPN_FFT_BEST_READY)
+FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] =
+{
+  MUL_FFT_TABLE,
+  SQR_FFT_TABLE
+};
+
+int
+mpn_fft_best_k (mp_size_t n, int sqr)
+{
+  int i;
+
+  for (i = 0; mpn_fft_table[sqr][i] != 0; i++)
+    if (n < mpn_fft_table[sqr][i])
+      return i + FFT_FIRST_K;
+
+  /* treat 4*last as one further entry */
+  if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1])
+    return i + FFT_FIRST_K;
+  else
+    return i + FFT_FIRST_K + 1;
+}
+#endif
+
+/*****************************************************************************/
+
+
+/* Returns the smallest possible number of limbs >= pl for an FFT of size 2^k,
+   i.e. smallest multiple of 2^k >= pl.
+
+   Don't declare static: needed by tuneup.
+*/
+
+mp_size_t
+mpn_fft_next_size (mp_size_t pl, int k)
+{
+  pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */
+  return pl << k;
+}
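+
+/* For example, mpn_fft_next_size (1000, 4) computes 1 + ((1000 - 1) >> 4)
+   = 63 and returns 63 << 4 = 1008, the smallest multiple of 2^4 that is
+   >= 1000.  */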
+
+
+/* Initialize l[i][j] with bitrev(j) */
+static void
+mpn_fft_initl (int **l, int k)
+{
+  int i, j, K;
+  int *li;
+
+  l[0][0] = 0;
+  for (i = 1, K = 1; i <= k; i++, K *= 2)
+    {
+      li = l[i];
+      for (j = 0; j < K; j++)
+	{
+	  li[j] = 2 * l[i - 1][j];
+	  li[K + j] = 1 + li[j];
+	}
+    }
+}
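+
+/* For example, with k = 2 this fills l[1] = {0, 1} and l[2] = {0, 2, 1, 3},
+   the 2-bit bit-reversals of 0, 1, 2, 3.  */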
+
+
+/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1}
+   Assumes a is semi-normalized, i.e. a[n] <= 1.
+   r and a must have n+1 limbs, and not overlap.
+*/
+static void
+mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n)
+{
+  unsigned int sh;
+  mp_size_t m;
+  mp_limb_t cc, rd;
+
+  sh = d % GMP_NUMB_BITS;
+  m = d / GMP_NUMB_BITS;
+
+  if (m >= n)			/* negate */
+    {
+      /* r[0..m-1]  <-- lshift(a[n-m]..a[n-1], sh)
+	 r[m..n-1]  <-- -lshift(a[0]..a[n-m-1],  sh) */
+
+      m -= n;
+      if (sh != 0)
+	{
+	  /* no out shift below since a[n] <= 1 */
+	  mpn_lshift (r, a + n - m, m + 1, sh);
+	  rd = r[m];
+	  cc = mpn_lshiftc (r + m, a, n - m, sh);
+	}
+      else
+	{
+	  MPN_COPY (r, a + n - m, m);
+	  rd = a[n];
+	  mpn_com (r + m, a, n - m);
+	  cc = 0;
+	}
+
+      /* add cc to r[0], and add rd to r[m] */
+
+      /* now add 1 in r[m], subtract 1 in r[n], i.e. add 1 in r[0] */
+
+      r[n] = 0;
+      /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */
+      cc++;
+      mpn_incr_u (r, cc);
+
+      rd++;
+      /* rd might overflow when sh=GMP_NUMB_BITS-1 */
+      cc = (rd == 0) ? 1 : rd;
+      r = r + m + (rd == 0);
+      mpn_incr_u (r, cc);
+    }
+  else
+    {
+      /* r[0..m-1]  <-- -lshift(a[n-m]..a[n-1], sh)
+	 r[m..n-1]  <-- lshift(a[0]..a[n-m-1],  sh)  */
+      if (sh != 0)
+	{
+	  /* no out bits below since a[n] <= 1 */
+	  mpn_lshiftc (r, a + n - m, m + 1, sh);
+	  rd = ~r[m];
+	  /* {r, m+1} = {a+n-m, m+1} << sh */
+	  cc = mpn_lshift (r + m, a, n - m, sh); /* {r+m, n-m} = {a, n-m}<<sh */
+	}
+      else
+	{
+	  /* r[m] is not used below, but we save a test for m=0 */
+	  mpn_com (r, a + n - m, m + 1);
+	  rd = a[n];
+	  MPN_COPY (r + m, a, n - m);
+	  cc = 0;
+	}
+
+      /* now complement {r, m}, subtract cc from r[0], subtract rd from r[m] */
+
+      /* if m=0 we just have r[0]=a[n] << sh */
+      if (m != 0)
+	{
+	  /* now add 1 in r[0], subtract 1 in r[m] */
+	  if (cc-- == 0) /* then add 1 to r[0] */
+	    cc = mpn_add_1 (r, r, n, CNST_LIMB(1));
+	  cc = mpn_sub_1 (r, r, m, cc) + 1;
+	  /* add 1 to cc instead of rd since rd might overflow */
+	}
+
+      /* now subtract cc and rd from r[m..n] */
+
+      r[n] = -mpn_sub_1 (r + m, r + m, n - m, cc);
+      r[n] -= mpn_sub_1 (r + m, r + m, n - m, rd);
+      if (r[n] & GMP_LIMB_HIGHBIT)
+	r[n] = mpn_add_1 (r, r, n, CNST_LIMB(1));
+    }
+}
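+
+/* Worked example (assuming GMP_NUMB_BITS == 64): with n = 2 and d = 130,
+   sh = 2 and m = 2 >= n, so the result is -(a << 2) mod 2^128+1, using
+   2^128 == -1 in this ring.  */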
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+static inline void
+mpn_fft_add_sub_modF (mp_ptr A0, mp_ptr Ai, mp_srcptr tp, mp_size_t n)
+{
+  mp_limb_t cyas, c, x;
+
+  cyas = mpn_add_n_sub_n (A0, Ai, A0, tp, n);
+
+  c = A0[n] - tp[n] - (cyas & 1);
+  x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0);
+  Ai[n] = x + c;
+  MPN_INCR_U (Ai, n + 1, x);
+
+  c = A0[n] + tp[n] + (cyas >> 1);
+  x = (c - 1) & -(c != 0);
+  A0[n] = c - x;
+  MPN_DECR_U (A0, n + 1, x);
+}
+
+#else /* ! HAVE_NATIVE_mpn_add_n_sub_n  */
+
+/* r <- a+b mod 2^(n*GMP_NUMB_BITS)+1.
+   Assumes a and b are semi-normalized.
+*/
+static inline void
+mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
+{
+  mp_limb_t c, x;
+
+  c = a[n] + b[n] + mpn_add_n (r, a, b, n);
+  /* 0 <= c <= 3 */
+
+#if 1
+  /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch.  The
+     result is slower code, of course.  But the following outsmarts GCC.  */
+  x = (c - 1) & -(c != 0);
+  r[n] = c - x;
+  MPN_DECR_U (r, n + 1, x);
+#endif
+#if 0
+  if (c > 1)
+    {
+      r[n] = 1;                       /* r[n] - c = 1 */
+      MPN_DECR_U (r, n + 1, c - 1);
+    }
+  else
+    {
+      r[n] = c;
+    }
+#endif
+}
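+
+/* In the normalization above, x = (c - 1) & -(c != 0) is 0 for c in {0, 1}
+   and c - 1 otherwise, so r[n] = c - x <= 1; together with MPN_DECR_U this
+   subtracts x copies of the modulus 2^(n*GMP_NUMB_BITS)+1.  */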
+
+/* r <- a-b mod 2^(n*GMP_NUMB_BITS)+1.
+   Assumes a and b are semi-normalized.
+*/
+static inline void
+mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n)
+{
+  mp_limb_t c, x;
+
+  c = a[n] - b[n] - mpn_sub_n (r, a, b, n);
+  /* -2 <= c <= 1 */
+
+#if 1
+  /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch.  The
+     result is slower code, of course.  But the following outsmarts GCC.  */
+  x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0);
+  r[n] = x + c;
+  MPN_INCR_U (r, n + 1, x);
+#endif
+#if 0
+  if ((c & GMP_LIMB_HIGHBIT) != 0)
+    {
+      r[n] = 0;
+      MPN_INCR_U (r, n + 1, -c);
+    }
+  else
+    {
+      r[n] = c;
+    }
+#endif
+}
+#endif /* HAVE_NATIVE_mpn_add_n_sub_n */
+
+/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where
+	  N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1
+   output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1
+   tp must have space for 2*(n+1) limbs.  */
+
+static void
+mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll,
+	     mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp)
+{
+  if (K == 2)
+    {
+      mp_limb_t cy;
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1;
+#else
+      MPN_COPY (tp, Ap[0], n + 1);
+      mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1);
+      cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1);
+#endif
+      if (Ap[0][n] > 1) /* can be 2 or 3 */
+	Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1);
+      if (cy) /* Ap[inc][n] can be -1 or -2 */
+	Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + 1);
+    }
+  else
+    {
+      mp_size_t j, K2 = K >> 1;
+      int *lk = *ll;
+
+      mpn_fft_fft (Ap,     K2, ll-1, 2 * omega, n, inc * 2, tp);
+      mpn_fft_fft (Ap+inc, K2, ll-1, 2 * omega, n, inc * 2, tp);
+      /* A[2*j*inc]   <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc]
+	 A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */
+      for (j = 0; j < K2; j++, lk += 2, Ap += 2 * inc)
+	{
+	  /* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega)
+	     Ap[0]   <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */
+	  mpn_fft_mul_2exp_modF (tp, Ap[inc], lk[0] * omega, n);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+	  mpn_fft_add_sub_modF (Ap[0], Ap[inc], tp, n);
+#else
+	  mpn_fft_sub_modF (Ap[inc], Ap[0], tp, n);
+	  mpn_fft_add_modF (Ap[0],   Ap[0], tp, n);
+#endif
+	}
+    }
+}
+
+
+/* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1,
+   by subtracting that modulus if necessary.
+
+   If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then the MPN_DECR_U below
+   produces a borrow and the low limbs must be zeroed out again.  This will
+   occur very infrequently.  */
+
+static inline void
+mpn_fft_normalize (mp_ptr ap, mp_size_t n)
+{
+  if (ap[n] != 0)
+    {
+      MPN_DECR_U (ap, n + 1, CNST_LIMB(1));
+      if (ap[n] == 0)
+	{
+	  /* This happens with very low probability; we have yet to trigger it
+	     (and thereby confirm that this code path is correct).  */
+	  MPN_ZERO (ap, n);
+	  ap[n] = 1;
+	}
+      else
+	ap[n] = 0;
+    }
+}
+
+/* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */
+static void
+mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K)
+{
+  int i;
+  int sqr = (ap == bp);
+  TMP_DECL;
+
+  TMP_MARK;
+
+  if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+    {
+      mp_size_t K2, nprime2, Nprime2, M2, maxLK, l, Mp2;
+      int k;
+      int **fft_l, *tmp;
+      mp_ptr *Ap, *Bp, A, B, T;
+
+      k = mpn_fft_best_k (n, sqr);
+      K2 = (mp_size_t) 1 << k;
+      ASSERT_ALWAYS((n & (K2 - 1)) == 0);
+      maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS;
+      M2 = n * GMP_NUMB_BITS >> k;
+      l = n >> k;
+      Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK;
+      /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK */
+      nprime2 = Nprime2 / GMP_NUMB_BITS;
+
+      /* we should ensure that nprime2 is a multiple of the next K */
+      if (nprime2 >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+	{
+	  mp_size_t K3;
+	  for (;;)
+	    {
+	      K3 = (mp_size_t) 1 << mpn_fft_best_k (nprime2, sqr);
+	      if ((nprime2 & (K3 - 1)) == 0)
+		break;
+	      nprime2 = (nprime2 + K3 - 1) & -K3;
+	      Nprime2 = nprime2 * GMP_LIMB_BITS;
+	      /* warning: since nprime2 changed, K3 may change too! */
+	    }
+	}
+      ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */
+
+      Mp2 = Nprime2 >> k;
+
+      Ap = TMP_BALLOC_MP_PTRS (K2);
+      Bp = TMP_BALLOC_MP_PTRS (K2);
+      A = TMP_BALLOC_LIMBS (2 * (nprime2 + 1) << k);
+      T = TMP_BALLOC_LIMBS (2 * (nprime2 + 1));
+      B = A + ((nprime2 + 1) << k);
+      fft_l = TMP_BALLOC_TYPE (k + 1, int *);
+      tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int);
+      for (i = 0; i <= k; i++)
+	{
+	  fft_l[i] = tmp;
+	  tmp += (mp_size_t) 1 << i;
+	}
+
+      mpn_fft_initl (fft_l, k);
+
+      TRACE (printf ("recurse: %ldx%ld limbs -> %ld times %ldx%ld (%1.2f)\n", n,
+		    n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2));
+      for (i = 0; i < K; i++, ap++, bp++)
+	{
+	  mp_limb_t cy;
+	  mpn_fft_normalize (*ap, n);
+	  if (!sqr)
+	    mpn_fft_normalize (*bp, n);
+
+	  mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T);
+	  if (!sqr)
+	    mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T);
+
+	  cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2,
+				     l, Mp2, fft_l, T, sqr);
+	  (*ap)[n] = cy;
+	}
+    }
+  else
+    {
+      mp_ptr a, b, tp, tpn;
+      mp_limb_t cc;
+      mp_size_t n2 = 2 * n;
+      tp = TMP_BALLOC_LIMBS (n2);
+      tpn = tp + n;
+      TRACE (printf ("  mpn_mul_n %ld of %ld limbs\n", K, n));
+      for (i = 0; i < K; i++)
+	{
+	  a = *ap++;
+	  b = *bp++;
+	  if (sqr)
+	    mpn_sqr (tp, a, n);
+	  else
+	    mpn_mul_n (tp, b, a, n);
+	  if (a[n] != 0)
+	    cc = mpn_add_n (tpn, tpn, b, n);
+	  else
+	    cc = 0;
+	  if (b[n] != 0)
+	    cc += mpn_add_n (tpn, tpn, a, n) + a[n];
+	  if (cc != 0)
+	    {
+	      /* FIXME: use MPN_INCR_U here, since carry is not expected.  */
+	      cc = mpn_add_1 (tp, tp, n2, cc);
+	      ASSERT (cc == 0);
+	    }
+	  a[n] = mpn_sub_n (a, tp, tpn, n) && mpn_add_1 (a, a, n, CNST_LIMB(1));
+	}
+    }
+  TMP_FREE;
+}
+
+
+/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]]
+   output: K*A[0] K*A[K-1] ... K*A[1].
+   Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1.
+   This condition is also fulfilled at exit.
+*/
+static void
+mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp)
+{
+  if (K == 2)
+    {
+      mp_limb_t cy;
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1;
+#else
+      MPN_COPY (tp, Ap[0], n + 1);
+      mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1);
+      cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1);
+#endif
+      if (Ap[0][n] > 1) /* can be 2 or 3 */
+	Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1);
+      if (cy) /* Ap[1][n] can be -1 or -2 */
+	Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + 1);
+    }
+  else
+    {
+      mp_size_t j, K2 = K >> 1;
+
+      mpn_fft_fftinv (Ap,      K2, 2 * omega, n, tp);
+      mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp);
+      /* A[j]     <- A[j] + omega^j A[j+K/2]
+	 A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */
+      for (j = 0; j < K2; j++, Ap++)
+	{
+	  /* Ap[K2] <- Ap[0] + Ap[K2] * 2^((j + K2) * omega)
+	     Ap[0]  <- Ap[0] + Ap[K2] * 2^(j * omega) */
+	  mpn_fft_mul_2exp_modF (tp, Ap[K2], j * omega, n);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+	  mpn_fft_add_sub_modF (Ap[0], Ap[K2], tp, n);
+#else
+	  mpn_fft_sub_modF (Ap[K2], Ap[0], tp, n);
+	  mpn_fft_add_modF (Ap[0],  Ap[0], tp, n);
+#endif
+	}
+    }
+}
+
+
+/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */
+static void
+mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n)
+{
+  mp_bitcnt_t i;
+
+  ASSERT (r != a);
+  i = (mp_bitcnt_t) 2 * n * GMP_NUMB_BITS - k;
+  mpn_fft_mul_2exp_modF (r, a, i, n);
+  /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */
+  /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */
+  mpn_fft_normalize (r, n);
+}
+
+
+/* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+1, n <= an <= 3*n.
+   Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1,
+   in which case {rp,n}=0.
+*/
+static mp_size_t
+mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an)
+{
+  mp_size_t l, m, rpn;
+  mp_limb_t cc;
+
+  ASSERT ((n <= an) && (an <= 3 * n));
+  m = an - 2 * n;
+  if (m > 0)
+    {
+      l = n;
+      /* add {ap, m} and {ap+2n, m} in {rp, m} */
+      cc = mpn_add_n (rp, ap, ap + 2 * n, m);
+      /* copy {ap+m, n-m} to {rp+m, n-m} */
+      rpn = mpn_add_1 (rp + m, ap + m, n - m, cc);
+    }
+  else
+    {
+      l = an - n; /* l <= n */
+      MPN_COPY (rp, ap, n);
+      rpn = 0;
+    }
+
+  /* remains to subtract {ap+n, l} from {rp, n+1} */
+  cc = mpn_sub_n (rp, rp, ap + n, l);
+  rpn -= mpn_sub_1 (rp + l, rp + l, n - l, cc);
+  if (rpn < 0) /* necessarily rpn = -1 */
+    rpn = mpn_add_1 (rp, rp, n, CNST_LIMB(1));
+  return rpn;
+}
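+
+/* The reduction above uses 2^(n*GMP_NUMB_BITS) == -1 modulo
+   2^(n*GMP_NUMB_BITS)+1, hence 2^(2*n*GMP_NUMB_BITS) == 1: the limbs of ap
+   at offset 2n are added back in, while those at offset n are subtracted.  */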
+
+/* store in A[0..nprime] the first M bits from {n, nl},
+   in A[nprime+1..] the following M bits, ...
+   Assumes M is a multiple of GMP_NUMB_BITS (M = l * GMP_NUMB_BITS).
+   T must have space for at least (nprime + 1) limbs.
+   We must have nl <= 2*K*l.
+*/
+static void
+mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime,
+		       mp_srcptr n, mp_size_t nl, mp_size_t l, mp_size_t Mp,
+		       mp_ptr T)
+{
+  mp_size_t i, j;
+  mp_ptr tmp;
+  mp_size_t Kl = K * l;
+  TMP_DECL;
+  TMP_MARK;
+
+  if (nl > Kl) /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+1 */
+    {
+      mp_size_t dif = nl - Kl;
+      mp_limb_signed_t cy;
+
+      tmp = TMP_BALLOC_LIMBS(Kl + 1);
+
+      if (dif > Kl)
+	{
+	  int subp = 0;
+
+	  cy = mpn_sub_n (tmp, n, n + Kl, Kl);
+	  n += 2 * Kl;
+	  dif -= Kl;
+
+	  /* now dif > 0 */
+	  while (dif > Kl)
+	    {
+	      if (subp)
+		cy += mpn_sub_n (tmp, tmp, n, Kl);
+	      else
+		cy -= mpn_add_n (tmp, tmp, n, Kl);
+	      subp ^= 1;
+	      n += Kl;
+	      dif -= Kl;
+	    }
+	  /* now dif <= Kl */
+	  if (subp)
+	    cy += mpn_sub (tmp, tmp, Kl, n, dif);
+	  else
+	    cy -= mpn_add (tmp, tmp, Kl, n, dif);
+	  if (cy >= 0)
+	    cy = mpn_add_1 (tmp, tmp, Kl, cy);
+	  else
+	    cy = mpn_sub_1 (tmp, tmp, Kl, -cy);
+	}
+      else /* dif <= Kl, i.e. nl <= 2 * Kl */
+	{
+	  cy = mpn_sub (tmp, n, Kl, n + Kl, dif);
+	  cy = mpn_add_1 (tmp, tmp, Kl, cy);
+	}
+      tmp[Kl] = cy;
+      nl = Kl + 1;
+      n = tmp;
+    }
+  for (i = 0; i < K; i++)
+    {
+      Ap[i] = A;
+      /* store the next M bits of n into A[0..nprime] */
+      if (nl > 0) /* nl is the number of remaining limbs */
+	{
+	  j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */
+	  nl -= j;
+	  MPN_COPY (T, n, j);
+	  MPN_ZERO (T + j, nprime + 1 - j);
+	  n += l;
+	  mpn_fft_mul_2exp_modF (A, T, i * Mp, nprime);
+	}
+      else
+	MPN_ZERO (A, nprime + 1);
+      A += nprime + 1;
+    }
+  ASSERT_ALWAYS (nl == 0);
+  TMP_FREE;
+}
+
+/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS
+   op is pl limbs, its high bit is returned.
+   One must have pl = mpn_fft_next_size (pl, k).
+   T must have space for 2 * (nprime + 1) limbs.
+*/
+
+static mp_limb_t
+mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k,
+		      mp_ptr *Ap, mp_ptr *Bp, mp_ptr A, mp_ptr B,
+		      mp_size_t nprime, mp_size_t l, mp_size_t Mp,
+		      int **fft_l, mp_ptr T, int sqr)
+{
+  mp_size_t K, i, pla, lo, sh, j;
+  mp_ptr p;
+  mp_limb_t cc;
+
+  K = (mp_size_t) 1 << k;
+
+  /* direct fft's */
+  mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T);
+  if (!sqr)
+    mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T);
+
+  /* term to term multiplications */
+  mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K);
+
+  /* inverse fft's */
+  mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T);
+
+  /* division of terms after inverse fft */
+  Bp[0] = T + nprime + 1;
+  mpn_fft_div_2exp_modF (Bp[0], Ap[0], k, nprime);
+  for (i = 1; i < K; i++)
+    {
+      Bp[i] = Ap[i - 1];
+      mpn_fft_div_2exp_modF (Bp[i], Ap[i], k + (K - i) * Mp, nprime);
+    }
+
+  /* addition of terms in result p */
+  MPN_ZERO (T, nprime + 1);
+  pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
+  p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */
+  MPN_ZERO (p, pla);
+  cc = 0; /* will accumulate the (signed) carry at p[pla] */
+  for (i = K - 1, lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l)
+    {
+      mp_ptr n = p + sh;
+
+      j = (K - i) & (K - 1);
+
+      if (mpn_add_n (n, n, Bp[j], nprime + 1))
+	cc += mpn_add_1 (n + nprime + 1, n + nprime + 1,
+			  pla - sh - nprime - 1, CNST_LIMB(1));
+      T[2 * l] = i + 1; /* T = (i + 1)*2^(2*M) */
+      if (mpn_cmp (Bp[j], T, nprime + 1) > 0)
+	{ /* subtract 2^N'+1 */
+	  cc -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1));
+	  cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1));
+	}
+    }
+  if (cc == -CNST_LIMB(1))
+    {
+      if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, CNST_LIMB(1))))
+	{
+	  /* p[pla-pl]...p[pla-1] are all zero */
+	  mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1));
+	  mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1));
+	}
+    }
+  else if (cc == 1)
+    {
+      if (pla >= 2 * pl)
+	{
+	  while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc)))
+	    ;
+	}
+      else
+	{
+	  cc = mpn_sub_1 (p + pla - pl, p + pla - pl, pl, cc);
+	  ASSERT (cc == 0);
+	}
+    }
+  else
+    ASSERT (cc == 0);
+
+  /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... ]
+     < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ]
+     < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */
+  return mpn_fft_norm_modF (op, pl, p, pla);
+}
+
+/* return the lcm of a and 2^k */
+static mp_bitcnt_t
+mpn_mul_fft_lcm (mp_bitcnt_t a, int k)
+{
+  mp_bitcnt_t l = k;
+
+  while (a % 2 == 0 && k > 0)
+    {
+      a >>= 1;
+      k --;
+    }
+  return a << l;
+}
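+
+/* For example, mpn_mul_fft_lcm (64, 4) strips four factors of 2, leaving
+   a = 4, and returns 4 << 4 = 64 = lcm (64, 2^4); mpn_mul_fft_lcm (12, 3)
+   returns 3 << 3 = 24 = lcm (12, 2^3).  */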
+
+
+mp_limb_t
+mpn_mul_fft (mp_ptr op, mp_size_t pl,
+	     mp_srcptr n, mp_size_t nl,
+	     mp_srcptr m, mp_size_t ml,
+	     int k)
+{
+  int i;
+  mp_size_t K, maxLK;
+  mp_size_t N, Nprime, nprime, M, Mp, l;
+  mp_ptr *Ap, *Bp, A, T, B;
+  int **fft_l, *tmp;
+  int sqr = (n == m && nl == ml);
+  mp_limb_t h;
+  TMP_DECL;
+
+  TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k));
+  ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl);
+
+  TMP_MARK;
+  N = pl * GMP_NUMB_BITS;
+  fft_l = TMP_BALLOC_TYPE (k + 1, int *);
+  tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int);
+  for (i = 0; i <= k; i++)
+    {
+      fft_l[i] = tmp;
+      tmp += (mp_size_t) 1 << i;
+    }
+
+  mpn_fft_initl (fft_l, k);
+  K = (mp_size_t) 1 << k;
+  M = N >> k;	/* N = 2^k M */
+  l = 1 + (M - 1) / GMP_NUMB_BITS;
+  maxLK = mpn_mul_fft_lcm (GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */
+
+  Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
+  /* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */
+  nprime = Nprime / GMP_NUMB_BITS;
+  TRACE (printf ("N=%ld K=%ld, M=%ld, l=%ld, maxLK=%ld, Np=%ld, np=%ld\n",
+		 N, K, M, l, maxLK, Nprime, nprime));
+  /* we should ensure that recursively, nprime is a multiple of the next K */
+  if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
+    {
+      mp_size_t K2;
+      for (;;)
+	{
+	  K2 = (mp_size_t) 1 << mpn_fft_best_k (nprime, sqr);
+	  if ((nprime & (K2 - 1)) == 0)
+	    break;
+	  nprime = (nprime + K2 - 1) & -K2;
+	  Nprime = nprime * GMP_LIMB_BITS;
+	  /* warning: since nprime changed, K2 may change too! */
+	}
+      TRACE (printf ("new maxLK=%ld, Np=%ld, np=%ld\n", maxLK, Nprime, nprime));
+    }
+  ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */
+
+  T = TMP_BALLOC_LIMBS (2 * (nprime + 1));
+  Mp = Nprime >> k;
+
+  TRACE (printf ("%ldx%ld limbs -> %ld times %ldx%ld limbs (%1.2f)\n",
+		pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K);
+	 printf ("   temp space %ld\n", 2 * K * (nprime + 1)));
+
+  A = TMP_BALLOC_LIMBS (K * (nprime + 1));
+  Ap = TMP_BALLOC_MP_PTRS (K);
+  mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T);
+  if (sqr)
+    {
+      mp_size_t pla;
+      pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */
+      B = TMP_BALLOC_LIMBS (pla);
+      Bp = TMP_BALLOC_MP_PTRS (K);
+    }
+  else
+    {
+      B = TMP_BALLOC_LIMBS (K * (nprime + 1));
+      Bp = TMP_BALLOC_MP_PTRS (K);
+      mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T);
+    }
+  h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr);
+
+  TMP_FREE;
+  return h;
+}
+
+#if WANT_OLD_FFT_FULL
+/* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */
+void
+mpn_mul_fft_full (mp_ptr op,
+		  mp_srcptr n, mp_size_t nl,
+		  mp_srcptr m, mp_size_t ml)
+{
+  mp_ptr pad_op;
+  mp_size_t pl, pl2, pl3, l;
+  mp_size_t cc, c2, oldcc;
+  int k2, k3;
+  int sqr = (n == m && nl == ml);
+
+  pl = nl + ml; /* total number of limbs of the result */
+
+  /* perform an FFT mod 2^(2N)+1 and one mod 2^(3N)+1.
+     We must have pl3 = 3/2 * pl2, with pl2 a multiple of 2^k2, and
+     pl3 a multiple of 2^k3. Since k3 >= k2, both are multiples of 2^k2,
+     and pl2 must be an even multiple of 2^k2. Thus (pl2,pl3) =
+     (2*j*2^k2,3*j*2^k2), which works for 3*j <= pl/2^k2 <= 5*j.
+     We need that consecutive intervals overlap, i.e. 5*j >= 3*(j+1),
+     which requires j>=2. Thus this scheme requires pl >= 6 * 2^FFT_FIRST_K. */
+
+  /*  ASSERT_ALWAYS(pl >= 6 * (1 << FFT_FIRST_K)); */
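+
+  /* For example (hypothetical sizes): j = 2 and k2 = 4 give
+     (pl2, pl3) = (64, 96), which covers 96 <= pl <= 160.  */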
+
+  pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */
+  do
+    {
+      pl2++;
+      k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */
+      pl2 = mpn_fft_next_size (pl2, k2);
+      pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4,
+			    thus pl2 / 2 is exact */
+      k3 = mpn_fft_best_k (pl3, sqr);
+    }
+  while (mpn_fft_next_size (pl3, k3) != pl3);
+
+  TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl2=%ld pl3=%ld k=%d\n",
+		 nl, ml, pl2, pl3, k2));
+
+  ASSERT_ALWAYS(pl3 <= pl);
+  cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3);     /* mu */
+  ASSERT(cc == 0);
+  pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2);
+  cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */
+  cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2);    /* lambda - low(mu) */
+  /* 0 <= cc <= 1 */
+  ASSERT(0 <= cc && cc <= 1);
+  l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */
+  c2 = mpn_add_n (pad_op, pad_op, op + pl2, l);
+  cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc;
+  ASSERT(-1 <= cc && cc <= 1);
+  if (cc < 0)
+    cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
+  ASSERT(0 <= cc && cc <= 1);
+  /* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */
+  oldcc = cc;
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l);
+  cc += c2 >> 1; /* carry out from high <- low + high */
+  c2 = c2 & 1; /* borrow out from low <- low - high */
+#else
+  {
+    mp_ptr tmp;
+    TMP_DECL;
+
+    TMP_MARK;
+    tmp = TMP_BALLOC_LIMBS (l);
+    MPN_COPY (tmp, pad_op, l);
+    c2 = mpn_sub_n (pad_op,      pad_op, pad_op + l, l);
+    cc += mpn_add_n (pad_op + l, tmp,    pad_op + l, l);
+    TMP_FREE;
+  }
+#endif
+  c2 += oldcc;
+  /* first normalize {pad_op, pl2} before dividing by 2: c2 is the borrow
+     at pad_op + l, cc is the carry at pad_op + pl2 */
+  /* 0 <= cc <= 2 */
+  cc -= mpn_sub_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2);
+  /* -1 <= cc <= 2 */
+  if (cc > 0)
+    cc = -mpn_sub_1 (pad_op, pad_op, pl2, (mp_limb_t) cc);
+  /* now -1 <= cc <= 0 */
+  if (cc < 0)
+    cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc);
+  /* now {pad_op, pl2} is normalized, with 0 <= cc <= 1 */
+  if (pad_op[0] & 1) /* if odd, add 2^(pl2*GMP_NUMB_BITS)+1 */
+    cc += 1 + mpn_add_1 (pad_op, pad_op, pl2, CNST_LIMB(1));
+  /* now 0 <= cc <= 2, but cc=2 cannot occur since it would give a carry
+     out below */
+  mpn_rshift (pad_op, pad_op, pl2, 1); /* divide by two */
+  if (cc) /* then cc=1 */
+    pad_op [pl2 - 1] |= (mp_limb_t) 1 << (GMP_NUMB_BITS - 1);
+  /* now {pad_op,pl2}-cc = (lambda-mu)/(1-2^(l*GMP_NUMB_BITS))
+     mod 2^(pl2*GMP_NUMB_BITS) + 1 */
+  c2 = mpn_add_n (op, op, pad_op, pl2); /* no need to add cc (is 0) */
+  /* since pl2+pl3 >= pl, necessarily the extra limbs (including cc) are zero */
+  MPN_COPY (op + pl3, pad_op, pl - pl3);
+  ASSERT_MPN_ZERO_P (pad_op + pl - pl3, pl2 + pl3 - pl);
+  __GMP_FREE_FUNC_LIMBS (pad_op, pl2);
+  /* since the final result has at most pl limbs, no carry out below */
+  mpn_add_1 (op + pl2, op + pl2, pl - pl2, (mp_limb_t) c2);
+}
+#endif
diff --git a/third_party/gmp/mpn/generic/mul_n.c b/third_party/gmp/mpn/generic/mul_n.c
new file mode 100644
index 0000000..36bd923
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mul_n.c
@@ -0,0 +1,96 @@
+/* mpn_mul_n -- multiply natural numbers.
+
+Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
+  ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n));
+
+  if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+    {
+      mpn_mul_basecase (p, a, n, b, n);
+    }
+  else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+    {
+      /* Allocate workspace of fixed size on stack: fast! */
+      mp_limb_t ws[mpn_toom22_mul_itch (MUL_TOOM33_THRESHOLD_LIMIT-1,
+					MUL_TOOM33_THRESHOLD_LIMIT-1)];
+      ASSERT (MUL_TOOM33_THRESHOLD <= MUL_TOOM33_THRESHOLD_LIMIT);
+      mpn_toom22_mul (p, a, n, b, n, ws);
+    }
+  else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_SDECL;
+      TMP_SMARK;
+      ws = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (n, n));
+      mpn_toom33_mul (p, a, n, b, n, ws);
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_SDECL;
+      TMP_SMARK;
+      ws = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (n, n));
+      mpn_toom44_mul (p, a, n, b, n, ws);
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_SDECL;
+      TMP_SMARK;
+      ws = TMP_SALLOC_LIMBS (mpn_toom6_mul_n_itch (n));
+      mpn_toom6h_mul (p, a, n, b, n, ws);
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_DECL;
+      TMP_MARK;
+      ws = TMP_ALLOC_LIMBS (mpn_toom8_mul_n_itch (n));
+      mpn_toom8h_mul (p, a, n, b, n, ws);
+      TMP_FREE;
+    }
+  else
+    {
+      /* The current FFT code allocates its own space.  That should probably
+	 change.  */
+      mpn_fft_mul (p, a, n, b, n);
+    }
+}
diff --git a/third_party/gmp/mpn/generic/mullo_basecase.c b/third_party/gmp/mpn/generic/mullo_basecase.c
new file mode 100644
index 0000000..9a4cd3d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mullo_basecase.c
@@ -0,0 +1,90 @@
+/* mpn_mullo_basecase -- Internal routine to multiply two natural
+   numbers of length n and return the low part.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright (C) 2000, 2002, 2004, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* FIXME: Should optionally use mpn_mul_2/mpn_addmul_2.  */
+
+#ifndef MULLO_VARIANT
+#define MULLO_VARIANT 2
+#endif
+
+
+#if MULLO_VARIANT == 1
+void
+mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_size_t i;
+
+  mpn_mul_1 (rp, up, n, vp[0]);
+
+  for (i = n - 1; i > 0; i--)
+    {
+      vp++;
+      rp++;
+      mpn_addmul_1 (rp, up, i, vp[0]);
+    }
+}
+#endif
+
+
+#if MULLO_VARIANT == 2
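+/* Variant 2 computes the truncated schoolbook product: for each limb v0 of
+   vp[], only the partial products landing below B^n are formed, while h
+   accumulates the contributions to the top retained limb rp[n-1], including
+   the high limb returned by mpn_mul_1/mpn_addmul_1.  */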
+void
+mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t h;
+
+  h = up[0] * vp[n - 1];
+
+  if (n != 1)
+    {
+      mp_size_t i;
+      mp_limb_t v0;
+
+      v0 = *vp++;
+      h += up[n - 1] * v0 + mpn_mul_1 (rp, up, n - 1, v0);
+      rp++;
+
+      for (i = n - 2; i > 0; i--)
+	{
+	  v0 = *vp++;
+	  h += up[i] * v0 + mpn_addmul_1 (rp, up, i, v0);
+	  rp++;
+	}
+    }
+
+  rp[0] = h;
+}
+#endif
diff --git a/third_party/gmp/mpn/generic/mullo_n.c b/third_party/gmp/mpn/generic/mullo_n.c
new file mode 100644
index 0000000..6f4e7ae
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mullo_n.c
@@ -0,0 +1,243 @@
+/* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
+   of their product.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
+   FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
+   THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_range_basecase 1
+#define MAYBE_range_toom22   1
+#else
+#define MAYBE_range_basecase                                           \
+  ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
+#define MAYBE_range_toom22                                             \
+  ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
+#endif
+
+/*  THINK: The DC strategy uses different constants in different Toom's
+	 ranges. Something smoother?
+*/
+
+/*
+  Compute the least significant half of the product {xp,n}*{yp,n}, or
+  formally {rp,n} = {xp,n}*{yp,n} mod (B^n).
+
+  Above the given threshold, the Divide and Conquer strategy is used.
+  The operands are split in two, and a full product plus two mullo
+  are used to obtain the final result. The more natural strategy is to
+  split in two halves, but this is far from optimal when a
+  sub-quadratic multiplication is used.
+
+  Mulders suggests an unbalanced split in favour of the full product,
+  split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
+
+  To compute the value of a, we assume that the cost of mullo for a
+  given size, ML(n), is a fixed fraction of the cost of a full product
+  of the same size, M(n), and that M(n)=n^e for some exponent 1 < e <= 2;
+  then we can write:
+
+  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
+
+  Given a value for e, we want to minimise the value of k, i.e. the
+  function k=(1-a)^e/(1-2*a^e).
+
+  With e=2, the exponent for schoolbook multiplication, the minimum is
+  given by the values a=1-a=1/2.
+
+  With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
+  Mulders computes (1-a) = 0.694... and we approximate a with 11/36.
+
+  Other possible approximations follow:
+  e=log(5)/log(3) [Toom-3] -> a ~= 9/40
+  e=log(7)/log(4) [Toom-4] -> a ~= 7/39
+  e=log(11)/log(6) [Toom-6] -> a ~= 1/8
+  e=log(15)/log(8) [Toom-8] -> a ~= 1/10
+
+  The values above were obtained with the following trivial commands
+  in the gp-pari shell:
+
+fun(e,a)=(1-a)^e/(1-2*a^e)
+mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
+contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
+contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
+contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
+contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
+contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
+
+  ,
+  |\
+  | \
+  +----,
+  |    |
+  |    |
+  |    |\
+  |    | \
+  +----+--`
+  ^ n2 ^n1^
+
+  For an actual implementation the assumption M(n)=n^e is not exact;
+  as a consequence, the assumption that ML(n)=k*M(n) holds for a
+  constant k is not exact either.
+
+  But theory suggests two things:
+  - the better the multiplication algorithm is (lower e), the more k
+    approaches 1 and a approaches 0.
+
+  - A value for a smaller than optimal is probably less bad than a
+    bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
+    value, and k(a)=0.808_ the mul/mullo speed ratio. We get
+    k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
+*/
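+
+/* A minimal standalone sketch (illustrative only, not used by the library)
+   that evaluates the ratio k(a) = (1-a)^e/(1-2*a^e) discussed above for the
+   Karatsuba exponent e = log(3)/log(2); it shows a ~= 11/36 ~= 0.306 lying
+   near the minimum k ~= 0.808.  Compile separately with -lm.  */
+#if 0
+#include <math.h>
+#include <stdio.h>
+
+int
+main (void)
+{
+  double e = log (3.0) / log (2.0);
+  double a;
+
+  for (a = 0.26; a < 0.37; a += 0.02)
+    printf ("a = %.2f  k(a) = %.4f\n",
+            a, pow (1.0 - a, e) / (1.0 - 2.0 * pow (a, e)));
+  return 0;
+}
+#endif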
+
+static mp_size_t
+mpn_mullo_n_itch (mp_size_t n)
+{
+  return 2*n;
+}
+
+/*
+    mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
+    It accepts tp == rp.
+*/
+static void
+mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
+{
+  mp_size_t n2, n1;
+  ASSERT (n >= 2);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
+
+  /* Divide-and-conquer */
+
+  /* We need a fractional approximation of the value 0 < a <= 1/2
+     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
+  */
+  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
+    n1 = n >> 1;
+  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
+    n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
+  else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
+    n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
+  else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
+    n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
+  /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
+  else
+    n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
+
+  n2 = n - n1;
+
+  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
+	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
+
+  /* x0 * y0 */
+  mpn_mul_n (tp, xp, yp, n2);
+  MPN_COPY (rp, tp, n2);
+
+  /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
+  else
+    mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
+  mpn_add_n (rp + n2, tp + n2, tp + n, n1);
+
+  /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
+  else
+    mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
+  mpn_add_n (rp + n2, rp + n2, tp + n, n1);
+}
+
+/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
+#define MUL_BASECASE_ALLOC \
+ (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
+
+/* FIXME: This function should accept a temporary area, as mpn_dc_mullo_n
+   does; that function accepts a pointer tp and handles the case tp == rp,
+   so do the same here.  Maybe recombine the two functions.
+   THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
+	  (typically thanks to mpn_addmul_2) should we unconditionally use
+	  mpn_mul_n?
+*/
+
+void
+mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
+
+  if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
+    {
+      /* Allocate workspace of fixed size on stack: fast! */
+      mp_limb_t tp[MUL_BASECASE_ALLOC];
+      mpn_mul_basecase (tp, xp, n, yp, n);
+      MPN_COPY (rp, tp, n);
+    }
+  else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
+    {
+      mpn_mullo_basecase (rp, xp, yp, n);
+    }
+  else
+    {
+      mp_ptr tp;
+      TMP_DECL;
+      TMP_MARK;
+      tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
+      if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
+	{
+	  mpn_dc_mullo_n (rp, xp, yp, n, tp);
+	}
+      else
+	{
+	  /* For really large operands, use plain mpn_mul_n but throw away the
+	     upper n limbs of the result.  */
+#if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
+	  mpn_fft_mul (tp, xp, n, yp, n);
+#else
+	  mpn_mul_n (tp, xp, yp, n);
+#endif
+	  MPN_COPY (rp, tp, n);
+	}
+      TMP_FREE;
+    }
+}
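
The divide-and-conquer case above relies on the fact that the low half of a
product never needs the product of the two high parts.  A minimal standalone
sketch of that identity on single 64-bit words in place of limb vectors
(plain C, not GMP code; all names here are illustrative):

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t low_mul_split (uint64_t x, uint64_t y)
  {
    uint64_t x0 = (uint32_t) x, x1 = x >> 32;
    uint64_t y0 = (uint32_t) y, y1 = y >> 32;
    uint64_t low = x0 * y0;               /* the only full product needed */
    low += (x1 * y0 + x0 * y1) << 32;     /* cross terms: low halves only */
    return low;                           /* == x * y (mod 2^64) */
  }

  int main (void)
  {
    uint64_t x = 0x123456789abcdef0u, y = 0xfedcba9876543210u;
    printf ("%d\n", low_mul_split (x, y) == x * y);   /* prints 1 */
    return 0;
  }

mpn_dc_mullo_n does the same with an unbalanced split, tuned so that the two
recursive low products and the one full product balance out.
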
diff --git a/third_party/gmp/mpn/generic/mulmid.c b/third_party/gmp/mpn/generic/mulmid.c
new file mode 100644
index 0000000..f35c5fb
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mulmid.c
@@ -0,0 +1,255 @@
+/* mpn_mulmid -- middle product
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+#define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
+
+
+void
+mpn_mulmid (mp_ptr rp,
+            mp_srcptr ap, mp_size_t an,
+            mp_srcptr bp, mp_size_t bn)
+{
+  mp_size_t rn, k;
+  mp_ptr scratch, temp;
+
+  ASSERT (an >= bn);
+  ASSERT (bn >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
+  ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
+
+  if (bn < MULMID_TOOM42_THRESHOLD)
+    {
+      /* region not tall enough to make toom42 worthwhile for any portion */
+
+      if (an < CHUNK)
+	{
+	  /* region not too wide either, just call basecase directly */
+	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
+	  return;
+	}
+
+      /* Region quite wide. For better locality, use basecase on chunks:
+
+	 AAABBBCC..
+	 .AAABBBCC.
+	 ..AAABBBCC
+      */
+
+      k = CHUNK - bn + 1;    /* number of diagonals per chunk */
+
+      /* first chunk (marked A in the above diagram) */
+      mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
+
+      /* remaining chunks (B, C, etc) */
+      an -= k;
+
+      while (an >= CHUNK)
+	{
+	  mp_limb_t t0, t1, cy;
+	  ap += k, rp += k;
+	  t0 = rp[0], t1 = rp[1];
+	  mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
+	  ADDC_LIMB (cy, rp[0], rp[0], t0);    /* add back saved limbs */
+	  MPN_INCR_U (rp + 1, k + 1, t1 + cy);
+	  an -= k;
+	}
+
+      if (an >= bn)
+	{
+	  /* last remaining chunk */
+	  mp_limb_t t0, t1, cy;
+	  ap += k, rp += k;
+	  t0 = rp[0], t1 = rp[1];
+	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
+	  ADDC_LIMB (cy, rp[0], rp[0], t0);
+	  MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
+	}
+
+      return;
+    }
+
+  /* region is tall enough for toom42 */
+
+  rn = an - bn + 1;
+
+  if (rn < MULMID_TOOM42_THRESHOLD)
+    {
+      /* region not wide enough to make toom42 worthwhile for any portion */
+
+      TMP_DECL;
+
+      if (bn < CHUNK)
+	{
+	  /* region not too tall either, just call basecase directly */
+	  mpn_mulmid_basecase (rp, ap, an, bp, bn);
+	  return;
+	}
+
+      /* Region quite tall. For better locality, use basecase on chunks:
+
+	 AAAAA....
+	 .AAAAA...
+	 ..BBBBB..
+	 ...BBBBB.
+	 ....CCCCC
+      */
+
+      TMP_MARK;
+
+      temp = TMP_ALLOC_LIMBS (rn + 2);
+
+      /* first chunk (marked A in the above diagram) */
+      bp += bn - CHUNK, an -= bn - CHUNK;
+      mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
+
+      /* remaining chunks (B, C, etc) */
+      bn -= CHUNK;
+
+      while (bn >= CHUNK)
+	{
+	  ap += CHUNK, bp -= CHUNK;
+	  mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
+	  mpn_add_n (rp, rp, temp, rn + 2);
+	  bn -= CHUNK;
+	}
+
+      if (bn)
+	{
+	  /* last remaining chunk */
+	  ap += CHUNK, bp -= bn;
+	  mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
+	  mpn_add_n (rp, rp, temp, rn + 2);
+	}
+
+      TMP_FREE;
+      return;
+    }
+
+  /* we're definitely going to use toom42 somewhere */
+
+  if (bn > rn)
+    {
+      /* slice region into chunks, use toom42 on all chunks except possibly
+	 the last:
+
+         AA....
+         .AA...
+         ..BB..
+         ...BB.
+         ....CC
+      */
+
+      TMP_DECL;
+      TMP_MARK;
+
+      temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
+      scratch = temp + rn + 2;
+
+      /* first chunk (marked A in the above diagram) */
+      bp += bn - rn;
+      mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
+
+      /* remaining chunks (B, C, etc) */
+      bn -= rn;
+
+      while (bn >= rn)
+        {
+          ap += rn, bp -= rn;
+	  mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
+          mpn_add_n (rp, rp, temp, rn + 2);
+          bn -= rn;
+        }
+
+      if (bn)
+        {
+          /* last remaining chunk */
+          ap += rn, bp -= bn;
+	  mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
+          mpn_add_n (rp, rp, temp, rn + 2);
+        }
+
+      TMP_FREE;
+    }
+  else
+    {
+      /* slice region into chunks, use toom42 on all chunks except possibly
+	 the last:
+
+         AAABBBCC..
+         .AAABBBCC.
+         ..AAABBBCC
+      */
+
+      TMP_DECL;
+      TMP_MARK;
+
+      scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
+
+      /* first chunk (marked A in the above diagram) */
+      mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
+
+      /* remaining chunks (B, C, etc) */
+      rn -= bn;
+
+      while (rn >= bn)
+        {
+	  mp_limb_t t0, t1, cy;
+          ap += bn, rp += bn;
+          t0 = rp[0], t1 = rp[1];
+          mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
+	  ADDC_LIMB (cy, rp[0], rp[0], t0);     /* add back saved limbs */
+	  MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
+	  rn -= bn;
+        }
+
+      TMP_FREE;
+
+      if (rn)
+        {
+          /* last remaining chunk */
+	  mp_limb_t t0, t1, cy;
+          ap += bn, rp += bn;
+          t0 = rp[0], t1 = rp[1];
+          mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
+	  ADDC_LIMB (cy, rp[0], rp[0], t0);
+	  MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
+        }
+    }
+}
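
For reference, the quantity all this chunking computes can be stated
directly: entry k of the middle product is the column sum of u[i]*v[j] over
i + j = k + vn - 1, i.e. the un - vn + 1 columns of the multiplication
square in which every limb of v participates.  A naive sketch with 16-bit
toy limbs, so that a plain uint64_t holds a whole column sum and no carry
handling is needed (not the GMP interface, which also returns two limbs of
accumulated carry):

  #include <stdint.h>
  #include <stddef.h>
  #include <stdio.h>

  static void naive_mulmid (uint64_t *r, const uint16_t *u, size_t un,
                            const uint16_t *v, size_t vn)
  {
    for (size_t k = 0; k + vn <= un; k++)
      {
        uint64_t acc = 0;
        for (size_t j = 0; j < vn; j++)       /* i + j == k + vn - 1 */
          acc += (uint64_t) u[k + vn - 1 - j] * v[j];
        r[k] = acc;
      }
  }

  int main (void)
  {
    uint16_t u[4] = { 1, 2, 3, 4 }, v[2] = { 5, 6 };  /* least significant first */
    uint64_t r[3];
    naive_mulmid (r, u, 4, v, 2);
    printf ("%llu %llu %llu\n", (unsigned long long) r[0],
            (unsigned long long) r[1], (unsigned long long) r[2]);  /* 16 27 38 */
    return 0;
  }
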
diff --git a/third_party/gmp/mpn/generic/mulmid_basecase.c b/third_party/gmp/mpn/generic/mulmid_basecase.c
new file mode 100644
index 0000000..d5434ea
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mulmid_basecase.c
@@ -0,0 +1,82 @@
+/* mpn_mulmid_basecase -- classical middle product algorithm
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Middle product of {up,un} and {vp,vn}, write result to {rp,un-vn+3}.
+   Must have un >= vn >= 1.
+
+   Neither input buffer may overlap with the output buffer. */
+
+void
+mpn_mulmid_basecase (mp_ptr rp,
+                     mp_srcptr up, mp_size_t un,
+                     mp_srcptr vp, mp_size_t vn)
+{
+  mp_limb_t lo, hi;  /* last two limbs of output */
+  mp_limb_t cy;
+
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, un - vn + 3, up, un));
+  ASSERT (! MPN_OVERLAP_P (rp, un - vn + 3, vp, vn));
+
+  up += vn - 1;
+  un -= vn - 1;
+
+  /* multiply by first limb, store result */
+  lo = mpn_mul_1 (rp, up, un, vp[0]);
+  hi = 0;
+
+  /* accumulate remaining rows */
+  for (vn--; vn; vn--)
+    {
+      up--, vp++;
+      cy = mpn_addmul_1 (rp, up, un, vp[0]);
+      add_ssaaaa (hi, lo, hi, lo, CNST_LIMB(0), cy);
+    }
+
+  /* store final limbs */
+#if GMP_NAIL_BITS != 0
+  hi = (hi << GMP_NAIL_BITS) + (lo >> GMP_NUMB_BITS);
+  lo &= GMP_NUMB_MASK;
+#endif
+
+  rp[un] = lo;
+  rp[un + 1] = hi;
+}
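
The (hi,lo) pair above is a two-limb counter for the carries coming out of
mpn_addmul_1, and add_ssaaaa is the longlong.h primitive that does the
double-limb addition.  A portable sketch of that single step (hypothetical
helper, not the actual macro, which usually compiles to an add/adc pair):

  #include <stdint.h>
  #include <stdio.h>

  /* Mirrors add_ssaaaa (hi, lo, hi, lo, 0, cy): add one carry limb into
     a two-limb accumulator, propagating overflow from lo into hi.  */
  static void add2 (uint64_t *hi, uint64_t *lo, uint64_t cy)
  {
    uint64_t t = *lo + cy;
    *hi += t < *lo;            /* carry out of the low limb */
    *lo = t;
  }

  int main (void)
  {
    uint64_t hi = 0, lo = UINT64_MAX;
    add2 (&hi, &lo, 1);                    /* low limb wraps to zero */
    printf ("%d\n", hi == 1 && lo == 0);   /* prints 1 */
    return 0;
  }
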
diff --git a/third_party/gmp/mpn/generic/mulmid_n.c b/third_party/gmp/mpn/generic/mulmid_n.c
new file mode 100644
index 0000000..ac7e8f1
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mulmid_n.c
@@ -0,0 +1,61 @@
+/* mpn_mulmid_n -- balanced middle product
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+void
+mpn_mulmid_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
+  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
+
+  if (n < MULMID_TOOM42_THRESHOLD)
+    {
+      mpn_mulmid_basecase (rp, ap, 2*n - 1, bp, n);
+    }
+  else
+    {
+      mp_ptr scratch;
+      TMP_DECL;
+      TMP_MARK;
+      scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (n));
+      mpn_toom42_mulmid (rp, ap, bp, n, scratch);
+      TMP_FREE;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/mulmod_bnm1.c b/third_party/gmp/mpn/generic/mulmod_bnm1.c
new file mode 100644
index 0000000..769bdbc
--- /dev/null
+++ b/third_party/gmp/mpn/generic/mulmod_bnm1.c
@@ -0,0 +1,354 @@
+/* mulmod_bnm1.c -- multiplication mod B^n-1.
+
+   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
+   Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
+   mod B^rn - 1, and values are semi-normalised; zero is represented
+   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
+   tp==rp is allowed. */
+void
+mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
+		    mp_ptr tp)
+{
+  mp_limb_t cy;
+
+  ASSERT (0 < rn);
+
+  mpn_mul_n (tp, ap, bp, rn);
+  cy = mpn_add_n (rp, tp, tp + rn, rn);
+  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
+   * be no overflow when adding in the carry. */
+  MPN_INCR_U (rp, rn, cy);
+}
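
The wraparound here is easiest to see on a single word.  A scalar sketch of
multiplication mod 2^32 - 1 (plain C, not GMP code): because 2^32 == 1
modulo the divisor, the high half of the double-word product simply adds
into the low half, and a carry out of that addition folds back in as +1,
which is exactly what the MPN_INCR_U above does:

  #include <stdint.h>
  #include <stdio.h>

  static uint32_t mulmod_2p32m1 (uint32_t a, uint32_t b)
  {
    uint64_t t = (uint64_t) a * b;
    uint32_t lo = (uint32_t) t, hi = (uint32_t) (t >> 32);
    uint32_t r = lo + hi;      /* 2^32 == 1, so hi*2^32 + lo == hi + lo */
    r += r < lo;               /* fold the carry back in, same reason */
    return r;                  /* semi-normalised: 0 may appear as 2^32 - 1 */
  }

  int main (void)
  {
    uint64_t t = (uint64_t) 0x87654321u * 0x12345678u;
    printf ("%d\n", mulmod_2p32m1 (0x87654321u, 0x12345678u) == t % 0xffffffffu);
    return 0;                  /* prints 1 */
  }
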
+
+
+/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
+   semi-normalised representation, computation is mod B^rn + 1. Needs
+   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
+   Output is normalised. */
+static void
+mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
+		    mp_ptr tp)
+{
+  mp_limb_t cy;
+
+  ASSERT (0 < rn);
+
+  mpn_mul_n (tp, ap, bp, rn + 1);
+  ASSERT (tp[2*rn+1] == 0);
+  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
+  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
+  rp[rn] = 0;
+  MPN_INCR_U (rp, rn+1, cy);
+}
+
+
+/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
+ *
+ * The result is expected to be ZERO if and only if one of the operands
+ * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
+ * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
+ * combine results and obtain a natural number when one knows in
+ * advance that the final value is less than (B^rn-1).
+ * Moreover it should not be a problem if mulmod_bnm1 is used to
+ * compute the full product with an+bn <= rn, because this condition
+ * implies (B^an-1)(B^bn-1) < (B^rn-1).
+ *
+ * Requires 0 < bn <= an <= rn and an + bn > rn/2
+ * Scratch need: rn + (need for recursive call OR rn + 4). This gives
+ *
+ * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
+ */
+void
+mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
+{
+  ASSERT (0 < bn);
+  ASSERT (bn <= an);
+  ASSERT (an <= rn);
+
+  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
+    {
+      if (UNLIKELY (bn < rn))
+	{
+	  if (UNLIKELY (an + bn <= rn))
+	    {
+	      mpn_mul (rp, ap, an, bp, bn);
+	    }
+	  else
+	    {
+	      mp_limb_t cy;
+	      mpn_mul (tp, ap, an, bp, bn);
+	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
+	      MPN_INCR_U (rp, rn, cy);
+	    }
+	}
+      else
+	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
+    }
+  else
+    {
+      mp_size_t n;
+      mp_limb_t cy;
+      mp_limb_t hi;
+
+      n = rn >> 1;
+
+      /* We need an + bn >= n to be able to fit one of the recursive
+	 products at rp. Requiring the strict inequality an + bn > n
+	 makes the code slightly simpler. If desired, we could avoid this
+	 restriction by initially halving rn as long as rn is even and
+	 an + bn <= rn/2. */
+
+      ASSERT (an + bn > n);
+
+      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
+	 and crt together as
+
+	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
+      */
+
+#define a0 ap
+#define a1 (ap + n)
+#define b0 bp
+#define b1 (bp + n)
+
+#define xp  tp	/* 2n + 2 */
+      /* am1  maybe in {xp, n} */
+      /* bm1  maybe in {xp + n, n} */
+#define sp1 (tp + 2*n + 2)
+      /* ap1  maybe in {sp1, n + 1} */
+      /* bp1  maybe in {sp1 + n + 1, n + 1} */
+
+      {
+	mp_srcptr am1, bm1;
+	mp_size_t anm, bnm;
+	mp_ptr so;
+
+	bm1 = b0;
+	bnm = bn;
+	if (LIKELY (an > n))
+	  {
+	    am1 = xp;
+	    cy = mpn_add (xp, a0, n, a1, an - n);
+	    MPN_INCR_U (xp, n, cy);
+	    anm = n;
+	    so = xp + n;
+	    if (LIKELY (bn > n))
+	      {
+		bm1 = so;
+		cy = mpn_add (so, b0, n, b1, bn - n);
+		MPN_INCR_U (so, n, cy);
+		bnm = n;
+		so += n;
+	      }
+	  }
+	else
+	  {
+	    so = xp;
+	    am1 = a0;
+	    anm = an;
+	  }
+
+	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
+      }
+
+      {
+	int       k;
+	mp_srcptr ap1, bp1;
+	mp_size_t anp, bnp;
+
+	bp1 = b0;
+	bnp = bn;
+	if (LIKELY (an > n)) {
+	  ap1 = sp1;
+	  cy = mpn_sub (sp1, a0, n, a1, an - n);
+	  sp1[n] = 0;
+	  MPN_INCR_U (sp1, n + 1, cy);
+	  anp = n + ap1[n];
+	  if (LIKELY (bn > n)) {
+	    bp1 = sp1 + n + 1;
+	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
+	    sp1[2*n+1] = 0;
+	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
+	    bnp = n + bp1[n];
+	  }
+	} else {
+	  ap1 = a0;
+	  anp = an;
+	}
+
+	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
+	  k = 0;
+	else
+	  {
+	    int mask;
+	    k = mpn_fft_best_k (n, 0);
+	    mask = (1 << k) - 1;
+	    while (n & mask) { k--; mask >>= 1; }
+	  }
+	if (k >= FFT_FIRST_K)
+	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
+	else if (UNLIKELY (bp1 == b0))
+	  {
+	    ASSERT (anp + bnp <= 2*n+1);
+	    ASSERT (anp + bnp > n);
+	    ASSERT (anp >= bnp);
+	    mpn_mul (xp, ap1, anp, bp1, bnp);
+	    anp = anp + bnp - n;
+	    ASSERT (anp <= n || xp[2*n]==0);
+	    anp -= anp > n;
+	    cy = mpn_sub (xp, xp, n, xp + n, anp);
+	    xp[n] = 0;
+	    MPN_INCR_U (xp, n+1, cy);
+	  }
+	else
+	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
+      }
+
+      /* Here the CRT recomposition begins.
+
+	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
+	 Division by 2 is a bitwise rotation.
+
+	 Assumes xp normalised mod (B^n+1).
+
+	 The residue class [0] is represented by [B^n-1], except when
+	 both inputs are ZERO.
+      */
+
+#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
+#if HAVE_NATIVE_mpn_rsh1add_nc
+      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
+      hi = cy << (GMP_NUMB_BITS - 1);
+      cy = 0;
+      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
+	 overflows, i.e. a further increment will not overflow again. */
+#else /* ! _nc */
+      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
+      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+      cy >>= 1;
+      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
+	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
+#endif
+#if GMP_NAIL_BITS == 0
+      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
+#else
+      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
+      rp[n-1] ^= hi;
+#endif
+#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
+#if HAVE_NATIVE_mpn_add_nc
+      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
+#else /* ! _nc */
+      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
+#endif
+      cy += (rp[0]&1);
+      mpn_rshift(rp, rp, n, 1);
+      ASSERT (cy <= 2);
+      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+      cy >>= 1;
+      /* We can have cy != 0 only if hi = 0... */
+      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
+      rp[n-1] |= hi;
+      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
+#endif
+      ASSERT (cy <= 1);
+      /* The next increment cannot overflow; see the previous comments about cy. */
+      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
+      MPN_INCR_U(rp, n, cy);
+
+      /* Compute the highest half:
+	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
+       */
+      if (UNLIKELY (an + bn < rn))
+	{
+	  /* Note that in this case, the only way the result can equal
+	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
+	     then the output of both the recursive calls and this CRT
+	     reconstruction is zero, not B^{rn} - 1. Which is good,
+	     since the latter representation doesn't fit in the output
+	     area.*/
+	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);
+
+	  /* FIXME: This subtraction of the high parts is not really
+	     necessary, we do it to get the carry out, and for sanity
+	     checking. */
+	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
+				   xp + an + bn - n, rn - (an + bn), cy);
+	  ASSERT (an + bn == rn - 1 ||
+		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
+	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
+	  ASSERT (cy == (xp + an + bn - n)[0]);
+	}
+      else
+	{
+	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+	     DECR will affect _at most_ the lowest n limbs. */
+	  MPN_DECR_U (rp, 2*n, cy);
+	}
+#undef a0
+#undef a1
+#undef b0
+#undef b1
+#undef xp
+#undef sp1
+    }
+}
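
The recombination just performed computes, in the notation of the comments
above, x = -xp * B^n + (B^n + 1) * [(xp + xm)/2 mod (B^n - 1)].  A toy
numeric check of that CRT formula with B^n scaled down to 2^8 (plain C, not
GMP code; halving mod the odd modulus 255 is exact, just as it is mod B^n - 1):

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    int64_t x = 54321;                    /* any x < 255 * 257 = 65535 */
    int64_t xm = x % 255;                 /* x mod (B^n - 1), with B^n = 256 */
    int64_t xp = x % 257;                 /* x mod (B^n + 1) */
    int64_t h = (xp + xm) % 255;
    h = (h & 1) ? (h + 255) / 2 : h / 2;  /* (xp + xm)/2 mod 255, exact */
    int64_t y = (257 * h - 256 * xp) % 65535;
    if (y < 0)
      y += 65535;                         /* the code above subtracts instead */
    printf ("%d\n", y == x);              /* prints 1 */
    return 0;
  }
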
+
+mp_size_t
+mpn_mulmod_bnm1_next_size (mp_size_t n)
+{
+  mp_size_t nh;
+
+  if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
+    return n;
+  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
+    return (n + (2-1)) & (-2);
+  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
+    return (n + (4-1)) & (-4);
+
+  nh = (n + 1) >> 1;
+
+  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
+    return (n + (8-1)) & (-8);
+
+  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
+}
diff --git a/third_party/gmp/mpn/generic/neg.c b/third_party/gmp/mpn/generic/neg.c
new file mode 100644
index 0000000..bec2a32
--- /dev/null
+++ b/third_party/gmp/mpn/generic/neg.c
@@ -0,0 +1,33 @@
+/* mpn_neg - negate an mpn.
+
+Copyright 2001, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_FORCE_mpn_neg 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/generic/nussbaumer_mul.c b/third_party/gmp/mpn/generic/nussbaumer_mul.c
new file mode 100644
index 0000000..3e0cf27
--- /dev/null
+++ b/third_party/gmp/mpn/generic/nussbaumer_mul.c
@@ -0,0 +1,70 @@
+/* mpn_nussbaumer_mul -- Multiply {ap,an} and {bp,bn} using
+   Nussbaumer's negacyclic convolution.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Multiply {ap,an} by {bp,bn}, and put the result in {pp, an+bn} */
+void
+mpn_nussbaumer_mul (mp_ptr pp,
+		    mp_srcptr ap, mp_size_t an,
+		    mp_srcptr bp, mp_size_t bn)
+{
+  mp_size_t rn;
+  mp_ptr tp;
+  TMP_DECL;
+
+  ASSERT (an >= bn);
+  ASSERT (bn > 0);
+
+  TMP_MARK;
+
+  if ((ap == bp) && (an == bn))
+    {
+      rn = mpn_sqrmod_bnm1_next_size (2*an);
+      tp = TMP_ALLOC_LIMBS (mpn_sqrmod_bnm1_itch (rn, an));
+      mpn_sqrmod_bnm1 (pp, rn, ap, an, tp);
+    }
+  else
+    {
+      rn = mpn_mulmod_bnm1_next_size (an + bn);
+      tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn, an, bn));
+      mpn_mulmod_bnm1 (pp, rn, ap, an, bp, bn, tp);
+    }
+
+  TMP_FREE;
+}
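
mpn_nussbaumer_mul can return a product computed mod B^rn - 1 as the plain
product because the _next_size call picks rn >= an + bn, so the result lies
below the modulus and the wraparound is a no-op.  A one-word sketch of both
situations (plain C, not GMP code):

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t fold32 (uint64_t t)   /* reduce mod 2^32 - 1 */
  {
    uint64_t r = (t & 0xffffffffu) + (t >> 32);
    return (r & 0xffffffffu) + (r >> 32);
  }

  int main (void)
  {
    uint64_t fits  = (uint64_t) 0xffffu * 0xfffeu;           /* below 2^32 - 1 */
    uint64_t wraps = (uint64_t) 0xffffffffu * 0xfffffffeu;   /* above it */
    printf ("%d %d\n", fold32 (fits) == fits,    /* 1: reduction is a no-op */
            fold32 (wraps) == wraps);            /* 0: correct only mod 2^32-1 */
    return 0;
  }

Note that fold32 (wraps) comes out as 0xffffffff, the semi-normalised
representation of zero discussed above.
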
diff --git a/third_party/gmp/mpn/generic/perfpow.c b/third_party/gmp/mpn/generic/perfpow.c
new file mode 100644
index 0000000..9d46477
--- /dev/null
+++ b/third_party/gmp/mpn/generic/perfpow.c
@@ -0,0 +1,342 @@
+/* mpn_perfect_power_p -- mpn perfect power detection.
+
+   Contributed to the GNU project by Martin Boij.
+
+Copyright 2009, 2010, 2012, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#define SMALL 20
+#define MEDIUM 100
+
+/* Return non-zero if {np,nn} == {xp,xn} ^ k.
+   Algorithm:
+       For s = 1, 2, 4, ..., s_max, compute the s least significant limbs of
+       {xp,xn}^k. Stop if they don't match the s least significant limbs of
+       {np,nn}.
+
+   FIXME: Low xn limbs can be expected to always match, if computed as a mod
+   B^{xn} root. So instead of using mpn_powlo, compute an approximation of the
+   most significant (normalized) limb of {xp,xn} ^ k (and an error bound), and
+   compare to {np, nn}. Or use an even cruder approximation based on fix-point
+   base 2 logarithm.  */
+static int
+pow_equals (mp_srcptr np, mp_size_t n,
+	    mp_srcptr xp,mp_size_t xn,
+	    mp_limb_t k, mp_bitcnt_t f,
+	    mp_ptr tp)
+{
+  mp_bitcnt_t y, z;
+  mp_size_t bn;
+  mp_limb_t h, l;
+
+  ASSERT (n > 1 || (n == 1 && np[0] > 1));
+  ASSERT (np[n - 1] > 0);
+  ASSERT (xn > 0);
+
+  if (xn == 1 && xp[0] == 1)
+    return 0;
+
+  z = 1 + (n >> 1);
+  for (bn = 1; bn < z; bn <<= 1)
+    {
+      mpn_powlo (tp, xp, &k, 1, bn, tp + bn);
+      if (mpn_cmp (tp, np, bn) != 0)
+	return 0;
+    }
+
+  /* Final check. Estimate the size of {xp,xn}^k before computing the power
+     with full precision.  Optimization: It might pay off to make a more
+     accurate estimation of the logarithm of {xp,xn}, rather than using the
+     index of the MSB.  */
+
+  MPN_SIZEINBASE_2EXP(y, xp, xn, 1);
+  y -= 1;  /* msb_index (xp, xn) */
+
+  umul_ppmm (h, l, k, y);
+  h -= l == 0;  --l;	/* two-limb decrement */
+
+  z = f - 1; /* msb_index (np, n) */
+  if (h == 0 && l <= z)
+    {
+      mp_limb_t *tp2;
+      mp_size_t i;
+      int ans;
+      mp_limb_t size;
+      TMP_DECL;
+
+      size = l + k;
+      ASSERT_ALWAYS (size >= k);
+
+      TMP_MARK;
+      y = 2 + size / GMP_LIMB_BITS;
+      tp2 = TMP_ALLOC_LIMBS (y);
+
+      i = mpn_pow_1 (tp, xp, xn, k, tp2);
+      if (i == n && mpn_cmp (tp, np, n) == 0)
+	ans = 1;
+      else
+	ans = 0;
+      TMP_FREE;
+      return ans;
+    }
+
+  return 0;
+}
+
+
+/* Return non-zero if N = {np,n} is a kth power.
+   I = {ip,n} = N^(-1) mod B^n.  */
+static int
+is_kth_power (mp_ptr rp, mp_srcptr np,
+	      mp_limb_t k, mp_srcptr ip,
+	      mp_size_t n, mp_bitcnt_t f,
+	      mp_ptr tp)
+{
+  mp_bitcnt_t b;
+  mp_size_t rn, xn;
+
+  ASSERT (n > 0);
+  ASSERT ((k & 1) != 0 || k == 2);
+  ASSERT ((np[0] & 1) != 0);
+
+  if (k == 2)
+    {
+      b = (f + 1) >> 1;
+      rn = 1 + b / GMP_LIMB_BITS;
+      if (mpn_bsqrtinv (rp, ip, b, tp) != 0)
+	{
+	  rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
+	  xn = rn;
+	  MPN_NORMALIZE (rp, xn);
+	  if (pow_equals (np, n, rp, xn, k, f, tp) != 0)
+	    return 1;
+
+	  /* Check if (2^b - r)^2 == n */
+	  mpn_neg (rp, rp, rn);
+	  rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
+	  MPN_NORMALIZE (rp, rn);
+	  if (pow_equals (np, n, rp, rn, k, f, tp) != 0)
+	    return 1;
+	}
+    }
+  else
+    {
+      b = 1 + (f - 1) / k;
+      rn = 1 + (b - 1) / GMP_LIMB_BITS;
+      mpn_brootinv (rp, ip, rn, k, tp);
+      if ((b % GMP_LIMB_BITS) != 0)
+	rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
+      MPN_NORMALIZE (rp, rn);
+      if (pow_equals (np, n, rp, rn, k, f, tp) != 0)
+	return 1;
+    }
+  MPN_ZERO (rp, rn); /* Untrash rp */
+  return 0;
+}
+
+static int
+perfpow (mp_srcptr np, mp_size_t n,
+	 mp_limb_t ub, mp_limb_t g,
+	 mp_bitcnt_t f, int neg)
+{
+  mp_ptr ip, tp, rp;
+  mp_limb_t k;
+  int ans;
+  mp_bitcnt_t b;
+  gmp_primesieve_t ps;
+  TMP_DECL;
+
+  ASSERT (n > 0);
+  ASSERT ((np[0] & 1) != 0);
+  ASSERT (ub > 0);
+
+  TMP_MARK;
+  gmp_init_primesieve (&ps);
+  b = (f + 3) >> 1;
+
+  TMP_ALLOC_LIMBS_3 (ip, n, rp, n, tp, 5 * n);
+
+  MPN_ZERO (rp, n);
+
+  /* FIXME: It seems the inverse in ninv is needed only to get non-inverted
+     roots. I.e., is_kth_power computes n^{1/2} as (n^{-1})^{-1/2} and
+     similarly for nth roots. It should be more efficient to compute n^{1/2} as
+     n * n^{-1/2}, with a mullo instead of a binvert. And we can do something
+     similar for kth roots if we switch to an iteration converging to n^{1/k -
+     1}, and we can then eliminate this binvert call. */
+  mpn_binvert (ip, np, 1 + (b - 1) / GMP_LIMB_BITS, tp);
+  if (b % GMP_LIMB_BITS)
+    ip[(b - 1) / GMP_LIMB_BITS] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1;
+
+  if (neg)
+    gmp_nextprime (&ps);
+
+  ans = 0;
+  if (g > 0)
+    {
+      ub = MIN (ub, g + 1);
+      while ((k = gmp_nextprime (&ps)) < ub)
+	{
+	  if ((g % k) == 0)
+	    {
+	      if (is_kth_power (rp, np, k, ip, n, f, tp) != 0)
+		{
+		  ans = 1;
+		  goto ret;
+		}
+	    }
+	}
+    }
+  else
+    {
+      while ((k = gmp_nextprime (&ps)) < ub)
+	{
+	  if (is_kth_power (rp, np, k, ip, n, f, tp) != 0)
+	    {
+	      ans = 1;
+	      goto ret;
+	    }
+	}
+    }
+ ret:
+  TMP_FREE;
+  return ans;
+}
+
+static const unsigned short nrtrial[] = { 100, 500, 1000 };
+
+/* Table of (log_{p_i} 2) values, where p_i is the (nrtrial[i] + 1)'th prime
+   number.  */
+static const double logs[] =
+  { 0.1099457228193620, 0.0847016403115322, 0.0772048195144415 };
+
+int
+mpn_perfect_power_p (mp_srcptr np, mp_size_t n)
+{
+  mp_limb_t *nc, factor, g;
+  mp_limb_t exp, d;
+  mp_bitcnt_t twos, count;
+  int ans, where, neg, trial;
+  TMP_DECL;
+
+  neg = n < 0;
+  if (neg)
+    {
+      n = -n;
+    }
+
+  if (n == 0 || (n == 1 && np[0] == 1)) /* Valgrind doesn't like
+					   (n <= (np[0] == 1)) */
+    return 1;
+
+  TMP_MARK;
+
+  count = 0;
+
+  twos = mpn_scan1 (np, 0);
+  if (twos != 0)
+    {
+      mp_size_t s;
+      if (twos == 1)
+	{
+	  return 0;
+	}
+      s = twos / GMP_LIMB_BITS;
+      if (s + 1 == n && POW2_P (np[s]))
+	{
+	  return ! (neg && POW2_P (twos));
+	}
+      count = twos % GMP_LIMB_BITS;
+      n -= s;
+      np += s;
+      if (count > 0)
+	{
+	  nc = TMP_ALLOC_LIMBS (n);
+	  mpn_rshift (nc, np, n, count);
+	  n -= (nc[n - 1] == 0);
+	  np = nc;
+	}
+    }
+  g = twos;
+
+  trial = (n > SMALL) + (n > MEDIUM);
+
+  where = 0;
+  factor = mpn_trialdiv (np, n, nrtrial[trial], &where);
+
+  if (factor != 0)
+    {
+      if (count == 0) /* We did not allocate nc yet. */
+	{
+	  nc = TMP_ALLOC_LIMBS (n);
+	}
+
+      /* Remove factors found by trialdiv.  Optimization: if mpn_remove
+	 defined an _itch function, we could allocate its scratch just once.  */
+
+      do
+	{
+	  binvert_limb (d, factor);
+
+	  /* After the first round we always have nc == np */
+	  exp = mpn_remove (nc, &n, np, n, &d, 1, ~(mp_bitcnt_t)0);
+
+	  if (g == 0)
+	    g = exp;
+	  else
+	    g = mpn_gcd_1 (&g, 1, exp);
+
+	  if (g == 1)
+	    {
+	      ans = 0;
+	      goto ret;
+	    }
+
+	  if ((n == 1) & (nc[0] == 1))
+	    {
+	      ans = ! (neg && POW2_P (g));
+	      goto ret;
+	    }
+
+	  np = nc;
+	  factor = mpn_trialdiv (np, n, nrtrial[trial], &where);
+	}
+      while (factor != 0);
+    }
+
+  MPN_SIZEINBASE_2EXP(count, np, n, 1);   /* log (np) + 1 */
+  d = (mp_limb_t) (count * logs[trial] + 1e-9) + 1;
+  ans = perfpow (np, n, d, g, count, neg);
+
+ ret:
+  TMP_FREE;
+  return ans;
+}
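
The structure of the search above follows from a simple fact: N > 1 is a
perfect power exactly when the gcd of all prime multiplicities in its
factorisation is at least 2, which is why the code maintains the gcd g of
the exponents seen so far and, once g is known, only tries prime k dividing
g.  A toy single-word version of the same argument (plain C, not GMP code):

  #include <stdio.h>

  static unsigned gcd (unsigned a, unsigned b)
  {
    while (b != 0) { unsigned t = a % b; a = b; b = t; }
    return a;
  }

  static int is_perfect_power (unsigned n)   /* n > 1 */
  {
    unsigned g = 0;                          /* gcd (0, e) == e */
    for (unsigned p = 2; p * p <= n; p++)
      if (n % p == 0)
        {
          unsigned e = 0;
          while (n % p == 0) { n /= p; e++; }
          g = gcd (g, e);
          if (g == 1)
            return 0;          /* exponents already coprime: not a power */
        }
    return n == 1 && g >= 2;   /* a leftover prime would have exponent 1 */
  }

  int main (void)
  {
    printf ("%d %d %d\n", is_perfect_power (216),   /* 6^3      -> 1 */
            is_perfect_power (64),                  /* 2^6      -> 1 */
            is_perfect_power (72));                 /* 2^3*3^2  -> 0 */
    return 0;
  }
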
diff --git a/third_party/gmp/mpn/generic/perfsqr.c b/third_party/gmp/mpn/generic/perfsqr.c
new file mode 100644
index 0000000..38a1a91
--- /dev/null
+++ b/third_party/gmp/mpn/generic/perfsqr.c
@@ -0,0 +1,239 @@
+/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
+   zero otherwise.
+
+Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h> /* for NULL */
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "perfsqr.h"
+
+
+/* change this to "#define TRACE(x) x" for diagnostics */
+#define TRACE(x)
+
+
+
+/* PERFSQR_MOD_* detects non-squares using residue tests.
+
+   A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
+   {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
+   64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
+   or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
+   used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
+   case too.
+
+   PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
+   PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
+   data indicating residues and non-residues modulo those divisors.  The
+   table data is in 1 or 2 limbs worth of bits respectively, per the size of
+   each d.
+
+   A "modexact" style remainder is taken to reduce r modulo d.
+   PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
+   the table data.  Notice there's just one multiplication by a constant
+   "inv", for each d.
+
+   The modexact doesn't produce a true r%d remainder, instead idx satisfies
+   "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
+   -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
+   accounted for by having the table data suitably permuted.
+
+   The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
+   In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
+   each divisor d meaning the modexact multiply can take place entirely
+   within one limb, giving the compiler the chance to optimize it, in a way
+   that say umul_ppmm would not give.
+
+   There's no need for the divisors d to be prime, in fact gen-psqr.c makes
+   a deliberate effort to combine factors so as to reduce the number of
+   separate tests done on r.  But such combining is limited to d <=
+   2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
+
+   Alternatives:
+
+   It'd be possible to use bigger divisors d, and more than 2 limbs of table
+   data, but this doesn't look like it would be of much help to the prime
+   factors in the usual moduli 2^24-1 or 2^48-1.
+
+   The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
+   just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
+   factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
+   fewer prime factors.
+
+   The nails case usually ends up using mpn_mod_1, which is a lot slower
+   than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
+   for the nails case.  Two-term things like 2^30-2^15-1 might be
+   candidates.  Or at worst some on-the-fly de-nailing would allow the plain
+   2^24-1 to be used.  Currently nails are too preliminary to be worried
+   about.
+
+*/
+
+#define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
+
+#define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
+#define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
+
+#define PERFSQR_MOD_34(r, up, usize)				\
+  do {								\
+    (r) = mpn_mod_34lsub1 (up, usize);				\
+    (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);		\
+  } while (0)
+
+/* FIXME: The %= here isn't good, and might destroy any savings from keeping
+   the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
+   Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
+   and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
+   normal case, so let's not worry too much about mod_1.  */
+#define PERFSQR_MOD_PP(r, up, usize)					\
+  do {									\
+    if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))	\
+      {									\
+	(r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,		\
+				PERFSQR_PP_INVERTED);			\
+	(r) %= PERFSQR_PP;						\
+      }									\
+    else								\
+      {									\
+	(r) = mpn_mod_1 (up, usize, PERFSQR_PP);			\
+      }									\
+  } while (0)
+
+#define PERFSQR_MOD_IDX(idx, r, d, inv)				\
+  do {								\
+    mp_limb_t  q;						\
+    ASSERT ((r) <= PERFSQR_MOD_MASK);				\
+    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);		\
+    ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);		\
+								\
+    q = ((r) * (inv)) & PERFSQR_MOD_MASK;			\
+    ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));		\
+    (idx) = (q * (d)) >> PERFSQR_MOD_BITS;			\
+  } while (0)
+
+#define PERFSQR_MOD_1(r, d, inv, mask)				\
+  do {								\
+    unsigned   idx;						\
+    ASSERT ((d) <= GMP_LIMB_BITS);				\
+    PERFSQR_MOD_IDX(idx, r, d, inv);				\
+    TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",	\
+		   d, r%d, idx));				\
+    if ((((mask) >> idx) & 1) == 0)				\
+      {								\
+	TRACE (printf ("  non-square\n"));			\
+	return 0;						\
+      }								\
+  } while (0)
+
+/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
+   sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
+#define PERFSQR_MOD_2(r, d, inv, mhi, mlo)			\
+  do {								\
+    mp_limb_t  m;						\
+    unsigned   idx;						\
+    ASSERT ((d) <= 2*GMP_LIMB_BITS);				\
+								\
+    PERFSQR_MOD_IDX (idx, r, d, inv);				\
+    TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",	\
+		   d, r%d, idx));				\
+    m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));	\
+    idx %= GMP_LIMB_BITS;					\
+    if (((m >> idx) & 1) == 0)					\
+      {								\
+	TRACE (printf ("  non-square\n"));			\
+	return 0;						\
+      }								\
+  } while (0)
+
+
+int
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+{
+  ASSERT (usize >= 1);
+
+  TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
+
+  /* The first test excludes 212/256 (82.8%) of the perfect square candidates
+     in O(1) time.  */
+  {
+    unsigned  idx = up[0] % 0x100;
+    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
+	  >> (idx % GMP_LIMB_BITS)) & 1) == 0)
+      return 0;
+  }
+
+#if 0
+  /* Check that we have even multiplicity of 2, and then check that the rest is
+     a possible perfect square.  Leave disabled until we can determine this
+     really is an improvement.  If it is, it could completely replace the
+     simple probe above, since this should throw out more non-squares, but at
+     the expense of somewhat more cycles.  */
+  {
+    mp_limb_t lo;
+    int cnt;
+    lo = up[0];
+    while (lo == 0)
+      up++, lo = up[0], usize--;
+    count_trailing_zeros (cnt, lo);
+    if ((cnt & 1) != 0)
+      return 0;			/* not a perfect square: odd multiplicity of 2 */
+    lo >>= cnt;			/* shift down to align lowest non-zero bit */
+    lo >>= 1;			/* shift away lowest non-zero bit */
+    if ((lo & 3) != 0)
+      return 0;
+  }
+#endif
+
+
+  /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
+     according to their residues modulo small primes (or powers of
+     primes).  See perfsqr.h.  */
+  PERFSQR_MOD_TEST (up, usize);
+
+
+  /* For the third and last test, we finally compute the square root,
+     to make sure we've really got a perfect square.  */
+  {
+    mp_ptr root_ptr;
+    int res;
+    TMP_DECL;
+
+    TMP_MARK;
+    root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
+
+    /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
+    res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
+    TMP_FREE;
+
+    return res;
+  }
+}
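
The 212/256 figure can be reproduced directly: squaring every residue class
mod 256 hits only 44 distinct values, and sq_res_0x100 is that set packed
as a bitmap.  A standalone count (plain C, not GMP code):

  #include <stdint.h>
  #include <stdio.h>

  int main (void)
  {
    uint8_t is_res[256] = { 0 };
    for (unsigned i = 0; i < 256; i++)
      is_res[(i * i) & 0xff] = 1;     /* mark the quadratic residues */

    unsigned hits = 0;
    for (unsigned r = 0; r < 256; r++)
      hits += is_res[r];
    printf ("%u\n", hits);            /* prints 44, i.e. 256 - 212 */
    return 0;
  }
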
diff --git a/third_party/gmp/mpn/generic/popham.c b/third_party/gmp/mpn/generic/popham.c
new file mode 100644
index 0000000..87974d7
--- /dev/null
+++ b/third_party/gmp/mpn/generic/popham.c
@@ -0,0 +1,125 @@
+/* mpn_popcount, mpn_hamdist -- mpn bit population count/hamming distance.
+
+Copyright 1994, 1996, 2000-2002, 2005, 2011, 2012 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if OPERATION_popcount
+#define FNAME mpn_popcount
+#define POPHAM(u,v) u
+#endif
+
+#if OPERATION_hamdist
+#define FNAME mpn_hamdist
+#define POPHAM(u,v) u ^ v
+#endif
+
+mp_bitcnt_t
+FNAME (mp_srcptr up,
+#if OPERATION_hamdist
+       mp_srcptr vp,
+#endif
+       mp_size_t n) __GMP_NOTHROW
+{
+  mp_bitcnt_t result = 0;
+  mp_limb_t p0, p1, p2, p3, x, p01, p23;
+  mp_size_t i;
+
+  ASSERT (n >= 1);		/* Actually, this code handles any n, but some
+				   assembly implementations do not.  */
+
+  for (i = n >> 2; i != 0; i--)
+    {
+      p0 = POPHAM (up[0], vp[0]);
+      p0 -= (p0 >> 1) & MP_LIMB_T_MAX/3;				/* 2 0-2 */
+      p0 = ((p0 >> 2) & MP_LIMB_T_MAX/5) + (p0 & MP_LIMB_T_MAX/5);	/* 4 0-4 */
+
+      p1 = POPHAM (up[1], vp[1]);
+      p1 -= (p1 >> 1) & MP_LIMB_T_MAX/3;				/* 2 0-2 */
+      p1 = ((p1 >> 2) & MP_LIMB_T_MAX/5) + (p1 & MP_LIMB_T_MAX/5);	/* 4 0-4 */
+
+      p01 = p0 + p1;							/* 8 0-8 */
+      p01 = ((p01 >> 4) & MP_LIMB_T_MAX/17) + (p01 & MP_LIMB_T_MAX/17);	/* 8 0-16 */
+
+      p2 = POPHAM (up[2], vp[2]);
+      p2 -= (p2 >> 1) & MP_LIMB_T_MAX/3;				/* 2 0-2 */
+      p2 = ((p2 >> 2) & MP_LIMB_T_MAX/5) + (p2 & MP_LIMB_T_MAX/5);	/* 4 0-4 */
+
+      p3 = POPHAM (up[3], vp[3]);
+      p3 -= (p3 >> 1) & MP_LIMB_T_MAX/3;				/* 2 0-2 */
+      p3 = ((p3 >> 2) & MP_LIMB_T_MAX/5) + (p3 & MP_LIMB_T_MAX/5);	/* 4 0-4 */
+
+      p23 = p2 + p3;							/* 8 0-8 */
+      p23 = ((p23 >> 4) & MP_LIMB_T_MAX/17) + (p23 & MP_LIMB_T_MAX/17);	/* 8 0-16 */
+
+      x = p01 + p23;							/* 8 0-32 */
+      x = (x >> 8) + x;							/* 8 0-64 */
+      x = (x >> 16) + x;						/* 8 0-128 */
+#if GMP_LIMB_BITS > 32
+      x = ((x >> 32) & 0xff) + (x & 0xff);				/* 8 0-256 */
+      result += x;
+#else
+      result += x & 0xff;
+#endif
+      up += 4;
+#if OPERATION_hamdist
+      vp += 4;
+#endif
+    }
+
+  n &= 3;
+  if (n != 0)
+    {
+      x = 0;
+      do
+	{
+	  p0 = POPHAM (up[0], vp[0]);
+	  p0 -= (p0 >> 1) & MP_LIMB_T_MAX/3;				/* 2 0-2 */
+	  p0 = ((p0 >> 2) & MP_LIMB_T_MAX/5) + (p0 & MP_LIMB_T_MAX/5);	/* 4 0-4 */
+	  p0 = ((p0 >> 4) + p0) & MP_LIMB_T_MAX/17;			/* 8 0-8 */
+
+	  x += p0;
+	  up += 1;
+#if OPERATION_hamdist
+	  vp += 1;
+#endif
+	}
+      while (--n);
+
+      x = (x >> 8) + x;
+      x = (x >> 16) + x;
+#if GMP_LIMB_BITS > 32
+      x = (x >> 32) + x;
+#endif
+      result += x & 0xff;
+    }
+
+  return result;
+}
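
The loop above is a SWAR reduction; on one 64-bit limb the same mask
constants (MP_LIMB_T_MAX/3 = 0x5555..., /5 = 0x3333..., /17 = 0x0f0f...)
give the classic branch-free popcount.  A standalone sketch (plain C, not
GMP code), ending with a multiply where the loop above uses its
shift-and-add chain:

  #include <stdint.h>
  #include <stdio.h>

  static unsigned popcount64 (uint64_t x)
  {
    x -= (x >> 1) & (UINT64_MAX / 3);                            /* 2-bit sums */
    x = ((x >> 2) & (UINT64_MAX / 5)) + (x & (UINT64_MAX / 5));  /* 4-bit sums */
    x = ((x >> 4) + x) & (UINT64_MAX / 17);                      /* 8-bit sums */
    return (x * (UINT64_MAX / 255)) >> 56;                       /* add all bytes */
  }

  int main (void)
  {
    printf ("%u\n", popcount64 (0xf0f0f0f0f0f0f0f0u));   /* prints 32 */
    return 0;
  }
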
diff --git a/third_party/gmp/mpn/generic/pow_1.c b/third_party/gmp/mpn/generic/pow_1.c
new file mode 100644
index 0000000..de11cd2
--- /dev/null
+++ b/third_party/gmp/mpn/generic/pow_1.c
@@ -0,0 +1,135 @@
+/* mpn_pow_1 -- Compute powers R = U^exp.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2002, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_size_t
+mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp)
+{
+  mp_limb_t x;
+  int cnt, i;
+  mp_size_t rn;
+  int par;
+
+  ASSERT (bn >= 1);
+  /* FIXME: Add operand overlap criteria */
+
+  if (exp <= 1)
+    {
+      if (exp == 0)
+	{
+	  rp[0] = 1;
+	  return 1;
+	}
+      else
+	{
+	  MPN_COPY (rp, bp, bn);
+	  return bn;
+	}
+    }
+
+  /* Count number of bits in exp, and compute where to put initial square in
+     order to magically get results in the entry rp.  Use simple code,
+     optimized for small exp.  For large exp, the bignum operations will take
+     so much time that the slowness of this code will be negligible.  */
+  par = 0;
+  cnt = GMP_LIMB_BITS;
+  x = exp;
+  do
+    {
+      par ^= x;
+      cnt--;
+      x >>= 1;
+    } while (x != 0);
+  exp <<= cnt;
+
+  if (bn == 1)
+    {
+      mp_limb_t rl, rh, bl = bp[0];
+
+      if ((cnt & 1) != 0)
+	MP_PTR_SWAP (rp, tp);
+
+      umul_ppmm (rh, rl, bl, bl << GMP_NAIL_BITS);
+      rp[0] = rl >> GMP_NAIL_BITS;
+      rp[1] = rh;
+      rn = 1 + (rh != 0);
+
+      for (i = GMP_LIMB_BITS - cnt - 1;;)
+	{
+	  exp <<= 1;
+	  if ((exp & GMP_LIMB_HIGHBIT) != 0)
+	    {
+	      rp[rn] = rh = mpn_mul_1 (rp, rp, rn, bl);
+	      rn += rh != 0;
+	    }
+
+	  if (--i == 0)
+	    break;
+
+	  mpn_sqr (tp, rp, rn);
+	  rn = 2 * rn; rn -= tp[rn - 1] == 0;
+	  MP_PTR_SWAP (rp, tp);
+	}
+    }
+  else
+    {
+      if (((par ^ cnt) & 1) == 0)
+	MP_PTR_SWAP (rp, tp);
+
+      mpn_sqr (rp, bp, bn);
+      rn = 2 * bn; rn -= rp[rn - 1] == 0;
+
+      for (i = GMP_LIMB_BITS - cnt - 1;;)
+	{
+	  exp <<= 1;
+	  if ((exp & GMP_LIMB_HIGHBIT) != 0)
+	    {
+	      rn = rn + bn - (mpn_mul (tp, rp, rn, bp, bn) == 0);
+	      MP_PTR_SWAP (rp, tp);
+	    }
+
+	  if (--i == 0)
+	    break;
+
+	  mpn_sqr (tp, rp, rn);
+	  rn = 2 * rn; rn -= tp[rn - 1] == 0;
+	  MP_PTR_SWAP (rp, tp);
+	}
+    }
+
+  return rn;
+}
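
Stripped of the limb bookkeeping, the structure above is left-to-right
binary exponentiation: normalise the exponent so its most significant bit
comes first, then square once per remaining bit and multiply in the base
wherever a bit is set.  A one-word sketch (plain C, not GMP code; the
result wraps mod 2^64):

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t pow_u64 (uint64_t b, uint64_t e)   /* e >= 1 */
  {
    int cnt = 63;
    while ((e >> cnt) == 0)
      cnt--;                         /* index of the top set bit */
    uint64_t r = b;                  /* that bit is handled by r = b */
    for (int i = cnt - 1; i >= 0; i--)
      {
        r *= r;                      /* one squaring per remaining bit */
        if ((e >> i) & 1)
          r *= b;                    /* multiply where the bit is set */
      }
    return r;
  }

  int main (void)
  {
    printf ("%d\n", pow_u64 (3, 13) == 1594323);   /* 3^13, prints 1 */
    return 0;
  }

mpn_pow_1 additionally computes the exponent's parity and bit count up front
(the par/cnt loop) so that, after the sequence of buffer swaps, the final
result lands in rp rather than tp.
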
diff --git a/third_party/gmp/mpn/generic/powlo.c b/third_party/gmp/mpn/generic/powlo.c
new file mode 100644
index 0000000..3b26bca
--- /dev/null
+++ b/third_party/gmp/mpn/generic/powlo.c
@@ -0,0 +1,188 @@
+/* mpn_powlo -- Compute R = U^E mod B^n, where B is the limb base.
+
+Copyright 2007-2009, 2012, 2015, 2016, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#define getbit(p,bi) \
+  ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
+
+static inline mp_limb_t
+getbits (const mp_limb_t *p, mp_bitcnt_t bi, unsigned nbits)
+{
+  unsigned nbits_in_r;
+  mp_limb_t r;
+  mp_size_t i;
+
+  if (bi < nbits)
+    {
+      return p[0] & (((mp_limb_t) 1 << bi) - 1);
+    }
+  else
+    {
+      bi -= nbits;			/* bit index of low bit to extract */
+      i = bi / GMP_NUMB_BITS;		/* word index of low bit to extract */
+      bi %= GMP_NUMB_BITS;		/* bit index in low word */
+      r = p[i] >> bi;			/* extract (low) bits */
+      nbits_in_r = GMP_NUMB_BITS - bi;	/* number of bits now in r */
+      if (nbits_in_r < nbits)		/* did we get enough bits? */
+	r += p[i + 1] << nbits_in_r;	/* prepend bits from higher word */
+      return r & (((mp_limb_t ) 1 << nbits) - 1);
+    }
+}
+
+static inline unsigned
+win_size (mp_bitcnt_t eb)
+{
+  unsigned k;
+  static mp_bitcnt_t x[] = {7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
+  ASSERT (eb > 1);
+  for (k = 0; eb > x[k++];)
+    ;
+  return k;
+}
+
+/* rp[n-1..0] = bp[n-1..0] ^ ep[en-1..0] mod B^n, B is the limb base.
+   Requires that ep[en-1] is non-zero.
+   Uses scratch space tp[3n-1..0], i.e., 3n words.  */
+/* We use only n words of the scratch space ourselves; tp + n should be
+   passed on to mullo/sqrlo as the temporary area they need. */
+void
+mpn_powlo (mp_ptr rp, mp_srcptr bp,
+	   mp_srcptr ep, mp_size_t en,
+	   mp_size_t n, mp_ptr tp)
+{
+  unsigned cnt;
+  mp_bitcnt_t ebi;
+  unsigned windowsize, this_windowsize;
+  mp_limb_t expbits;
+  mp_limb_t *pp;
+  long i;
+  int flipflop;
+  TMP_DECL;
+
+  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
+
+  TMP_MARK;
+
+  MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
+
+  windowsize = win_size (ebi);
+  if (windowsize > 1)
+    {
+      mp_limb_t *this_pp, *last_pp;
+      ASSERT (windowsize < ebi);
+
+      pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1)));
+
+      this_pp = pp;
+
+      MPN_COPY (this_pp, bp, n);
+
+      /* Store b^2 in tp.  */
+      mpn_sqrlo (tp, bp, n);
+
+      /* Precompute odd powers of b and put them in the temporary area at pp.  */
+      i = (1 << (windowsize - 1)) - 1;
+      do
+	{
+	  last_pp = this_pp;
+	  this_pp += n;
+	  mpn_mullo_n (this_pp, last_pp, tp, n);
+	} while (--i != 0);
+
+      expbits = getbits (ep, ebi, windowsize);
+
+      /* THINK: Should we initialise the case expbits % 4 == 0 with a mullo? */
+      count_trailing_zeros (cnt, expbits);
+      ebi -= windowsize;
+      ebi += cnt;
+      expbits >>= cnt;
+
+      MPN_COPY (rp, pp + n * (expbits >> 1), n);
+    }
+  else
+    {
+      pp = tp + n;
+      MPN_COPY (pp, bp, n);
+      MPN_COPY (rp, bp, n);
+      --ebi;
+    }
+
+  flipflop = 0;
+
+  do
+    {
+      while (getbit (ep, ebi) == 0)
+	{
+	  mpn_sqrlo (tp, rp, n);
+	  MP_PTR_SWAP (rp, tp);
+	  flipflop = ! flipflop;
+	  if (--ebi == 0)
+	    goto done;
+	}
+
+      /* The next bit of the exponent is 1.  Now extract the largest block of
+	 bits <= windowsize, and such that the least significant bit is 1.  */
+
+      expbits = getbits (ep, ebi, windowsize);
+      this_windowsize = MIN (windowsize, ebi);
+      ebi -= this_windowsize;
+
+      count_trailing_zeros (cnt, expbits);
+      this_windowsize -= cnt;
+      ebi += cnt;
+      expbits >>= cnt;
+
+      while (this_windowsize > 1)
+	{
+	  mpn_sqrlo (tp, rp, n);
+	  mpn_sqrlo (rp, tp, n);
+	  this_windowsize -= 2;
+	}
+
+      if (this_windowsize != 0)
+	mpn_sqrlo (tp, rp, n);
+      else
+	{
+	  MP_PTR_SWAP (rp, tp);
+	  flipflop = ! flipflop;
+	}
+
+      mpn_mullo_n (rp, tp, pp + n * (expbits >> 1), n);
+    } while (ebi != 0);
+
+ done:
+  if (flipflop)
+    MPN_COPY (tp, rp, n);
+  TMP_FREE;
+}
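
The precomputation above stores only odd powers: entry i of the table holds
b^(2i+1), which is why an odd window value expbits is looked up at
pp + n * (expbits >> 1).  A one-word sketch of the table construction and
lookup (plain C, not GMP code; values wrap mod 2^64 just as mpn_mullo_n
works mod B^n):

  #include <stdint.h>
  #include <stdio.h>

  #define WINDOW 4                         /* window size in bits */

  int main (void)
  {
    uint64_t b = 0x9e3779b97f4a7c15u;
    uint64_t pp[1 << (WINDOW - 1)];        /* b^1, b^3, ..., b^15 */
    uint64_t b2 = b * b;                   /* shared by every step */

    pp[0] = b;
    for (int i = 1; i < (1 << (WINDOW - 1)); i++)
      pp[i] = pp[i - 1] * b2;              /* b^(2i+1) = b^(2i-1) * b^2 */

    uint64_t w = 11;                       /* an odd window value */
    uint64_t check = 1;
    for (int i = 0; i < 11; i++)
      check *= b;                          /* b^11 the slow way */
    printf ("%d\n", pp[w >> 1] == check);  /* prints 1 */
    return 0;
  }
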
diff --git a/third_party/gmp/mpn/generic/powm.c b/third_party/gmp/mpn/generic/powm.c
new file mode 100644
index 0000000..2828103
--- /dev/null
+++ b/third_party/gmp/mpn/generic/powm.c
@@ -0,0 +1,635 @@
+/* mpn_powm -- Compute R = U^E mod M.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2007-2012, 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+  BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
+
+  1. W <- U
+
+  2. T <- (B^n * U) mod M                Convert to REDC form
+
+  3. Compute table U^1, U^3, U^5... of E-dependent size
+
+  4. While there are more bits in E
+       W <- power left-to-right base-k
+
+
+  TODO:
+
+   * Make getbits a macro, thereby allowing it to update the index operand.
+     That will simplify the code using getbits.  (Perhaps make getbits' sibling
+     getbit then have similar form, for symmetry.)
+
+   * Write an itch function.  Or perhaps get rid of tp parameter since the huge
+     pp area is allocated locally anyway?
+
+   * Choose window size without looping.  (Superoptimize or think(tm).)
+
+   * Handle small bases with initial, reduction-free exponentiation.
+
+   * Call new division functions, not mpn_tdiv_qr.
+
+   * Consider special code for one-limb M.
+
+   * How should we handle the redc1/redc2/redc_n choice?
+     - redc1:  T(binvert_1limb)  + e * (n)   * (T(mullo-1x1) + n*T(addmul_1))
+     - redc2:  T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2))
+     - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n)))
+     This disregards the addmul_N constant term, but we could think of
+     that as part of the respective mullo.
+
+   * When U (the base) is small, we should start the exponentiation with plain
+     operations, then convert that partial result to REDC form.
+
+   * When U is just one limb, should it be handled without the k-ary tricks?
+     We could keep a factor of B^n in W, but use U' = BU as base.  After
+     multiplying by this (pseudo two-limb) number, we need to multiply by 1/B
+     mod M.
+*/
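+
+/* A minimal illustrative sketch of steps 1-4 above on a one-limb modulus,
+   skipping the REDC conversion of step 2 and fixing the window size at 4.
+   Not part of the library; assumes <stdint.h> types and a compiler that
+   provides unsigned __int128.  */
+#if 0
+static uint64_t
+mulmod_u64 (uint64_t a, uint64_t b, uint64_t m)
+{
+  return (uint64_t) (((unsigned __int128) a * b) % m);
+}
+
+static uint64_t
+powm_u64 (uint64_t u, uint64_t e, uint64_t m)	/* requires e > 0, m > 1 */
+{
+  uint64_t u2 = mulmod_u64 (u, u, m);
+  uint64_t tab[8], w = 1;
+  int i, b, k;
+
+  tab[0] = u % m;		/* table of odd powers u^1, u^3, ..., u^15 */
+  for (i = 1; i < 8; i++)
+    tab[i] = mulmod_u64 (tab[i - 1], u2, m);
+
+  for (b = 63; ((e >> b) & 1) == 0; b--)	/* find the top set bit */
+    ;
+  while (b >= 0)
+    {
+      if (((e >> b) & 1) == 0)
+	{				/* zero bit: just square */
+	  w = mulmod_u64 (w, w, m);
+	  b--;
+	}
+      else
+	{				/* grab <= 4 bits whose low bit is 1 */
+	  uint64_t bits;
+	  k = b >= 3 ? 4 : b + 1;
+	  bits = (e >> (b - k + 1)) & ((1 << k) - 1);
+	  while ((bits & 1) == 0)	/* push trailing zeros back */
+	    {
+	      bits >>= 1;
+	      k--;
+	    }
+	  for (i = 0; i < k; i++)
+	    w = mulmod_u64 (w, w, m);
+	  w = mulmod_u64 (w, tab[bits >> 1], m);
+	  b -= k;
+	}
+    }
+  return w;
+}
+#endif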
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#undef MPN_REDC_0
+#define MPN_REDC_0(rp, up, mp, invm)					\
+  do {									\
+    mp_limb_t p1, r0, u0, _dummy;					\
+    u0 = *(up);								\
+    umul_ppmm (p1, _dummy, *(mp), (u0 * (invm)) & GMP_NUMB_MASK);	\
+    ASSERT (((u0 + _dummy) & GMP_NUMB_MASK) == 0);			\
+    p1 += (u0 != 0);							\
+    r0 = (up)[1] + p1;							\
+    if (p1 > r0)							\
+      r0 -= *(mp);							\
+    *(rp) = r0;								\
+  } while (0)
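+
+/* Worked toy instance of the reduction above (illustrative only, with
+   B = 16 instead of a full limb): m = 13, invm = -13^{-1} mod 16 = 11,
+   and U = {4, 6} = 100.  Then q = 4*11 mod 16 = 12, U + q*m = 256, and
+   256/16 = 16, which the conditional subtraction reduces to 3; indeed
+   100 * 16^{-1} mod 13 = 3.  */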
+
+#undef MPN_REDC_1
+#if HAVE_NATIVE_mpn_sbpi1_bdiv_r
+#define MPN_REDC_1(rp, up, mp, n, invm)					\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm);			\
+    if (cy != 0)							\
+      mpn_sub_n (rp, up + n, mp, n);					\
+    else								\
+      MPN_COPY (rp, up + n, n);						\
+  } while (0)
+#else
+#define MPN_REDC_1(rp, up, mp, n, invm)					\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_redc_1 (rp, up, mp, n, invm);				\
+    if (cy != 0)							\
+      mpn_sub_n (rp, rp, mp, n);					\
+  } while (0)
+#endif
+
+#undef MPN_REDC_2
+#define MPN_REDC_2(rp, up, mp, n, mip)					\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_redc_2 (rp, up, mp, n, mip);				\
+    if (cy != 0)							\
+      mpn_sub_n (rp, rp, mp, n);					\
+  } while (0)
+
+#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
+#define WANT_REDC_2 1
+#endif
+
+#define getbit(p,bi) \
+  ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1)
+
+static inline mp_limb_t
+getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
+{
+  int nbits_in_r;
+  mp_limb_t r;
+  mp_size_t i;
+
+  if (bi < nbits)
+    {
+      return p[0] & (((mp_limb_t) 1 << bi) - 1);
+    }
+  else
+    {
+      bi -= nbits;			/* bit index of low bit to extract */
+      i = bi / GMP_NUMB_BITS;		/* word index of low bit to extract */
+      bi %= GMP_NUMB_BITS;		/* bit index in low word */
+      r = p[i] >> bi;			/* extract (low) bits */
+      nbits_in_r = GMP_NUMB_BITS - bi;	/* number of bits now in r */
+      if (nbits_in_r < nbits)		/* did we get enough bits? */
+	r += p[i + 1] << nbits_in_r;	/* prepend bits from higher word */
+      return r & (((mp_limb_t) 1 << nbits) - 1);
+    }
+}
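+
+/* Example (illustrative): with GMP_NUMB_BITS = 64 and p[0] = 0xb4
+   (10110100 binary), getbits (p, 8, 3) extracts bits 7..5 and returns
+   101 binary = 5, while getbit (p, 3) reads bit 2 and returns 1; the
+   bit index bi counts from 1.  */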
+
+static inline int
+win_size (mp_bitcnt_t eb)
+{
+  int k;
+  static mp_bitcnt_t x[] = {0,7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0};
+  for (k = 1; eb > x[k]; k++)
+    ;
+  return k;
+}
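+
+/* Example: a 100-bit exponent satisfies 81 < 100 <= 241, so win_size
+   returns 4 and the precomputed table below then holds 2^(4-1) = 8 odd
+   powers b^1, b^3, ..., b^15.  */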
+
+/* Convert U to REDC form, U_r = B^n * U mod M */
+static void
+redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n)
+{
+  mp_ptr tp, qp;
+  TMP_DECL;
+  TMP_MARK;
+
+  TMP_ALLOC_LIMBS_2 (tp, un + n, qp, un + 1);
+
+  MPN_ZERO (tp, n);
+  MPN_COPY (tp + n, up, un);
+  mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n);
+  TMP_FREE;
+}
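+
+/* Example (illustrative, one 64-bit limb): for m = 7 and u = 3, redcify
+   computes (2^64 * 3) mod 7 = 6, since 2^64 mod 7 = 2.  All Montgomery
+   products below keep operands in this form, and the final mpn_redc_*
+   call divides the factor B^n back out.  */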
+
+/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0]
+   Requires that mp[n-1..0] is odd.
+   Requires that ep[en-1..0] is > 1.
+   Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs.  */
+void
+mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
+	  mp_srcptr ep, mp_size_t en,
+	  mp_srcptr mp, mp_size_t n, mp_ptr tp)
+{
+  mp_limb_t ip[2], *mip;
+  int cnt;
+  mp_bitcnt_t ebi;
+  int windowsize, this_windowsize;
+  mp_limb_t expbits;
+  mp_ptr pp, this_pp;
+  long i;
+  TMP_DECL;
+
+  ASSERT (en > 1 || (en == 1 && ep[0] > 1));
+  ASSERT (n >= 1 && ((mp[0] & 1) != 0));
+
+  TMP_MARK;
+
+  MPN_SIZEINBASE_2EXP(ebi, ep, en, 1);
+
+#if 0
+  if (bn < n)
+    {
+      /* Do the first few exponent bits without mod reductions,
+	 until the result is greater than the mod argument.  */
+      for (;;)
+	{
+	  mpn_sqr (tp, this_pp, tn);
+	  tn = tn * 2 - 1,  tn += tp[tn] != 0;
+	  if (getbit (ep, ebi) != 0)
+	    mpn_mul (..., tp, tn, bp, bn);
+	  ebi--;
+	}
+    }
+#endif
+
+  windowsize = win_size (ebi);
+
+#if WANT_REDC_2
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+      mip = ip;
+      binvert_limb (mip[0], mp[0]);
+      mip[0] = -mip[0];
+    }
+  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+    {
+      mip = ip;
+      mpn_binvert (mip, mp, 2, tp);
+      mip[0] = -mip[0]; mip[1] = ~mip[1];
+    }
+#else
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
+    {
+      mip = ip;
+      binvert_limb (mip[0], mp[0]);
+      mip[0] = -mip[0];
+    }
+#endif
+  else
+    {
+      mip = TMP_ALLOC_LIMBS (n);
+      mpn_binvert (mip, mp, n, tp);
+    }
+
+  pp = TMP_ALLOC_LIMBS (n << (windowsize - 1));
+
+  this_pp = pp;
+  redcify (this_pp, bp, bn, mp, n);
+
+  /* Store b^2 at rp.  */
+  mpn_sqr (tp, this_pp, n);
+#if 0
+  if (n == 1) {
+    MPN_REDC_0 (rp, tp, mp, mip[0]);
+  } else
+#endif
+#if WANT_REDC_2
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
+  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+    MPN_REDC_2 (rp, tp, mp, n, mip);
+#else
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
+#endif
+  else
+    mpn_redc_n (rp, tp, mp, n, mip);
+
+  /* Precompute odd powers of b and put them in the temporary area at pp.  */
+  for (i = (1 << (windowsize - 1)) - 1; i > 0; i--)
+#if 1
+    if (n == 1) {
+      umul_ppmm((tp)[1], *(tp), *(this_pp), *(rp));
+      ++this_pp ;
+      MPN_REDC_0 (this_pp, tp, mp, mip[0]);
+    } else
+#endif
+    {
+      mpn_mul_n (tp, this_pp, rp, n);
+      this_pp += n;
+#if WANT_REDC_2
+      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
+      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+	MPN_REDC_2 (this_pp, tp, mp, n, mip);
+#else
+      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
+	MPN_REDC_1 (this_pp, tp, mp, n, mip[0]);
+#endif
+      else
+	mpn_redc_n (this_pp, tp, mp, n, mip);
+    }
+
+  expbits = getbits (ep, ebi, windowsize);
+  if (ebi < windowsize)
+    ebi = 0;
+  else
+    ebi -= windowsize;
+
+  count_trailing_zeros (cnt, expbits);
+  ebi += cnt;
+  expbits >>= cnt;
+
+  MPN_COPY (rp, pp + n * (expbits >> 1), n);
+
+#define INNERLOOP							\
+  while (ebi != 0)							\
+    {									\
+      while (getbit (ep, ebi) == 0)					\
+	{								\
+	  MPN_SQR (tp, rp, n);						\
+	  MPN_REDUCE (rp, tp, mp, n, mip);				\
+	  if (--ebi == 0)						\
+	    goto done;							\
+	}								\
+									\
+      /* The next bit of the exponent is 1.  Now extract the largest	\
+	 block of bits <= windowsize such that the least significant	\
+	 bit is 1.  */							\
+									\
+      expbits = getbits (ep, ebi, windowsize);				\
+      this_windowsize = windowsize;					\
+      if (ebi < windowsize)						\
+	{								\
+	  this_windowsize -= windowsize - ebi;				\
+	  ebi = 0;							\
+	}								\
+      else								\
+        ebi -= windowsize;						\
+									\
+      count_trailing_zeros (cnt, expbits);				\
+      this_windowsize -= cnt;						\
+      ebi += cnt;							\
+      expbits >>= cnt;							\
+									\
+      do								\
+	{								\
+	  MPN_SQR (tp, rp, n);						\
+	  MPN_REDUCE (rp, tp, mp, n, mip);				\
+	}								\
+      while (--this_windowsize != 0);					\
+									\
+      MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n);			\
+      MPN_REDUCE (rp, tp, mp, n, mip);					\
+    }
+
+
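+  /* INNERLOOP is expanded once per (multiply, square, reduce) strategy in
+     the branches below, so the threshold tests select a specialised loop
+     once instead of being re-evaluated on every iteration.  */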
+  if (n == 1)
+    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		umul_ppmm((r)[1], *(r), *(a), *(b))
+#define MPN_SQR(r,a,n)			umul_ppmm((r)[1], *(r), *(a), *(a))
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_0(rp, tp, mp, mip[0])
+      INNERLOOP;
+    }
+  else
+#if WANT_REDC_2
+  if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD)
+    {
+      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+	{
+	  if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD
+	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	  else
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	}
+      else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+	{
+	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
+	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
+	      INNERLOOP;
+	    }
+	  else
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
+	      INNERLOOP;
+	    }
+	}
+      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
+	  INNERLOOP;
+	}
+      else
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
+	  INNERLOOP;
+	}
+    }
+  else
+    {
+      if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+	{
+	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
+	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	  else
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	}
+      else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	  INNERLOOP;
+	}
+      else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2 (rp, tp, mp, n, mip)
+	  INNERLOOP;
+	}
+      else
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
+	  INNERLOOP;
+	}
+    }
+
+#else  /* WANT_REDC_2 */
+
+  if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD)
+    {
+      if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
+	{
+	  if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD
+	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	  else
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	}
+      else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+	{
+	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
+	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
+	      INNERLOOP;
+	    }
+	  else
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
+	      INNERLOOP;
+	    }
+	}
+      else
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
+	  INNERLOOP;
+	}
+    }
+  else
+    {
+      if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+	{
+	  if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD
+	      || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_mul_basecase (r,a,n,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	  else
+	    {
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_basecase (r,a,n,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr_basecase (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	      INNERLOOP;
+	    }
+	}
+      else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1 (rp, tp, mp, n, mip[0])
+	  INNERLOOP;
+	}
+      else
+	{
+#undef MPN_MUL_N
+#undef MPN_SQR
+#undef MPN_REDUCE
+#define MPN_MUL_N(r,a,b,n)		mpn_mul_n (r,a,b,n)
+#define MPN_SQR(r,a,n)			mpn_sqr (r,a,n)
+#define MPN_REDUCE(rp,tp,mp,n,mip)	mpn_redc_n (rp, tp, mp, n, mip)
+	  INNERLOOP;
+	}
+    }
+#endif  /* WANT_REDC_2 */
+
+ done:
+
+  MPN_COPY (tp, rp, n);
+  MPN_ZERO (tp + n, n);
+
+#if WANT_REDC_2
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
+  else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD))
+    MPN_REDC_2 (rp, tp, mp, n, mip);
+#else
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD))
+    MPN_REDC_1 (rp, tp, mp, n, mip[0]);
+#endif
+  else
+    mpn_redc_n (rp, tp, mp, n, mip);
+
+  if (mpn_cmp (rp, mp, n) >= 0)
+    mpn_sub_n (rp, rp, mp, n);
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/pre_divrem_1.c b/third_party/gmp/mpn/generic/pre_divrem_1.c
new file mode 100644
index 0000000..3b29d77
--- /dev/null
+++ b/third_party/gmp/mpn/generic/pre_divrem_1.c
@@ -0,0 +1,145 @@
+/* mpn_preinv_divrem_1 -- mpn by limb division with pre-inverted divisor.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Don't bloat a shared library with unused code. */
+#if USE_PREINV_DIVREM_1
+
+/* Same test here for skipping one divide step as in mpn_divrem_1.
+
+   The main reason for a separate shift==0 case is that not all CPUs give
+   zero for "n0 >> GMP_LIMB_BITS", which would arise if the general case
+   code were used with shift==0.  shift==0 is also reasonably common in
+   mp_bases
+   big_base, for instance base==10 on a 64-bit limb.
+
+   Under shift!=0 it would be possible to call mpn_lshift to adjust the
+   dividend all in one go (into the quotient space say), rather than
+   limb-by-limb in the loop.  This might help if mpn_lshift is a lot faster
+   than what the compiler can generate for EXTRACT.  But this is left to CPU
+   specific implementations to consider, especially since EXTRACT isn't on
+   the dependent chain.
+
+   If size==0 then the result is simply xsize limbs of zeros, but nothing
+   special is done for that, since it wouldn't be a usual call, and
+   certainly never arises from mpn_get_str which is our main caller.  */
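+
+/* A hedged sketch of the quotient step hidden in udiv_qrnnd_preinv, after
+   the Granlund-Moller method: di = floor((B^2 - 1)/d) - B for a normalized
+   divisor d (top bit set), B = 2^64.  Illustrative only; assumes
+   <stdint.h> and unsigned __int128, and requires nh < d.  */
+#if 0
+static uint64_t
+div_2by1_preinv (uint64_t *q, uint64_t nh, uint64_t nl,
+		 uint64_t d, uint64_t di)
+{
+  unsigned __int128 p = (unsigned __int128) nh * di;
+  uint64_t q0 = (uint64_t) p + nl;		/* low limb of candidate */
+  uint64_t q1 = (uint64_t) (p >> 64) + nh + 1 + (q0 < nl);
+  uint64_t r = nl - q1 * d;			/* mod 2^64 */
+  if (r > q0)					/* at most one step down */
+    {
+      q1--;
+      r += d;
+    }
+  if (r >= d)					/* unlikely step up */
+    {
+      q1++;
+      r -= d;
+    }
+  *q = q1;
+  return r;
+}
+#endif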
+
+mp_limb_t
+mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t xsize,
+		     mp_srcptr ap, mp_size_t size, mp_limb_t d_unnorm,
+		     mp_limb_t dinv, int shift)
+{
+  mp_limb_t  ahigh, qhigh, r;
+  mp_size_t  i;
+  mp_limb_t  n1, n0;
+  mp_limb_t  d;
+
+  ASSERT (xsize >= 0);
+  ASSERT (size >= 1);
+  ASSERT (d_unnorm != 0);
+#if WANT_ASSERT
+  {
+    int        want_shift;
+    mp_limb_t  want_dinv;
+    count_leading_zeros (want_shift, d_unnorm);
+    ASSERT (shift == want_shift);
+    invert_limb (want_dinv, d_unnorm << shift);
+    ASSERT (dinv == want_dinv);
+  }
+#endif
+  /* FIXME: What's the correct overlap rule when xsize!=0? */
+  ASSERT (MPN_SAME_OR_SEPARATE_P (qp+xsize, ap, size));
+
+  ahigh = ap[size-1];
+  d = d_unnorm << shift;
+  qp += (size + xsize - 1);   /* dest high limb */
+
+  if (shift == 0)
+    {
+      /* High quotient limb is 0 or 1, and we skip a divide step. */
+      r = ahigh;
+      qhigh = (r >= d);
+      r = (qhigh ? r-d : r);
+      *qp-- = qhigh;
+      size--;
+
+      for (i = size-1; i >= 0; i--)
+	{
+	  n0 = ap[i];
+	  udiv_qrnnd_preinv (*qp, r, r, n0, d, dinv);
+	  qp--;
+	}
+    }
+  else
+    {
+      r = 0;
+      if (ahigh < d_unnorm)
+	{
+	  r = ahigh << shift;
+	  *qp-- = 0;
+	  size--;
+	  if (size == 0)
+	    goto done_integer;
+	}
+
+      n1 = ap[size-1];
+      r |= n1 >> (GMP_LIMB_BITS - shift);
+
+      for (i = size-2; i >= 0; i--)
+	{
+	  ASSERT (r < d);
+	  n0 = ap[i];
+	  udiv_qrnnd_preinv (*qp, r, r,
+			     ((n1 << shift) | (n0 >> (GMP_LIMB_BITS - shift))),
+			     d, dinv);
+	  qp--;
+	  n1 = n0;
+	}
+      udiv_qrnnd_preinv (*qp, r, r, n1 << shift, d, dinv);
+      qp--;
+    }
+
+ done_integer:
+  for (i = 0; i < xsize; i++)
+    {
+      udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv);
+      qp--;
+    }
+
+  return r >> shift;
+}
+
+#endif /* USE_PREINV_DIVREM_1 */
diff --git a/third_party/gmp/mpn/generic/pre_mod_1.c b/third_party/gmp/mpn/generic/pre_mod_1.c
new file mode 100644
index 0000000..78ae308
--- /dev/null
+++ b/third_party/gmp/mpn/generic/pre_mod_1.c
@@ -0,0 +1,61 @@
+/* mpn_preinv_mod_1 (up, un, d, dinv) -- Divide (UP,,UN) by the normalized D.
+   DINV should be 2^(2*GMP_LIMB_BITS) / D - 2^GMP_LIMB_BITS.
+   Return the single-limb remainder.
+
+Copyright 1991, 1993, 1994, 2000-2002, 2004, 2005 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* This function used to be documented, but is now considered obsolete.  It
+   continues to exist for binary compatibility, even when not required
+   internally.  */
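+
+/* Example: with GMP_LIMB_BITS = 64 and the smallest normalized divisor
+   d = 2^63, invert_limb gives dinv = floor((2^128 - 1)/d) - 2^64
+   = 2^64 - 1 (the header formula, evaluated with truncating division of
+   2^128 - 1 rather than 2^128).  */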
+
+mp_limb_t
+mpn_preinv_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d, mp_limb_t dinv)
+{
+  mp_size_t i;
+  mp_limb_t n0, r;
+
+  ASSERT (un >= 1);
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+
+  r = up[un - 1];
+  if (r >= d)
+    r -= d;
+
+  for (i = un - 2; i >= 0; i--)
+    {
+      n0 = up[i];
+      udiv_rnnd_preinv (r, r, n0, d, dinv);
+    }
+  return r;
+}
diff --git a/third_party/gmp/mpn/generic/random.c b/third_party/gmp/mpn/generic/random.c
new file mode 100644
index 0000000..485f9eb
--- /dev/null
+++ b/third_party/gmp/mpn/generic/random.c
@@ -0,0 +1,50 @@
+/* mpn_random -- Generate random numbers.
+
+Copyright 2001, 2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_random (mp_ptr ptr, mp_size_t size)
+{
+  gmp_randstate_ptr  rands;
+
+  /* FIXME: Is size==0 supposed to be allowed? */
+  ASSERT (size >= 0);
+
+  if (size == 0)
+    return;
+
+  rands = RANDS;
+  _gmp_rand (ptr, rands, size * GMP_NUMB_BITS);
+
+  /* Make sure the most significant limb is non-zero.  */
+  while (ptr[size-1] == 0)
+    _gmp_rand (&ptr[size-1], rands, GMP_NUMB_BITS);
+}
diff --git a/third_party/gmp/mpn/generic/random2.c b/third_party/gmp/mpn/generic/random2.c
new file mode 100644
index 0000000..1eede67
--- /dev/null
+++ b/third_party/gmp/mpn/generic/random2.c
@@ -0,0 +1,105 @@
+/* mpn_random2 -- Generate random numbers with relatively long strings
+   of ones and zeroes.  Suitable for border testing.
+
+Copyright 1992-1994, 1996, 2000-2002, 2004, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+static void gmp_rrandomb (mp_ptr, gmp_randstate_t, mp_bitcnt_t);
+
+/* Ask _gmp_rand for 32 bits per call unless that's more than a limb can hold.
+   Thus, we get the same random number sequence in the common cases.
+   FIXME: We should always generate the same random number sequence!  */
+#if GMP_NUMB_BITS < 32
+#define BITS_PER_RANDCALL GMP_NUMB_BITS
+#else
+#define BITS_PER_RANDCALL 32
+#endif
+
+void
+mpn_random2 (mp_ptr rp, mp_size_t n)
+{
+  gmp_randstate_ptr rstate = RANDS;
+  int bit_pos;			/* bit number of the least significant bit
+				   where the next bit field is to be inserted */
+  mp_limb_t ran, ranm;		/* buffer for random bits */
+
+  /* FIXME: Is n==0 supposed to be allowed? */
+  ASSERT (n >= 0);
+
+  _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
+  ran = ranm;
+
+  /* Start off at a random bit position in the most significant limb.  */
+  bit_pos = ran % GMP_NUMB_BITS;
+
+  gmp_rrandomb (rp, rstate, n * GMP_NUMB_BITS - bit_pos);
+}
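+
+/* Illustration (hypothetical 16-bit value): gmp_rrandomb starts from all
+   ones and walks down the bit positions in random-length chunks, producing
+   patterns such as 1111100000111111 -- alternating runs of ones and zeroes
+   whose lengths are 1 + (random % cap_chunksize).  */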
+
+static void
+gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, mp_bitcnt_t nbits)
+{
+  mp_bitcnt_t bi;
+  mp_limb_t ranm;		/* buffer for random bits */
+  unsigned cap_chunksize, chunksize;
+  mp_size_t i;
+
+  /* Set entire result to 111..1  */
+  i = BITS_TO_LIMBS (nbits) - 1;
+  rp[i] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - (nbits % GMP_NUMB_BITS)) % GMP_NUMB_BITS;
+  for (i = i - 1; i >= 0; i--)
+    rp[i] = GMP_NUMB_MAX;
+
+  _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
+  cap_chunksize = nbits / (ranm % 4 + 1);
+  cap_chunksize += cap_chunksize == 0; /* make it at least 1 */
+
+  bi = nbits;
+
+  for (;;)
+    {
+      _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
+      chunksize = 1 + ranm % cap_chunksize;
+      bi = (bi < chunksize) ? 0 : bi - chunksize;
+
+      if (bi == 0)
+	break;			/* low chunk is ...1 */
+
+      rp[bi / GMP_NUMB_BITS] ^= CNST_LIMB (1) << bi % GMP_NUMB_BITS;
+
+      _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL);
+      chunksize = 1 + ranm % cap_chunksize;
+      bi = (bi < chunksize) ? 0 : bi - chunksize;
+
+      mpn_incr_u (rp + bi / GMP_NUMB_BITS, CNST_LIMB (1) << bi % GMP_NUMB_BITS);
+
+      if (bi == 0)
+	break;			/* low chunk is ...0 */
+    }
+}
diff --git a/third_party/gmp/mpn/generic/redc_1.c b/third_party/gmp/mpn/generic/redc_1.c
new file mode 100644
index 0000000..eab128f
--- /dev/null
+++ b/third_party/gmp/mpn/generic/redc_1.c
@@ -0,0 +1,56 @@
+/* mpn_redc_1.  Set rp[] <- up[]/R^n mod mp[].  Clobber up[].
+   mp[] is n limbs; up[] is 2n limbs.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright (C) 2000-2002, 2004, 2008, 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
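+/* Hedged sketch of one pass of the loop below on a single 64-bit limb
+   (illustrative helper, not the library interface): with
+   invm = -m^{-1} mod B, choosing q = u0 * invm makes u0 + q*m divisible
+   by B = 2^64, so every iteration shifts one zeroed limb out of U.
+   Assumes unsigned __int128.  */
+#if 0
+static uint64_t
+redc_1_step (uint64_t u1, uint64_t u0, uint64_t m, uint64_t invm)
+{
+  uint64_t q = u0 * invm;			/* mod 2^64 */
+  unsigned __int128 t = (unsigned __int128) q * m + u0;	/* low limb is 0 */
+  return u1 + (uint64_t) (t >> 64);		/* carry out left to caller */
+}
+#endif
+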
+mp_limb_t
+mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm)
+{
+  mp_size_t j;
+  mp_limb_t cy;
+
+  ASSERT (n > 0);
+  ASSERT_MPN (up, 2*n);
+
+  for (j = n - 1; j >= 0; j--)
+    {
+      cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK);
+      ASSERT (up[0] == 0);
+      up[0] = cy;
+      up++;
+    }
+
+  cy = mpn_add_n (rp, up, up - n, n);
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/redc_2.c b/third_party/gmp/mpn/generic/redc_2.c
new file mode 100644
index 0000000..8d15589
--- /dev/null
+++ b/third_party/gmp/mpn/generic/redc_2.c
@@ -0,0 +1,110 @@
+/* mpn_redc_2.  Set rp[] <- up[]/R^n mod mp[].  Clobber up[].
+   mp[] is n limbs; up[] is 2n limbs.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright (C) 2000-2002, 2004, 2008, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if GMP_NAIL_BITS != 0
+you lose
+#endif
+
+/* For testing purposes, define our own mpn_addmul_2 if there is none already
+   available.  */
+#ifndef HAVE_NATIVE_mpn_addmul_2
+#undef mpn_addmul_2
+static mp_limb_t
+mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+{
+  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
+  return mpn_addmul_1 (rp + 1, up, n, vp[1]);
+}
+#endif
+
+#if defined (__GNUC__) && ! defined (NO_ASM) \
+  && defined (__ia64) && W_TYPE_SIZE == 64
+#define umul2low(ph, pl, uh, ul, vh, vl) \
+  do {									\
+    mp_limb_t _ph, _pl;							\
+    __asm__ ("xma.hu %0 = %3, %5, f0\n\t"				\
+	     "xma.l %1 = %3, %5, f0\n\t"				\
+	     ";;\n\t"							\
+	     "xma.l %0 = %3, %4, %0\n\t"				\
+	     ";;\n\t"							\
+	     "xma.l %0 = %2, %5, %0"					\
+	     : "=&f" (ph), "=&f" (pl)					\
+	     : "f" (uh), "f" (ul), "f" (vh), "f" (vl));			\
+  } while (0)
+#endif
+
+#ifndef umul2low
+#define umul2low(ph, pl, uh, ul, vh, vl) \
+  do {									\
+    mp_limb_t _ph, _pl;							\
+    umul_ppmm (_ph, _pl, ul, vl);					\
+    (ph) = _ph + (ul) * (vh) + (uh) * (vl);				\
+    (pl) = _pl;								\
+  } while (0)
+#endif
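+
+/* Either variant computes the product of the two-limb numbers uh*B + ul
+   and vh*B + vl reduced mod B^2: pl is the low limb of ul*vl, and ph
+   collects high(ul*vl) + ul*vh + uh*vl mod B; the uh*vh term cannot
+   affect the low two limbs and is dropped.  */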
+
+mp_limb_t
+mpn_redc_2 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr mip)
+{
+  mp_limb_t q[2];
+  mp_size_t j;
+  mp_limb_t upn;
+  mp_limb_t cy;
+
+  ASSERT (n > 0);
+  ASSERT_MPN (up, 2*n);
+
+  if ((n & 1) != 0)
+    {
+      up[0] = mpn_addmul_1 (up, mp, n, (up[0] * mip[0]) & GMP_NUMB_MASK);
+      up++;
+    }
+
+  for (j = n - 2; j >= 0; j -= 2)
+    {
+      umul2low (q[1], q[0], mip[1], mip[0], up[1], up[0]);
+      upn = up[n];		/* mpn_addmul_2 overwrites this */
+      up[1] = mpn_addmul_2 (up, mp, n, q);
+      up[0] = up[n];
+      up[n] = upn;
+      up += 2;
+    }
+
+  cy = mpn_add_n (rp, up, up - n, n);
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/redc_n.c b/third_party/gmp/mpn/generic/redc_n.c
new file mode 100644
index 0000000..0c94b7c
--- /dev/null
+++ b/third_party/gmp/mpn/generic/redc_n.c
@@ -0,0 +1,80 @@
+/* mpn_redc_n.  Set rp[] <- up[]/R^n mod mp[].  Clobber up[].
+   mp[] is n limbs; up[] is 2n limbs, the inverse ip[] is n limbs.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  TODO
+
+  * We assume mpn_mulmod_bnm1 is always faster than plain mpn_mul_n (or a
+    future mpn_mulhi) for the range we will be called.  Follow up that
+    assumption.
+
+  * Decrease scratch usage.
+
+  * Consider removing the residue canonicalisation.
+*/
+
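+/* Identity used below (generic Montgomery reduction, sketched): with
+   q = U * m^{-1} mod B^n, the difference U - q*m is divisible by B^n, so
+   (U - q*m) / B^n, plus m whenever the subtraction borrows, equals
+   U * B^{-n} mod m.  The product q*m is formed by mpn_mulmod_bnm1, i.e.
+   modulo B^rn - 1, and the wrap-around is undone before the final
+   subtraction.  */
+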
+void
+mpn_redc_n (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr ip)
+{
+  mp_ptr xp, yp, scratch;
+  mp_limb_t cy;
+  mp_size_t rn;
+  TMP_DECL;
+  TMP_MARK;
+
+  ASSERT (n > 8);
+
+  rn = mpn_mulmod_bnm1_next_size (n);
+
+  scratch = TMP_ALLOC_LIMBS (n + rn + mpn_mulmod_bnm1_itch (rn, n, n));
+
+  xp = scratch;
+  mpn_mullo_n (xp, up, ip, n);
+
+  yp = scratch + n;
+  mpn_mulmod_bnm1 (yp, rn, xp, n, mp, n, scratch + n + rn);
+
+  ASSERT_ALWAYS (2 * n > rn);				/* could handle this */
+
+  cy = mpn_sub_n (yp + rn, yp, up, 2*n - rn);		/* undo wrap around */
+  MPN_DECR_U (yp + 2*n - rn, rn, cy);
+
+  cy = mpn_sub_n (rp, up + n, yp + n, n);
+  if (cy != 0)
+    mpn_add_n (rp, rp, mp, n);
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/remove.c b/third_party/gmp/mpn/generic/remove.c
new file mode 100644
index 0000000..cbb0742
--- /dev/null
+++ b/third_party/gmp/mpn/generic/remove.c
@@ -0,0 +1,182 @@
+/* mpn_remove -- divide out all multiples of odd mpn number from another mpn
+   number.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2009, 2012-2014, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if GMP_LIMB_BITS > 50
+#define LOG 50
+#else
+#define LOG GMP_LIMB_BITS
+#endif
+
+
+/* Input: U = {up,un}, V = {vp,vn} must be odd, cap
+   Output: W = {wp,*wn}; allocation need is exactly *wn
+
+   Set W = U / V^k, where k is the largest integer <= cap such that the
+   division yields an integer.
+
+   FIXME: We currently allow any operand overlap.  This is quite non-mpn-ish
+   and might be changed, since it costs significant temporary space.
+   * If we require W to have space for un + 1 limbs, we could save qp or qp2
+     (but we will still need to copy things into wp 50% of the time).
+   * If we allow ourselves to clobber U, we could save the other of qp and qp2,
+     and the initial COPY (but also here we would need un + 1 limbs).
+*/
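+
+/* Example: U = 45, V = 3, cap = 10 yields W = 5 and returns k = 2, since
+   3^2 divides 45 but 3^3 does not; with cap = 1 only one factor comes out
+   and W = 15.  */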
+
+/* FIXME: We need to wrap mpn_bdiv_qr due to the itch interface.  This need
+   indicates a flaw in the current itch mechanism: Which operands not greater
+   than un,un will incur the worst itch?  We need a parallel foo_maxitch set
+   of functions.  */
+static void
+mpn_bdiv_qr_wrap (mp_ptr qp, mp_ptr rp,
+		  mp_srcptr np, mp_size_t nn,
+		  mp_srcptr dp, mp_size_t dn)
+{
+  mp_ptr scratch_out;
+  TMP_DECL;
+
+  TMP_MARK;
+  scratch_out = TMP_ALLOC_LIMBS (mpn_bdiv_qr_itch (nn, dn));
+  mpn_bdiv_qr (qp, rp, np, nn, dp, dn, scratch_out);
+
+  TMP_FREE;
+}
+
+mp_bitcnt_t
+mpn_remove (mp_ptr wp, mp_size_t *wn,
+	    mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn,
+	    mp_bitcnt_t cap)
+{
+  mp_srcptr pwpsp[LOG];
+  mp_size_t pwpsn[LOG];
+  mp_size_t npowers;
+  mp_ptr tp, qp, np, qp2;
+  mp_srcptr pp;
+  mp_size_t pn, nn, qn, i;
+  mp_bitcnt_t pwr;
+  TMP_DECL;
+
+  ASSERT (un > 0);
+  ASSERT (vn > 0);
+  ASSERT (vp[0] % 2 != 0);	/* 2-adic division wants odd numbers */
+  ASSERT (vn > 1 || vp[0] > 1);	/* else we would loop indefinitely */
+
+  TMP_MARK;
+
+  TMP_ALLOC_LIMBS_3 (qp, un + 1,	/* quotient, alternating */
+		     qp2, un + 1,	/* quotient, alternating */
+		     tp, (un + 1 + vn) / 2); /* remainder */
+  pp = vp;
+  pn = vn;
+
+  MPN_COPY (qp, up, un);
+  qn = un;
+
+  npowers = 0;
+  while (qn >= pn)
+    {
+      qp[qn] = 0;
+      mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn);
+      if (!mpn_zero_p (tp, pn))
+	{
+	  if (mpn_cmp (tp, pp, pn) != 0)
+	    break;		/* could not divide by V^npowers */
+	}
+
+      MP_PTR_SWAP (qp, qp2);
+      qn = qn - pn;
+      mpn_neg (qp, qp, qn+1);
+
+      qn += qp[qn] != 0;
+
+      pwpsp[npowers] = pp;
+      pwpsn[npowers] = pn;
+      ++npowers;
+
+      if (((mp_bitcnt_t) 2 << npowers) - 1 > cap)
+	break;
+
+      nn = 2 * pn - 1;		/* next power will be at least this large */
+      if (nn > qn)
+	break;			/* next power would be overlarge */
+
+      if (npowers == 1)		/* Alloc once, but only if it's needed */
+	np = TMP_ALLOC_LIMBS (qn + LOG);	/* powers of V */
+      else
+	np += pn;
+
+      mpn_sqr (np, pp, pn);
+      pn = nn + (np[nn] != 0);
+      pp = np;
+    }
+
+  pwr = ((mp_bitcnt_t) 1 << npowers) - 1;
+
+  for (i = npowers; --i >= 0;)
+    {
+      pn = pwpsn[i];
+      if (qn < pn)
+	continue;
+
+      if (pwr + ((mp_bitcnt_t) 1 << i) > cap)
+	continue;		/* V^i would bring us past cap */
+
+      qp[qn] = 0;
+      mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pwpsp[i], pn);
+      if (!mpn_zero_p (tp, pn))
+	{
+	  if (mpn_cmp (tp, pwpsp[i], pn) != 0)
+	    continue;		/* could not divide by V^i */
+	}
+
+      MP_PTR_SWAP (qp, qp2);
+      qn = qn - pn;
+      mpn_neg (qp, qp, qn+1);
+
+      qn += qp[qn] != 0;
+
+      pwr += (mp_bitcnt_t) 1 << i;
+    }
+
+  MPN_COPY (wp, qp, qn);
+  *wn = qn;
+
+  TMP_FREE;
+
+  return pwr;
+}
diff --git a/third_party/gmp/mpn/generic/rootrem.c b/third_party/gmp/mpn/generic/rootrem.c
new file mode 100644
index 0000000..a79099e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/rootrem.c
@@ -0,0 +1,515 @@
+/* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and
+   store the truncated integer part at rootp and the remainder at remp.
+
+   Contributed by Paul Zimmermann (algorithm) and
+   Paul Zimmermann and Torbjorn Granlund (implementation).
+   Marco Bodrato wrote logbased_root to seed the loop.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES.  IT'S
+   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT'S ALMOST
+   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2002, 2005, 2009-2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* FIXME:
+     This implementation is not optimal when remp == NULL, since the complexity
+     is M(n), whereas it should be M(n/k) on average.
+*/
+
+#include <stdio.h>		/* for NULL */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
+				       mp_limb_t, int);
+
+#define MPN_RSHIFT(rp,up,un,cnt) \
+  do {									\
+    if ((cnt) != 0)							\
+      mpn_rshift (rp, up, un, cnt);					\
+    else								\
+      {									\
+	MPN_COPY_INCR (rp, up, un);					\
+      }									\
+  } while (0)
+
+#define MPN_LSHIFT(cy,rp,up,un,cnt) \
+  do {									\
+    if ((cnt) != 0)							\
+      cy = mpn_lshift (rp, up, un, cnt);				\
+    else								\
+      {									\
+	MPN_COPY_DECR (rp, up, un);					\
+	cy = 0;								\
+      }									\
+  } while (0)
+
+
+/* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero.
+   If remp <> NULL, put in {remp, un} the remainder.
+   Return the size (in limbs) of the remainder if remp <> NULL,
+	  or a non-zero value iff the remainder is non-zero when remp = NULL.
+   Assumes:
+   (a) up[un-1] is not zero
+   (b) rootp has at least space for ceil(un/k) limbs
+   (c) remp has at least space for un limbs (in case remp <> NULL)
+   (d) the operands do not overlap.
+
+   The auxiliary memory usage is 3*un+2 if remp = NULL,
+   and 2*un+2 if remp <> NULL.  FIXME: This is an incorrect comment.
+*/
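+/* Example: for {up,un} = 130 and k = 3 the root is 5 (5^3 = 125 <= 130),
+   rootp[0] is set to 5, the remainder 130 - 125 = 5 is stored at remp when
+   remp <> NULL, and the return value is 1, the size of that remainder.  */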
+mp_size_t
+mpn_rootrem (mp_ptr rootp, mp_ptr remp,
+	     mp_srcptr up, mp_size_t un, mp_limb_t k)
+{
+  ASSERT (un > 0);
+  ASSERT (up[un - 1] != 0);
+  ASSERT (k > 1);
+
+  if (UNLIKELY (k == 2))
+    return mpn_sqrtrem (rootp, remp, up, un);
+  /* (un-1)/k > 2 <=> un > 3k <=> (un + 2)/3 > k */
+  if (remp == NULL && (un + 2) / 3 > k)
+    /* Pad {up,un} with k zero limbs.  This will produce an approximate root
+       with one more limb, allowing us to compute the exact integral result. */
+    {
+      mp_ptr sp, wp;
+      mp_size_t rn, sn, wn;
+      TMP_DECL;
+      TMP_MARK;
+      wn = un + k;
+      sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
+      TMP_ALLOC_LIMBS_2 (wp, wn, /* will contain the padded input */
+			 sp, sn); /* approximate root of padded input */
+      MPN_COPY (wp + k, up, un);
+      MPN_FILL (wp, k, 0);
+      rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
+      /* The approximate root S = {sp,sn} is either the correct root of
+	 {sp,sn}, or 1 too large.  Thus unless the least significant limb of
+	 S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
+	 limb.  (In case sp[0]=1, we can deduce the root, but not decide
+	 whether it is exact or not.) */
+      MPN_COPY (rootp, sp + 1, sn - 1);
+      TMP_FREE;
+      return rn;
+    }
+  else
+    {
+      return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
+    }
+}
+
+#define LOGROOT_USED_BITS 8
+#define LOGROOT_NEEDS_TWO_CORRECTIONS 1
+#define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS)
+/* Puts in *rootp some bits of the kth root of the number
+   2^bitn * 1.op, where op represents the "fractional" bits.
+
+   The returned value is the number of bits of the root minus one;
+   i.e. an approximation of the root will be
+   (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1).
+
+   Currently, only LOGROOT_USED_BITS bits of op are used (the implicit
+   one is not counted).
+ */
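+/* Sketch of the method: if the input is 2^bitn * 1.f, then
+   (bitn << 8) + vlog[f] approximates 256*log2(input); dividing by k gives
+   256*log2(root), whose integer part becomes the returned bit count and
+   whose fractional part is mapped back through vexp to the leading bits
+   stored in *rootp.  */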
+static unsigned
+logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k)
+{
+  /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */
+  static const
+  unsigned char vlog[] = {1,   2,   4,   5,   7,   8,   9,  11,  12,  14,  15,  16,  18,  19,  21,  22,
+			 23,  25,  26,  27,  29,  30,  31,  33,  34,  35,  37,  38,  39,  40,  42,  43,
+			 44,  46,  47,  48,  49,  51,  52,  53,  54,  56,  57,  58,  59,  61,  62,  63,
+			 64,  65,  67,  68,  69,  70,  71,  73,  74,  75,  76,  77,  78,  80,  81,  82,
+			 83,  84,  85,  87,  88,  89,  90,  91,  92,  93,  94,  96,  97,  98,  99, 100,
+			101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
+			118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134,
+			135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+			150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164,
+			165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179,
+			180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193,
+			194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206,
+			207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219,
+			220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232,
+			232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244,
+			245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255};
+
+  /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */
+  static const
+  unsigned char vexp[] = {0,   1,   2,   2,   3,   4,   4,   5,   6,   7,   7,   8,   9,   9,  10,  11,
+			 12,  12,  13,  14,  14,  15,  16,  17,  17,  18,  19,  20,  20,  21,  22,  23,
+			 23,  24,  25,  26,  26,  27,  28,  29,  30,  30,  31,  32,  33,  33,  34,  35,
+			 36,  37,  37,  38,  39,  40,  41,  41,  42,  43,  44,  45,  45,  46,  47,  48,
+			 49,  50,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  59,  60,  61,  61,
+			 62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  73,  74,  75,  75,
+			 76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  86,  87,  88,  89,  90,
+			 91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
+			107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122,
+			123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
+			139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
+			157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174,
+			175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193,
+			194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213,
+			214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234,
+			235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255};
+  mp_bitcnt_t retval;
+
+  if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS))
+    {
+      /* In the unlikely case, we use two divisions and a modulo. */
+      retval = bitn / k;
+      bitn %= k;
+      bitn = (bitn << LOGROOT_USED_BITS |
+	      vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+    }
+  else
+    {
+      bitn = (bitn << LOGROOT_USED_BITS |
+	      vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
+      retval = bitn >> LOGROOT_USED_BITS;
+      bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1;
+    }
+  ASSERT(bitn < CNST_LIMB (1) << LOGROOT_USED_BITS);
+  *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS)
+    | vexp[bitn] >> ! LOGROOT_NEEDS_TWO_CORRECTIONS;
+  return retval;
+}
+
+/* if approx is non-zero, does not compute the final remainder */
+static mp_size_t
+mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
+		      mp_limb_t k, int approx)
+{
+  mp_ptr qp, rp, sp, wp, scratch;
+  mp_size_t qn, rn, sn, wn, nl, bn;
+  mp_limb_t save, save2, cy, uh;
+  mp_bitcnt_t unb; /* number of significant bits of {up,un} */
+  mp_bitcnt_t xnb; /* number of significant bits of the result */
+  mp_bitcnt_t b, kk;
+  mp_bitcnt_t sizes[GMP_NUMB_BITS + 1];
+  int ni;
+  int perf_pow;
+  unsigned ulz, snb, c, logk;
+  TMP_DECL;
+
+  /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */
+  uh = up[un - 1];
+  count_leading_zeros (ulz, uh);
+  ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */
+  unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz;
+  /* unb is the (truncated) logarithm of the input U in base 2 */
+
+  if (unb < k) /* root is 1 */
+    {
+      rootp[0] = 1;
+      if (remp == NULL)
+	un -= (*up == CNST_LIMB (1)); /* Non-zero iff {up,un} > 1 */
+      else
+	{
+	  mpn_sub_1 (remp, up, un, CNST_LIMB (1));
+	  un -= (remp [un - 1] == 0);	/* There should be at most one zero
+					   limb if we demand U to be normalized */
+	}
+      return un;
+    }
+  /* if (unb - k < k/2 + k/16) // root is 2 */
+
+  if (ulz == GMP_NUMB_BITS)
+    uh = up[un - 2];
+  else
+    uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz);
+  ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1);
+
+  xnb = logbased_root (rootp, uh, unb, k);
+  snb = LOGROOT_RETURNED_BITS - 1;
+  /* xnb+1 is the number of bits of the root R */
+  /* snb+1 is the number of bits of the current approximation S */
+
+  kk = k * xnb;		/* number of truncated bits in the input */
+
+  /* FIXME: Should we skip the next two loops when xnb <= snb? */
+  for (uh = (k - 1) / 2, logk = 3; (uh >>= 1) != 0; ++logk )
+    ;
+  /* logk = ceil(log(k)/log(2)) + 1 */
+
+  /* xnb is the number of remaining bits to determine in the kth root */
+  for (ni = 0; (sizes[ni] = xnb) > snb; ++ni)
+    {
+      /* invariant: here we want xnb+1 total bits for the kth root */
+
+      /* if c is the new value of xnb, this means that we'll go from a
+	 root of c+1 bits (say s') to a root of xnb+1 bits.
+	 It is proved in the book "Modern Computer Arithmetic" by Brent
+	 and Zimmermann, Chapter 1, that
+	 if s' >= k*beta, then at most one correction is necessary.
+	 Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that
+	 c >= ceil((xnb + log2(k))/2). */
+      if (xnb > logk)
+	xnb = (xnb + logk) / 2;
+      else
+	--xnb;	/* add just one bit at a time */
+    }
+
+  *rootp >>= snb - xnb;
+  kk -= xnb;
+
+  ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
+  /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
+     sizes[i] <= 2 * sizes[i+1].
+     Newton iteration will first compute sizes[ni-1] extra bits,
+     then sizes[ni-2], ..., then sizes[0] = b. */
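+  /* Example: for a root of 1000 bits (xnb = 999 initially) with k = 3,
+     logk = 3 and snb = 8, the chain is sizes[] = 999, 501, 252, 127, 65,
+     34, 18, 10, 6: each Newton step roughly doubles the known bits, minus
+     the logk guard bits.  */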
+
+  TMP_MARK;
+  /* qp and wp need enough space to store S'^k where S' is an approximate
+     root. Since S' can be as large as S+2, the worst case is when S=2 and
+     S'=4. But then since we know the number of bits of S in advance, S'
+     can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
+     So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
+     fits in un limbs, the number of extra limbs needed is bounded by
+     ceil(k*log2(3/2)/GMP_NUMB_BITS). */
+  /* THINK: with the use of logbased_root, maybe the constant is
+     258/256 instead of 3/2?  log2(258/256) < 1/89 < 1/64 */
+#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
+  TMP_ALLOC_LIMBS_3 (scratch, un + 1, /* used by mpn_div_q */
+		     qp, un + EXTRA,  /* will contain quotient and remainder
+					 of R/(k*S^(k-1)), and S^k */
+		     wp, un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
+					 and temporary for mpn_pow_1 */
+
+  if (remp == NULL)
+    rp = scratch;	/* will contain the remainder */
+  else
+    rp = remp;
+  sp = rootp;
+
+  sn = 1;		/* Initial approximation has one limb */
+
+  for (b = xnb; ni != 0; --ni)
+    {
+      /* 1: loop invariant:
+	 {sp, sn} is the current approximation of the root, which has
+		  exactly 1 + sizes[ni] bits.
+	 {rp, rn} is the current remainder
+	 {wp, wn} = {sp, sn}^(k-1)
+	 kk = number of truncated bits of the input
+      */
+
+      /* Since each iteration treats b bits from the root and thus k*b bits
+	 from the input, and we already considered b bits from the input,
+	 we now have to take another (k-1)*b bits from the input. */
+      kk -= (k - 1) * b; /* remaining input bits */
+      /* {rp, rn} = floor({up, un} / 2^kk) */
+      rn = un - kk / GMP_NUMB_BITS;
+      MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
+      rn -= rp[rn - 1] == 0;
+
+      /* 9: current buffers: {sp,sn}, {rp,rn} */
+
+      for (c = 0;; c++)
+	{
+	  /* Compute S^k in {qp,qn}. */
+	  /* W <- S^(k-1) for the next iteration,
+	     and S^k = W * S. */
+	  wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
+	  mpn_mul (qp, wp, wn, sp, sn);
+	  qn = wn + sn;
+	  qn -= qp[qn - 1] == 0;
+
+	  perf_pow = 1;
+	  /* if S^k > floor(U/2^kk), the root approximation was too large */
+	  if (qn > rn || (qn == rn && (perf_pow=mpn_cmp (qp, rp, rn)) > 0))
+	    MPN_DECR_U (sp, sn, 1);
+	  else
+	    break;
+	}
+
+      /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */
+
+      /* sometimes two corrections are needed with logbased_root */
+      ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS);
+      ASSERT_ALWAYS (rn >= qn);
+
+      b = sizes[ni - 1] - sizes[ni]; /* number of bits to compute in the
+				      next iteration */
+      bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[], after shift */
+
+      kk = kk - b;
+      /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
+      nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
+      /* nl  = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
+		 - floor(kk / GMP_NUMB_BITS)
+	     <= 1 + (kk + b - 1) / GMP_NUMB_BITS
+		  - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
+	     = 2 + (b - 2) / GMP_NUMB_BITS
+	 thus since nl is an integer:
+	 nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */
+
+      /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* R = R - Q = floor(U/2^kk) - S^k */
+      if (perf_pow != 0)
+	{
+	  mpn_sub (rp, rp, rn, qp, qn);
+	  MPN_NORMALIZE_NOT_ZERO (rp, rn);
+
+	  /* first multiply the remainder by 2^b */
+	  MPN_LSHIFT (cy, rp + bn, rp, rn, b % GMP_NUMB_BITS);
+	  rn = rn + bn;
+	  if (cy != 0)
+	    {
+	      rp[rn] = cy;
+	      rn++;
+	    }
+
+	  save = rp[bn];
+	  /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
+	  if (nl - 1 > bn)
+	    save2 = rp[bn + 1];
+	}
+      else
+	{
+	  rn = bn;
+	  save2 = save = 0;
+	}
+      /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* Now insert bits [kk,kk+b-1] from the input U */
+      MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
+      /* set to zero high bits of rp[bn] */
+      rp[bn] &= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)) - 1;
+      /* restore corresponding bits */
+      rp[bn] |= save;
+      if (nl - 1 > bn)
+	rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
+			       they start by bit 0 in rp[0], so they use
+			       at most ceil(b/GMP_NUMB_BITS) limbs */
+      /* FIXME: Should we normalise {rp,rn} here? */
+
+      /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* compute {wp, wn} = k * {sp, sn}^(k-1) */
+      cy = mpn_mul_1 (wp, wp, wn, k);
+      wp[wn] = cy;
+      wn += cy != 0;
+
+      /* 6: current buffers: {sp,sn}, {qp,qn} */
+
+      /* multiply the root approximation by 2^b */
+      MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
+      sn = sn + b / GMP_NUMB_BITS;
+      if (cy != 0)
+	{
+	  sp[sn] = cy;
+	  sn++;
+	}
+
+      save = sp[b / GMP_NUMB_BITS];
+
+      /* Number of limbs used by b bits, when least significant bit is
+	 aligned to least limb */
+      bn = (b - 1) / GMP_NUMB_BITS + 1;
+
+      /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
+
+      /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
+      if (UNLIKELY (rn < wn))
+	{
+	  MPN_FILL (sp, bn, 0);
+	}
+      else
+	{
+	  qn = rn - wn; /* expected quotient size */
+	  if (qn <= bn) { /* Divide only if result is not too big. */
+	    mpn_div_q (qp, rp, rn, wp, wn, scratch);
+	    qn += qp[qn] != 0;
+	  }
+
+      /* 5: current buffers: {sp,sn}, {qp,qn}.
+	 Note: {rp,rn} is not needed any more since we'll compute it from
+	 scratch at the end of the loop.
+       */
+
+      /* the quotient should be smaller than 2^b, since the previous
+	 approximation was correctly rounded toward zero */
+	  if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
+			  qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS))))
+	    {
+	      for (qn = 1; qn < bn; ++qn)
+		sp[qn - 1] = GMP_NUMB_MAX;
+	      sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - 1 - ((b - 1) % GMP_NUMB_BITS));
+	    }
+	  else
+	    {
+      /* 7: current buffers: {sp,sn}, {qp,qn} */
+
+      /* Combine sB and q to form sB + q.  */
+	      MPN_COPY (sp, qp, qn);
+	      MPN_ZERO (sp + qn, bn - qn);
+	    }
+	}
+      sp[b / GMP_NUMB_BITS] |= save;
+
+      /* 8: current buffer: {sp,sn} */
+
+    }
+
+  /* otherwise we have rn > 0, thus the return value is ok */
+  if (!approx || sp[0] <= CNST_LIMB (1))
+    {
+      for (c = 0;; c++)
+	{
+	  /* Compute S^k in {qp,qn}. */
+	  /* Last iteration: we don't need W anymore. */
+	  /* mpn_pow_1 requires that both qp and wp have enough
+	     space to store the result {sp,sn}^k + 1 limb */
+	  qn = mpn_pow_1 (qp, sp, sn, k, wp);
+
+	  perf_pow = 1;
+	  if (qn > un || (qn == un && (perf_pow=mpn_cmp (qp, up, un)) > 0))
+	    MPN_DECR_U (sp, sn, 1);
+	  else
+	    break;
+	}
+
+      /* sometimes two corrections are needed with logbased_root */
+      ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS);
+
+      rn = perf_pow != 0;
+      if (rn != 0 && remp != NULL)
+	{
+	  mpn_sub (remp, up, un, qp, qn);
+	  rn = un;
+	  MPN_NORMALIZE_NOT_ZERO (remp, rn);
+	}
+    }
+
+  TMP_FREE;
+  return rn;
+}
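+
+/* A standalone sketch (not compiled here, hence the #if 0 guard) of
+   reaching this routine through the public mpz-level wrapper; it assumes
+   <gmp.h> and linking against -lgmp, and the input value is arbitrary: */
+#if 0
+#include <stdio.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mpz_t u, root, rem, check;
+  mpz_inits (u, root, rem, check, NULL);
+
+  mpz_set_str (u, "1000000000000000000000000", 10);	/* 10^24 */
+  mpz_rootrem (root, rem, u, 3);	/* root = floor(cbrt(u)), rem = u - root^3 */
+
+  mpz_pow_ui (check, root, 3);		/* verify u == root^3 + rem */
+  mpz_add (check, check, rem);
+  printf ("identity holds: %d\n", mpz_cmp (check, u) == 0);
+
+  mpz_clears (u, root, rem, check, NULL);
+  return 0;
+}
+#endif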
diff --git a/third_party/gmp/mpn/generic/rshift.c b/third_party/gmp/mpn/generic/rshift.c
new file mode 100644
index 0000000..15d427d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/rshift.c
@@ -0,0 +1,69 @@
+/* mpn_rshift -- Shift right low level.
+
+Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Shift U (pointed to by up and n limbs long) cnt bits to the right
+   and store the n least significant limbs of the result at rp.
+   The bits shifted out to the right are returned.
+
+   Argument constraints:
+   1. 0 < cnt < GMP_NUMB_BITS.
+   2. If the result is to be written over the input, rp must be <= up.
+*/
+
+mp_limb_t
+mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
+{
+  mp_limb_t high_limb, low_limb;
+  unsigned int tnc;
+  mp_size_t i;
+  mp_limb_t retval;
+
+  ASSERT (n >= 1);
+  ASSERT (cnt >= 1);
+  ASSERT (cnt < GMP_NUMB_BITS);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+
+  tnc = GMP_NUMB_BITS - cnt;
+  high_limb = *up++;
+  retval = (high_limb << tnc) & GMP_NUMB_MASK;
+  low_limb = high_limb >> cnt;
+
+  for (i = n - 1; i != 0; i--)
+    {
+      high_limb = *up++;
+      *rp++ = low_limb | ((high_limb << tnc) & GMP_NUMB_MASK);
+      low_limb = high_limb >> cnt;
+    }
+  *rp = low_limb;
+
+  return retval;
+}
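+
+/* A standalone usage sketch (guarded out of this translation unit);
+   assumes 64-bit limbs without nails, so GMP_NUMB_BITS == 64: */
+#if 0
+#include <stdio.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mp_limb_t up[2] = { 0xF0, 1 };	/* U = B + 0xF0, with B = 2^64 */
+  mp_limb_t rp[2];
+
+  /* Shift right by 4; the bits shifted out come back in the high end of
+     the return value (zero here, since 0xF0 ends in four zero bits).  */
+  mp_limb_t out = mpn_rshift (rp, up, 2, 4);
+  printf ("rp[1]=%lu rp[0]=0x%lx out=0x%lx\n",
+	  (unsigned long) rp[1], (unsigned long) rp[0], (unsigned long) out);
+  return 0;
+}
+#endif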
diff --git a/third_party/gmp/mpn/generic/sbpi1_bdiv_q.c b/third_party/gmp/mpn/generic/sbpi1_bdiv_q.c
new file mode 100644
index 0000000..850e593
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sbpi1_bdiv_q.c
@@ -0,0 +1,96 @@
+/* mpn_sbpi1_bdiv_q -- schoolbook Hensel division with precomputed inverse,
+   returning quotient only.
+
+   Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
+   IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
+   ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2005, 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* Computes Q = - U / D mod B^un, destroys U.
+
+   D must be odd.  dinv is (-D)^-1 mod B.  */
+
+void
+mpn_sbpi1_bdiv_q (mp_ptr qp,
+		  mp_ptr up, mp_size_t un,
+		  mp_srcptr dp, mp_size_t dn,
+		  mp_limb_t dinv)
+{
+  mp_size_t i;
+  mp_limb_t q;
+
+  ASSERT (dn > 0);
+  ASSERT (un >= dn);
+  ASSERT ((dp[0] & 1) != 0);
+  ASSERT (-(dp[0] * dinv) == 1);
+  ASSERT (up == qp || !MPN_OVERLAP_P (up, un, qp, un - dn));
+
+  if (un > dn)
+    {
+      mp_limb_t cy, hi;
+      for (i = un - dn - 1, cy = 0; i > 0; i--)
+	{
+	  q = dinv * up[0];
+	  hi = mpn_addmul_1 (up, dp, dn, q);
+
+	  ASSERT (up[0] == 0);
+	  *qp++ = q;
+	  hi += cy;
+	  cy = hi < cy;
+	  hi += up[dn];
+	  cy += hi < up[dn];
+	  up[dn] = hi;
+	  up++;
+	}
+      q = dinv * up[0];
+      hi = cy + mpn_addmul_1 (up, dp, dn, q);
+      ASSERT (up[0] == 0);
+      *qp++ = q;
+      up[dn] += hi;
+      up++;
+    }
+  for (i = dn; i > 1; i--)
+    {
+      mp_limb_t q = dinv * up[0];
+      mpn_addmul_1 (up, dp, i, q);
+      ASSERT (up[0] == 0);
+      *qp++ = q;
+      up++;
+    }
+
+  /* Final limb */
+  *qp = dinv * up[0];
+}
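+
+/* The per-limb step above needs dinv = -D^{-1} mod B.  A standalone
+   single-limb sketch (guarded out; assumes B = 2^64 and uint64_t limbs)
+   of computing it by Newton iteration, each round doubling the number of
+   correct low bits, and of the cancellation the loop relies on: */
+#if 0
+#include <assert.h>
+#include <stdint.h>
+
+int
+main (void)
+{
+  uint64_t d = 0xfedcba9876543211ull;	/* any odd divisor limb */
+  uint64_t x = d;			/* d*d == 1 mod 8, so 3 bits correct */
+  int i;
+
+  for (i = 0; i < 5; i++)
+    x *= 2 - d * x;			/* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
+  assert (d * x == 1);			/* x = D^{-1} mod 2^64 */
+
+  uint64_t dinv = -x;			/* dinv = -D^{-1} mod 2^64 */
+  uint64_t u = 0x123456789abcdef0ull;	/* a partial-remainder limb */
+  uint64_t q = dinv * u;		/* quotient limb, as in the loop above */
+  assert (u + q * d == 0);		/* the low limb cancels exactly */
+  return 0;
+}
+#endif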
diff --git a/third_party/gmp/mpn/generic/sbpi1_bdiv_qr.c b/third_party/gmp/mpn/generic/sbpi1_bdiv_qr.c
new file mode 100644
index 0000000..6146c45
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sbpi1_bdiv_qr.c
@@ -0,0 +1,82 @@
+/* mpn_sbpi1_bdiv_qr -- schoolbook Hensel division with precomputed inverse,
+   returning quotient and remainder.
+
+   Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
+   IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
+   ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Computes a binary quotient of size qn = un - dn.
+   Output:
+
+      Q = -U * D^{-1} mod B^qn,
+
+      R = (U + Q * D) * B^(-qn)
+
+   Stores the dn least significant limbs of R at {up + un - dn, dn},
+   and returns the carry from the addition U + Q*D.
+
+   D must be odd. dinv is (-D)^-1 mod B. */
+
+mp_limb_t
+mpn_sbpi1_bdiv_qr (mp_ptr qp,
+		   mp_ptr up, mp_size_t un,
+		   mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+{
+  mp_size_t i;
+  mp_limb_t cy;
+
+  ASSERT (dn > 0);
+  ASSERT (un > dn);
+  ASSERT ((dp[0] & 1) != 0);
+  ASSERT (-(dp[0] * dinv) == 1);
+  ASSERT (up == qp || !MPN_OVERLAP_P (up, un, qp, un - dn));
+
+  for (i = un - dn, cy = 0; i != 0; i--)
+    {
+      mp_limb_t q = dinv * up[0];
+      mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q);
+      *qp++ = q;
+
+      hi += cy;
+      cy = hi < cy;
+      hi += up[dn];
+      cy += hi < up[dn];
+      up[dn] = hi;
+      up++;
+    }
+
+  return cy;
+}
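+
+/* The two output identities in the header comment can be checked at the
+   mpz level.  A standalone sketch (guarded out; qn = 2 and 64-bit limbs
+   are illustrative choices): */
+#if 0
+#include <assert.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mpz_t U, D, Bqn, Dinv, Q, R;
+  mpz_inits (U, D, Bqn, Dinv, Q, R, NULL);
+
+  mpz_set_str (U, "123456789123456789123456789", 10);
+  mpz_set_ui (D, 1000003);		/* odd, as required */
+  mpz_setbit (Bqn, 128);		/* B^qn = 2^(64*2) */
+
+  mpz_invert (Dinv, D, Bqn);		/* exists since D is odd */
+  mpz_mul (Q, U, Dinv);
+  mpz_neg (Q, Q);
+  mpz_mod (Q, Q, Bqn);			/* Q = -U * D^{-1} mod B^qn */
+
+  mpz_mul (R, Q, D);
+  mpz_add (R, R, U);			/* U + Q*D ... */
+  assert (mpz_divisible_p (R, Bqn));	/* ... is divisible by B^qn, */
+  mpz_divexact (R, R, Bqn);		/* so R = (U + Q*D) * B^(-qn) */
+
+  mpz_clears (U, D, Bqn, Dinv, Q, R, NULL);
+  return 0;
+}
+#endif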
diff --git a/third_party/gmp/mpn/generic/sbpi1_bdiv_r.c b/third_party/gmp/mpn/generic/sbpi1_bdiv_r.c
new file mode 100644
index 0000000..a609951
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sbpi1_bdiv_r.c
@@ -0,0 +1,79 @@
+/* mpn_sbpi1_bdiv_r -- schoolbook Hensel division with precomputed inverse,
+   returning remainder.
+
+   Contributed to the GNU project by Niels Möller and Torbjörn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.
+   IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
+   ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Computes a binary quotient of size qn = un - dn.
+   Output:
+
+      Q = -U * D^{-1} mod B^qn,
+
+      R = (U + Q * D) * B^(-qn)
+
+   Stores the dn least significant limbs of R at {up + un - dn, dn},
+   and returns the carry from the addition U + Q*D.
+
+   D must be odd. dinv is (-D)^-1 mod B. */
+
+mp_limb_t
+mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+		  mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+{
+  mp_size_t i;
+  mp_limb_t cy;
+
+  ASSERT (dn > 0);
+  ASSERT (un > dn);
+  ASSERT ((dp[0] & 1) != 0);
+  ASSERT (-(dp[0] * dinv) == 1);
+
+  for (i = un - dn, cy = 0; i != 0; i--)
+    {
+      mp_limb_t q = dinv * up[0];
+      mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q);
+
+      hi += cy;
+      cy = hi < cy;
+      hi += up[dn];
+      cy += hi < up[dn];
+      up[dn] = hi;
+      up++;
+    }
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/sbpi1_div_q.c b/third_party/gmp/mpn/generic/sbpi1_div_q.c
new file mode 100644
index 0000000..a9975eb
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sbpi1_div_q.c
@@ -0,0 +1,302 @@
+/* mpn_sbpi1_div_q -- Schoolbook division using the Möller-Granlund 3/2
+   division algorithm.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_sbpi1_div_q (mp_ptr qp,
+		 mp_ptr np, mp_size_t nn,
+		 mp_srcptr dp, mp_size_t dn,
+		 mp_limb_t dinv)
+{
+  mp_limb_t qh;
+  mp_size_t qn, i;
+  mp_limb_t n1, n0;
+  mp_limb_t d1, d0;
+  mp_limb_t cy, cy1;
+  mp_limb_t q;
+  mp_limb_t flag;
+
+  mp_size_t dn_orig = dn;
+  mp_srcptr dp_orig = dp;
+  mp_ptr np_orig = np;
+
+  ASSERT (dn > 2);
+  ASSERT (nn >= dn);
+  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+  np += nn;
+
+  qn = nn - dn;
+  if (qn + 1 < dn)
+    {
+      dp += dn - (qn + 1);
+      dn = qn + 1;
+    }
+
+  qh = mpn_cmp (np - dn, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (np - dn, np - dn, dp, dn);
+
+  qp += qn;
+
+  dn -= 2;			/* offset dn by 2 for main division loops,
+				   saving two iterations in mpn_submul_1.  */
+  d1 = dp[dn + 1];
+  d0 = dp[dn + 0];
+
+  np -= 2;
+
+  n1 = np[1];
+
+  for (i = qn - (dn + 2); i >= 0; i--)
+    {
+      np--;
+      if (UNLIKELY (n1 == d1) && np[1] == d0)
+	{
+	  q = GMP_NUMB_MASK;
+	  mpn_submul_1 (np - dn, dp, dn + 2, q);
+	  n1 = np[1];		/* update n1, last loop's value will now be invalid */
+	}
+      else
+	{
+	  udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	  cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+	  cy1 = n0 < cy;
+	  n0 = (n0 - cy) & GMP_NUMB_MASK;
+	  cy = n1 < cy1;
+	  n1 -= cy1;
+	  np[0] = n0;
+
+	  if (UNLIKELY (cy != 0))
+	    {
+	      n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
+	      q--;
+	    }
+	}
+
+      *--qp = q;
+    }
+
+  flag = ~CNST_LIMB(0);
+
+  if (dn >= 0)
+    {
+      for (i = dn; i > 0; i--)
+	{
+	  np--;
+	  if (UNLIKELY (n1 >= (d1 & flag)))
+	    {
+	      q = GMP_NUMB_MASK;
+	      cy = mpn_submul_1 (np - dn, dp, dn + 2, q);
+
+	      if (UNLIKELY (n1 != cy))
+		{
+		  if (n1 < (cy & flag))
+		    {
+		      q--;
+		      mpn_add_n (np - dn, np - dn, dp, dn + 2);
+		    }
+		  else
+		    flag = 0;
+		}
+	      n1 = np[1];
+	    }
+	  else
+	    {
+	      udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	      cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+	      cy1 = n0 < cy;
+	      n0 = (n0 - cy) & GMP_NUMB_MASK;
+	      cy = n1 < cy1;
+	      n1 -= cy1;
+	      np[0] = n0;
+
+	      if (UNLIKELY (cy != 0))
+		{
+		  n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
+		  q--;
+		}
+	    }
+
+	  *--qp = q;
+
+	  /* Truncate operands.  */
+	  dn--;
+	  dp++;
+	}
+
+      np--;
+      if (UNLIKELY (n1 >= (d1 & flag)))
+	{
+	  q = GMP_NUMB_MASK;
+	  cy = mpn_submul_1 (np, dp, 2, q);
+
+	  if (UNLIKELY (n1 != cy))
+	    {
+	      if (n1 < (cy & flag))
+		{
+		  q--;
+		  add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]);
+		}
+	      else
+		flag = 0;
+	    }
+	  n1 = np[1];
+	}
+      else
+	{
+	  udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	  np[0] = n0;
+	  np[1] = n1;
+	}
+
+      *--qp = q;
+    }
+  ASSERT_ALWAYS (np[1] == n1);
+  np += 2;
+
+
+  dn = dn_orig;
+  if (UNLIKELY (n1 < (dn & flag)))
+    {
+      mp_limb_t q, x;
+
+      /* The quotient may be too large if the remainder is small.  Recompute
+	 for above ignored operand parts, until the remainder spills.
+
+	 FIXME: The quality of this code isn't the same as the code above.
+	 1. We don't compute things in an optimal order, high-to-low, in order
+	    to terminate as quickly as possible.
+	 2. We mess with pointers and sizes, adding and subtracting and
+	    adjusting to get things right.  It surely could be streamlined.
+	 3. The only termination criteria are that we determine that the
+	    quotient needs to be adjusted, or that we have recomputed
+	    everything.  We should stop when the remainder is so large
+	    that no additional subtracting could make it spill.
+	 4. If nothing else, we should not do two loops of submul_1 over the
+	    data, instead handle both the triangularization and chopping at
+	    once.  */
+
+      x = n1;
+
+      if (dn > 2)
+	{
+	  /* Compensate for triangularization.  */
+	  mp_limb_t y;
+
+	  dp = dp_orig;
+	  if (qn + 1 < dn)
+	    {
+	      dp += dn - (qn + 1);
+	      dn = qn + 1;
+	    }
+
+	  y = np[-2];
+
+	  for (i = dn - 3; i >= 0; i--)
+	    {
+	      q = qp[i];
+	      cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q);
+
+	      if (y < cy)
+		{
+		  if (x == 0)
+		    {
+		      cy = mpn_sub_1 (qp, qp, qn, 1);
+		      ASSERT_ALWAYS (cy == 0);
+		      return qh - cy;
+		    }
+		  x--;
+		}
+	      y -= cy;
+	    }
+	  np[-2] = y;
+	}
+
+      dn = dn_orig;
+      if (qn + 1 < dn)
+	{
+	  /* Compensate for ignored dividend and divisor tails.  */
+
+	  dp = dp_orig;
+	  np = np_orig;
+
+	  if (qh != 0)
+	    {
+	      cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1));
+	      if (cy != 0)
+		{
+		  if (x == 0)
+		    {
+		      if (qn != 0)
+			cy = mpn_sub_1 (qp, qp, qn, 1);
+		      return qh - cy;
+		    }
+		  x--;
+		}
+	    }
+
+	  if (qn == 0)
+	    return qh;
+
+	  for (i = dn - qn - 2; i >= 0; i--)
+	    {
+	      cy = mpn_submul_1 (np + i, qp, qn, dp[i]);
+	      cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy);
+	      if (cy != 0)
+		{
+		  if (x == 0)
+		    {
+		      cy = mpn_sub_1 (qp, qp, qn, 1);
+		      return qh;
+		    }
+		  x--;
+		}
+	    }
+	}
+    }
+
+  return qh;
+}
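+
+/* Callers normally reach this routine through the public mpn_tdiv_qr,
+   which normalizes the divisor and dispatches by operand size (the
+   schoolbook range being one of the cases).  A standalone usage sketch
+   (guarded out; assumes 64-bit limbs): */
+#if 0
+#include <stdio.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mp_limb_t np[4] = { 5, 0, 0, 1 };	/* N = B^3 + 5 */
+  mp_limb_t dp[2] = { 3, 2 };		/* D = 2*B + 3 */
+  mp_limb_t qp[3], rp[2];		/* nn - dn + 1 and dn limbs */
+
+  mpn_tdiv_qr (qp, rp, 0, np, 4, dp, 2);	/* N = Q*D + R, 0 <= R < D */
+  printf ("q0=0x%lx r0=0x%lx\n",
+	  (unsigned long) qp[0], (unsigned long) rp[0]);
+  return 0;
+}
+#endif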
diff --git a/third_party/gmp/mpn/generic/sbpi1_div_qr.c b/third_party/gmp/mpn/generic/sbpi1_div_qr.c
new file mode 100644
index 0000000..7330a77
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sbpi1_div_qr.c
@@ -0,0 +1,109 @@
+/* mpn_sbpi1_div_qr -- Schoolbook division using the Möller-Granlund 3/2
+   division algorithm.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_sbpi1_div_qr (mp_ptr qp,
+		  mp_ptr np, mp_size_t nn,
+		  mp_srcptr dp, mp_size_t dn,
+		  mp_limb_t dinv)
+{
+  mp_limb_t qh;
+  mp_size_t i;
+  mp_limb_t n1, n0;
+  mp_limb_t d1, d0;
+  mp_limb_t cy, cy1;
+  mp_limb_t q;
+
+  ASSERT (dn > 2);
+  ASSERT (nn >= dn);
+  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+  np += nn;
+
+  qh = mpn_cmp (np - dn, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (np - dn, np - dn, dp, dn);
+
+  qp += nn - dn;
+
+  dn -= 2;			/* offset dn by 2 for main division loops,
+				   saving two iterations in mpn_submul_1.  */
+  d1 = dp[dn + 1];
+  d0 = dp[dn + 0];
+
+  np -= 2;
+
+  n1 = np[1];
+
+  for (i = nn - (dn + 2); i > 0; i--)
+    {
+      np--;
+      if (UNLIKELY (n1 == d1) && np[1] == d0)
+	{
+	  q = GMP_NUMB_MASK;
+	  mpn_submul_1 (np - dn, dp, dn + 2, q);
+	  n1 = np[1];		/* update n1, last loop's value will now be invalid */
+	}
+      else
+	{
+	  udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	  cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+	  cy1 = n0 < cy;
+	  n0 = (n0 - cy) & GMP_NUMB_MASK;
+	  cy = n1 < cy1;
+	  n1 = (n1 - cy1) & GMP_NUMB_MASK;
+	  np[0] = n0;
+
+	  if (UNLIKELY (cy != 0))
+	    {
+	      n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
+	      q--;
+	    }
+	}
+
+      *--qp = q;
+    }
+  np[1] = n1;
+
+  return qh;
+}
diff --git a/third_party/gmp/mpn/generic/sbpi1_divappr_q.c b/third_party/gmp/mpn/generic/sbpi1_divappr_q.c
new file mode 100644
index 0000000..ef7ca26
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sbpi1_divappr_q.c
@@ -0,0 +1,198 @@
+/* mpn_sbpi1_divappr_q -- Schoolbook division using the Möller-Granlund 3/2
+   division algorithm, returning approximate quotient.  The quotient returned
+   is either correct, or one too large.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_sbpi1_divappr_q (mp_ptr qp,
+		     mp_ptr np, mp_size_t nn,
+		     mp_srcptr dp, mp_size_t dn,
+		     mp_limb_t dinv)
+{
+  mp_limb_t qh;
+  mp_size_t qn, i;
+  mp_limb_t n1, n0;
+  mp_limb_t d1, d0;
+  mp_limb_t cy, cy1;
+  mp_limb_t q;
+  mp_limb_t flag;
+
+  ASSERT (dn > 2);
+  ASSERT (nn >= dn);
+  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+  np += nn;
+
+  qn = nn - dn;
+  if (qn + 1 < dn)
+    {
+      dp += dn - (qn + 1);
+      dn = qn + 1;
+    }
+
+  qh = mpn_cmp (np - dn, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (np - dn, np - dn, dp, dn);
+
+  qp += qn;
+
+  dn -= 2;			/* offset dn by 2 for main division loops,
+				   saving two iterations in mpn_submul_1.  */
+  d1 = dp[dn + 1];
+  d0 = dp[dn + 0];
+
+  np -= 2;
+
+  n1 = np[1];
+
+  for (i = qn - (dn + 2); i >= 0; i--)
+    {
+      np--;
+      if (UNLIKELY (n1 == d1) && np[1] == d0)
+	{
+	  q = GMP_NUMB_MASK;
+	  mpn_submul_1 (np - dn, dp, dn + 2, q);
+	  n1 = np[1];		/* update n1, last loop's value will now be invalid */
+	}
+      else
+	{
+	  udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	  cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+	  cy1 = n0 < cy;
+	  n0 = (n0 - cy) & GMP_NUMB_MASK;
+	  cy = n1 < cy1;
+	  n1 -= cy1;
+	  np[0] = n0;
+
+	  if (UNLIKELY (cy != 0))
+	    {
+	      n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
+	      q--;
+	    }
+	}
+
+      *--qp = q;
+    }
+
+  flag = ~CNST_LIMB(0);
+
+  if (dn >= 0)
+    {
+      for (i = dn; i > 0; i--)
+	{
+	  np--;
+	  if (UNLIKELY (n1 >= (d1 & flag)))
+	    {
+	      q = GMP_NUMB_MASK;
+	      cy = mpn_submul_1 (np - dn, dp, dn + 2, q);
+
+	      if (UNLIKELY (n1 != cy))
+		{
+		  if (n1 < (cy & flag))
+		    {
+		      q--;
+		      mpn_add_n (np - dn, np - dn, dp, dn + 2);
+		    }
+		  else
+		    flag = 0;
+		}
+	      n1 = np[1];
+	    }
+	  else
+	    {
+	      udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	      cy = mpn_submul_1 (np - dn, dp, dn, q);
+
+	      cy1 = n0 < cy;
+	      n0 = (n0 - cy) & GMP_NUMB_MASK;
+	      cy = n1 < cy1;
+	      n1 -= cy1;
+	      np[0] = n0;
+
+	      if (UNLIKELY (cy != 0))
+		{
+		  n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1);
+		  q--;
+		}
+	    }
+
+	  *--qp = q;
+
+	  /* Truncate operands.  */
+	  dn--;
+	  dp++;
+	}
+
+      np--;
+      if (UNLIKELY (n1 >= (d1 & flag)))
+	{
+	  q = GMP_NUMB_MASK;
+	  cy = mpn_submul_1 (np, dp, 2, q);
+
+	  if (UNLIKELY (n1 != cy))
+	    {
+	      if (n1 < (cy & flag))
+		{
+		  q--;
+		  add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]);
+		}
+	      else
+		flag = 0;
+	    }
+	  n1 = np[1];
+	}
+      else
+	{
+	  udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv);
+
+	  np[1] = n1;
+	  np[0] = n0;
+	}
+
+      *--qp = q;
+    }
+
+  ASSERT_ALWAYS (np[1] == n1);
+
+  return qh;
+}
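+
+/* "Correct, or one too large" means every caller must be ready to adjust
+   the result.  A sketch of that fix-up at the mpz level (guarded out): */
+#if 0
+#include <gmp.h>
+
+/* Correct an approximate quotient Q of U/D that may be one too large.  */
+static void
+fixup_quotient (mpz_t Q, const mpz_t U, const mpz_t D)
+{
+  mpz_t t;
+  mpz_init (t);
+  mpz_mul (t, Q, D);
+  if (mpz_cmp (t, U) > 0)	/* Q*D > U means Q was one too large */
+    mpz_sub_ui (Q, Q, 1);
+  mpz_clear (t);
+}
+#endif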
diff --git a/third_party/gmp/mpn/generic/scan0.c b/third_party/gmp/mpn/generic/scan0.c
new file mode 100644
index 0000000..d71832e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/scan0.c
@@ -0,0 +1,59 @@
+/* mpn_scan0 -- Scan from a given bit position for the next clear bit.
+
+Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Argument constraints:
+   1. U must sooner or later have a limb with a clear bit.
+ */
+
+mp_bitcnt_t
+mpn_scan0 (mp_srcptr up, mp_bitcnt_t starting_bit)
+{
+  mp_size_t starting_word;
+  mp_limb_t alimb;
+  int cnt;
+  mp_srcptr p;
+
+  /* Start at the word implied by STARTING_BIT.  */
+  starting_word = starting_bit / GMP_NUMB_BITS;
+  p = up + starting_word;
+  alimb = *p++ ^ GMP_NUMB_MASK;
+
+  /* Mask off any bits before STARTING_BIT in the first limb.  */
+  alimb &= - (mp_limb_t) 1 << (starting_bit % GMP_NUMB_BITS);
+
+  while (alimb == 0)
+    alimb = *p++ ^ GMP_NUMB_MASK;
+
+  count_trailing_zeros (cnt, alimb);
+  return (p - up - 1) * GMP_NUMB_BITS + cnt;
+}
diff --git a/third_party/gmp/mpn/generic/scan1.c b/third_party/gmp/mpn/generic/scan1.c
new file mode 100644
index 0000000..09e8060
--- /dev/null
+++ b/third_party/gmp/mpn/generic/scan1.c
@@ -0,0 +1,59 @@
+/* mpn_scan1 -- Scan from a given bit position for the next set bit.
+
+Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Argument constraints:
+   1. U must sooner or later have a limb != 0.
+ */
+
+mp_bitcnt_t
+mpn_scan1 (mp_srcptr up, mp_bitcnt_t starting_bit)
+{
+  mp_size_t starting_word;
+  mp_limb_t alimb;
+  int cnt;
+  mp_srcptr p;
+
+  /* Start at the word implied by STARTING_BIT.  */
+  starting_word = starting_bit / GMP_NUMB_BITS;
+  p = up + starting_word;
+  alimb = *p++;
+
+  /* Mask off any bits before STARTING_BIT in the first limb.  */
+  alimb &= - (mp_limb_t) 1 << (starting_bit % GMP_NUMB_BITS);
+
+  while (alimb == 0)
+    alimb = *p++;
+
+  count_trailing_zeros (cnt, alimb);
+  return (p - up - 1) * GMP_NUMB_BITS + cnt;
+}
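+
+/* A standalone usage sketch for the two scan routines (guarded out).
+   Positions are plain bit indices across the limb array, and scanning
+   includes the starting bit itself: */
+#if 0
+#include <stdio.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  /* Low limb is ...00010000; high limb is all ones.  */
+  mp_limb_t up[2] = { 0x10, ~(mp_limb_t) 0 };
+
+  printf ("first 1 at/after bit 0: %lu\n",
+	  (unsigned long) mpn_scan1 (up, 0));	/* prints 4 */
+  printf ("first 0 at/after bit 5: %lu\n",
+	  (unsigned long) mpn_scan0 (up, 5));	/* prints 5 */
+  return 0;
+}
+#endif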
diff --git a/third_party/gmp/mpn/generic/sec_aors_1.c b/third_party/gmp/mpn/generic/sec_aors_1.c
new file mode 100644
index 0000000..6480fa1
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_aors_1.c
@@ -0,0 +1,59 @@
+/* mpn_sec_add_1, mpn_sec_sub_1
+
+   Contributed to the GNU project by Niels Möller
+
+Copyright 2013, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if OPERATION_sec_add_1
+#define FNAME mpn_sec_add_1
+#define FNAME_itch mpn_sec_add_1_itch
+#define OP_N mpn_add_n
+#endif
+#if OPERATION_sec_sub_1
+#define FNAME mpn_sec_sub_1
+#define FNAME_itch mpn_sec_sub_1_itch
+#define OP_N mpn_sub_n
+#endif
+
+/* It's annoying that we need scratch space */
+mp_size_t
+FNAME_itch (mp_size_t n)
+{
+  return n;
+}
+
+mp_limb_t
+FNAME (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_ptr scratch)
+{
+  scratch[0] = b;
+  MPN_ZERO (scratch + 1, n-1);
+  return OP_N (rp, ap, scratch, n);
+}
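+
+/* A standalone usage sketch for the generated entry points (guarded out).
+   Scratch always comes from the matching _itch call, even though it is
+   currently just n limbs: */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mp_size_t n = 4;
+  mp_limb_t ap[4] = { ~(mp_limb_t) 0, ~(mp_limb_t) 0, 0, 7 };
+  mp_limb_t rp[4];
+  mp_limb_t *tp = malloc (mpn_sec_add_1_itch (n) * sizeof (mp_limb_t));
+
+  /* rp = ap + 2, in time independent of how far the carry propagates.  */
+  mp_limb_t cy = mpn_sec_add_1 (rp, ap, n, 2, tp);
+  printf ("cy=%lu rp[2]=%lu\n", (unsigned long) cy, (unsigned long) rp[2]);
+
+  free (tp);
+  return 0;
+}
+#endif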
diff --git a/third_party/gmp/mpn/generic/sec_div.c b/third_party/gmp/mpn/generic/sec_div.c
new file mode 100644
index 0000000..1f08649
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_div.c
@@ -0,0 +1,131 @@
+/* mpn_sec_div_qr, mpn_sec_div_r -- Compute Q = floor(U / V), U = U mod V.
+   Side-channel silent under the assumption that the used instructions are
+   side-channel silent.
+
+   Contributed to the GNU project by Torbjörn Granlund.
+
+Copyright 2011-2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#if OPERATION_sec_div_qr
+#define FNAME mpn_sec_div_qr
+#define FNAME_itch mpn_sec_div_qr_itch
+#define Q(q) q,
+#define RETTYPE mp_limb_t
+#endif
+#if OPERATION_sec_div_r
+#define FNAME mpn_sec_div_r
+#define FNAME_itch mpn_sec_div_r_itch
+#define Q(q)
+#define RETTYPE void
+#endif
+
+mp_size_t
+FNAME_itch (mp_size_t nn, mp_size_t dn)
+{
+#if OPERATION_sec_div_qr
+/* Needs (nn + dn + 1) + mpn_sec_pi1_div_qr's needs of (2nn' - dn + 1) for a
+   total of 3nn + 4 limbs at tp.  Note that mpn_sec_pi1_div_qr's nn is one
+   greater than ours, therefore +4 and not just +2.  */
+  return 3 * nn + 4;
+#endif
+#if OPERATION_sec_div_r
+/* Needs (nn + dn + 1) + mpn_sec_pi1_div_r's needs of (dn + 1) for a total of
+   nn + 2dn + 2 limbs at tp.  */
+  return nn + 2 * dn + 2;
+#endif
+}
+
+RETTYPE
+FNAME (Q(mp_ptr qp)
+       mp_ptr np, mp_size_t nn,
+       mp_srcptr dp, mp_size_t dn,
+       mp_ptr tp)
+{
+  mp_limb_t d1, d0;
+  unsigned int cnt;
+  mp_limb_t inv32;
+
+  ASSERT (dn >= 1);
+  ASSERT (nn >= dn);
+  ASSERT (dp[dn - 1] != 0);
+
+  d1 = dp[dn - 1];
+  count_leading_zeros (cnt, d1);
+
+  if (cnt != 0)
+    {
+      mp_limb_t qh, cy;
+      mp_ptr np2, dp2;
+      dp2 = tp;					/* dn limbs */
+      mpn_lshift (dp2, dp, dn, cnt);
+
+      np2 = tp + dn;				/* (nn + 1) limbs */
+      cy = mpn_lshift (np2, np, nn, cnt);
+      np2[nn++] = cy;
+
+      d0 = dp2[dn - 1];
+      d0 += (~d0 != 0);
+      invert_limb (inv32, d0);
+
+      /* We add nn + dn to tp here, not nn + 1 + dn as might be expected,
+	 because nn has already been incremented.  */
+#if OPERATION_sec_div_qr
+      qh = mpn_sec_pi1_div_qr (np2 + dn, np2, nn, dp2, dn, inv32, tp + nn + dn);
+      ASSERT (qh == 0);		/* FIXME: this indicates inefficiency! */
+      MPN_COPY (qp, np2 + dn, nn - dn - 1);
+      qh = np2[nn - 1];
+#else
+      mpn_sec_pi1_div_r (np2, nn, dp2, dn, inv32, tp + nn + dn);
+#endif
+
+      mpn_rshift (np, np2, dn, cnt);
+
+#if OPERATION_sec_div_qr
+      return qh;
+#endif
+    }
+  else
+    {
+      /* FIXME: Consider copying np => np2 here, adding a 0-limb at the top.
+	 That would simplify the underlying pi1 function, since then it could
+	 assume nn > dn.  */
+      d0 = dp[dn - 1];
+      d0 += (~d0 != 0);
+      invert_limb (inv32, d0);
+
+#if OPERATION_sec_div_qr
+      return mpn_sec_pi1_div_qr (qp, np, nn, dp, dn, inv32, tp);
+#else
+      mpn_sec_pi1_div_r (np, nn, dp, dn, inv32, tp);
+#endif
+    }
+}
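+
+/* A standalone usage sketch (guarded out).  The dividend is destroyed
+   and its low dn limbs receive the remainder; scratch comes from the
+   matching _itch call: */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mp_limb_t np[3] = { 17, 0, 1 };	/* N = B^2 + 17, destroyed below */
+  mp_limb_t dp[1] = { 5 };
+  mp_limb_t qp[2];			/* nn - dn limbs */
+  mp_limb_t *tp = malloc (mpn_sec_div_qr_itch (3, 1) * sizeof (mp_limb_t));
+
+  mp_limb_t qh = mpn_sec_div_qr (qp, np, 3, dp, 1, tp);
+  /* Quotient is qh*B^2 + {qp, 2}; remainder is left in {np, 1}.  */
+  printf ("qh=%lu r=%lu\n", (unsigned long) qh, (unsigned long) np[0]);
+
+  free (tp);
+  return 0;
+}
+#endif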
diff --git a/third_party/gmp/mpn/generic/sec_invert.c b/third_party/gmp/mpn/generic/sec_invert.c
new file mode 100644
index 0000000..07665d1
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_invert.c
@@ -0,0 +1,177 @@
+/* mpn_sec_invert
+
+   Contributed to the GNU project by Niels Möller
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if 0
+/* Currently unused. Should be resurrected once mpn_cnd_neg is
+   advertised. */
+static mp_size_t
+mpn_cnd_neg_itch (mp_size_t n)
+{
+  return n;
+}
+#endif
+
+/* FIXME: Ought to return carry */
+static void
+mpn_cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n,
+	     mp_ptr scratch)
+{
+  mpn_lshift (scratch, ap, n, 1);
+  mpn_cnd_sub_n (cnd, rp, ap, scratch, n);
+}
+
+static int
+mpn_sec_eq_ui (mp_srcptr ap, mp_size_t n, mp_limb_t b)
+{
+  mp_limb_t d;
+  ASSERT (n > 0);
+
+  d = ap[0] ^ b;
+
+  while (--n > 0)
+    d |= ap[n];
+
+  return d == 0;
+}
+
+mp_size_t
+mpn_sec_invert_itch (mp_size_t n)
+{
+  return 4*n;
+}
+
+/* Compute V <-- A^{-1} (mod M), in data-independent time. M must be
+   odd. Returns 1 on success, and 0 on failure (i.e., if gcd (A, m) !=
+   1). Inputs and outputs of size n, and no overlap allowed. The {ap,
+   n} area is destroyed. For arbitrary inputs, bit_size should be
+   2*n*GMP_NUMB_BITS, but if A or M are known to be smaller, e.g., if
+   M = 2^521 - 1 and A < M, bit_size can be any bound on the sum of
+   the bit sizes of A and M. */
+int
+mpn_sec_invert (mp_ptr vp, mp_ptr ap, mp_srcptr mp,
+		mp_size_t n, mp_bitcnt_t bit_size,
+		mp_ptr scratch)
+{
+  ASSERT (n > 0);
+  ASSERT (bit_size > 0);
+  ASSERT (mp[0] & 1);
+  ASSERT (! MPN_OVERLAP_P (ap, n, vp, n));
+#define bp (scratch + n)
+#define up (scratch + 2*n)
+#define m1hp (scratch + 3*n)
+
+  /* Maintain
+
+       a = u * orig_a (mod m)
+       b = v * orig_a (mod m)
+
+     and b odd at all times. Initially,
+
+       a = a_orig, u = 1
+       b = m,      v = 0
+     */
+
+
+  up[0] = 1;
+  mpn_zero (up+1, n - 1);
+  mpn_copyi (bp, mp, n);
+  mpn_zero (vp, n);
+
+  ASSERT_CARRY (mpn_rshift (m1hp, mp, n, 1));
+  ASSERT_NOCARRY (mpn_sec_add_1 (m1hp, m1hp, n, 1, scratch));
+
+  while (bit_size-- > 0)
+    {
+      mp_limb_t odd, swap, cy;
+
+      /* Always maintain b odd. The logic of the iteration is as
+	 follows. For a, b:
+
+	   odd = a & 1
+	   a -= odd * b
+	   if (underflow from a-b)
+	     {
+	       b += a, assigns old a
+	       a = B^n-a
+	     }
+
+	   a /= 2
+
+	 For u, v:
+
+	   if (underflow from a - b)
+	     swap u, v
+	   u -= odd * v
+	   if (underflow from u - v)
+	     u += m
+
+	   u /= 2
+	   if (a one bit was shifted out)
+	     u += (m+1)/2
+
+	 As long as a > 0, the quantity
+
+	   (bitsize of a) + (bitsize of b)
+
+	 is reduced by at least one bit per iteration, hence after (bit_size of
+	 orig_a) + (bit_size of m) - 1 iterations we surely have a = 0. Then b
+	 = gcd(orig_a, m) and if b = 1 then also v = orig_a^{-1} (mod m).
+      */
+
+      ASSERT (bp[0] & 1);
+      odd = ap[0] & 1;
+
+      swap = mpn_cnd_sub_n (odd, ap, ap, bp, n);
+      mpn_cnd_add_n (swap, bp, bp, ap, n);
+      mpn_cnd_neg (swap, ap, ap, n, scratch);
+
+      mpn_cnd_swap (swap, up, vp, n);
+      cy = mpn_cnd_sub_n (odd, up, up, vp, n);
+      cy -= mpn_cnd_add_n (cy, up, up, mp, n);
+      ASSERT (cy == 0);
+
+      cy = mpn_rshift (ap, ap, n, 1);
+      ASSERT (cy == 0);
+      cy = mpn_rshift (up, up, n, 1);
+      cy = mpn_cnd_add_n (cy, up, up, m1hp, n);
+      ASSERT (cy == 0);
+    }
+  /* Should be all zeros, but check only extreme limbs */
+  ASSERT ( (ap[0] | ap[n-1]) == 0);
+  /* Check if indeed gcd == 1. */
+  return mpn_sec_eq_ui (bp, n, 1);
+#undef bp
+#undef up
+#undef m1hp
+}
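+
+/* A standalone usage sketch (guarded out).  The input a is destroyed;
+   bit_size = 2*n*GMP_NUMB_BITS is the safe general-purpose bound from
+   the comment above: */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mp_size_t n = 1;
+  mp_limb_t ap[1] = { 3 };		/* destroyed by the call */
+  mp_limb_t mod[1] = { 7 };		/* modulus, must be odd */
+  mp_limb_t vp[1];
+  mp_limb_t *tp = malloc (mpn_sec_invert_itch (n) * sizeof (mp_limb_t));
+
+  int ok = mpn_sec_invert (vp, ap, mod, n, 2 * n * GMP_NUMB_BITS, tp);
+  printf ("ok=%d inverse=%lu\n", ok, (unsigned long) vp[0]);	/* 3^-1 mod 7 = 5 */
+
+  free (tp);
+  return 0;
+}
+#endif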
diff --git a/third_party/gmp/mpn/generic/sec_mul.c b/third_party/gmp/mpn/generic/sec_mul.c
new file mode 100644
index 0000000..4bbfa61
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_mul.c
@@ -0,0 +1,48 @@
+/* mpn_sec_mul.
+
+   Contributed to the GNU project by Torbjörn Granlund.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_sec_mul (mp_ptr rp,
+	     mp_srcptr ap, mp_size_t an,
+	     mp_srcptr bp, mp_size_t bn,
+	     mp_ptr tp)
+{
+  mpn_mul_basecase (rp, ap, an, bp, bn);
+}
+
+mp_size_t
+mpn_sec_mul_itch (mp_size_t an, mp_size_t bn)
+{
+  return 0;
+}
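+
+/* A standalone usage sketch (guarded out).  The product takes an + bn
+   limbs at rp, an >= bn > 0 is required, and although the itch function
+   currently returns 0, callers should still consult it: */
+#if 0
+#include <stdio.h>
+#include <gmp.h>
+
+int
+main (void)
+{
+  mp_limb_t ap[2] = { 4, 1 };		/* A = B + 4 */
+  mp_limb_t bp[1] = { 3 };
+  mp_limb_t rp[3];			/* an + bn limbs */
+
+  mpn_sec_mul (rp, ap, 2, bp, 1, NULL);	/* mpn_sec_mul_itch (2, 1) == 0 */
+  printf ("rp[1]=%lu rp[0]=%lu\n",
+	  (unsigned long) rp[1], (unsigned long) rp[0]);	/* 3 and 12 */
+  return 0;
+}
+#endif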
diff --git a/third_party/gmp/mpn/generic/sec_pi1_div.c b/third_party/gmp/mpn/generic/sec_pi1_div.c
new file mode 100644
index 0000000..29d01e7
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_pi1_div.c
@@ -0,0 +1,172 @@
+/* mpn_sec_pi1_div_qr, mpn_sec_pi1_div_r -- Compute Q = floor(U / V), U = U
+   mod V.  Side-channel silent under the assumption that the used instructions
+   are side-channel silent.
+
+   Contributed to the GNU project by Torbjörn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011-2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* This side-channel silent division algorithm reduces the partial remainder by
+   GMP_NUMB_BITS/2 bits at a time, compared to GMP_NUMB_BITS for the main
+   division algorithm.  We actually do not insist on reducing by exactly
+   GMP_NUMB_BITS/2, but may leave a partial remainder that is D*B^i to 3D*B^i
+   too large (B is the limb base, D is the divisor, and i is the induction
+   variable); the subsequent step will handle the extra partial remainder bits.
+
+   With that partial remainder reduction, each step generates a quotient "half
+   limb".  The outer loop generates two quotient half limbs, an upper (q1h) and
+   a lower (q0h) which are stored sparsely in separate limb arrays.  These
+   arrays are added at the end; using separate arrays avoids data-dependent
+   carry propagation which could otherwise pose a side-channel leakage problem.
+
+   The quotient half limbs may be off by between -3 and 0 from the accurate value
+   ("accurate" being the one which corresponds to a reduction to a principal
+   partial remainder).  Too small quotient half limbs correspond to too large
+   remainders, which we reduce later, as described above.
+
+   In order to keep quotients from getting too big, corresponding to a negative
+   partial remainder, we use an inverse which is slightly smaller than usual.
+*/
+
+#if OPERATION_sec_pi1_div_qr
+/* Needs (dn + 1) + (nn - dn) + (nn - dn) = 2nn - dn + 1 limbs at tp. */
+#define FNAME mpn_sec_pi1_div_qr
+#define Q(q) q,
+#define RETTYPE mp_limb_t
+#endif
+#if OPERATION_sec_pi1_div_r
+/* Needs (dn + 1) limbs at tp.  */
+#define FNAME mpn_sec_pi1_div_r
+#define Q(q)
+#define RETTYPE void
+#endif
+
+RETTYPE
+FNAME (Q(mp_ptr qp)
+       mp_ptr np, mp_size_t nn,
+       mp_srcptr dp, mp_size_t dn,
+       mp_limb_t dinv,
+       mp_ptr tp)
+{
+  mp_limb_t nh, cy, q1h, q0h, dummy, cnd;
+  mp_size_t i;
+  mp_ptr hp;
+#if OPERATION_sec_pi1_div_qr
+  mp_limb_t qh;
+  mp_ptr qlp, qhp;
+#endif
+
+  ASSERT (dn >= 1);
+  ASSERT (nn >= dn);
+  ASSERT ((dp[dn - 1] & GMP_NUMB_HIGHBIT) != 0);
+
+  if (nn == dn)
+    {
+      cy = mpn_sub_n (np, np, dp, dn);
+      mpn_cnd_add_n (cy, np, np, dp, dn);
+#if OPERATION_sec_pi1_div_qr
+      return 1 - cy;
+#else
+      return;
+#endif
+    }
+
+  /* Create a divisor copy shifted half a limb.  */
+  hp = tp;					/* (dn + 1) limbs */
+  hp[dn] = mpn_lshift (hp, dp, dn, GMP_NUMB_BITS / 2);
+
+#if OPERATION_sec_pi1_div_qr
+  qlp = tp + (dn + 1);				/* (nn - dn) limbs */
+  qhp = tp + (nn + 1);				/* (nn - dn) limbs */
+#endif
+
+  np += nn - dn;
+  nh = 0;
+
+  for (i = nn - dn - 1; i >= 0; i--)
+    {
+      np--;
+
+      nh = (nh << GMP_NUMB_BITS/2) + (np[dn] >> GMP_NUMB_BITS/2);
+      umul_ppmm (q1h, dummy, nh, dinv);
+      q1h += nh;
+#if OPERATION_sec_pi1_div_qr
+      qhp[i] = q1h;
+#endif
+      mpn_submul_1 (np, hp, dn + 1, q1h);
+
+      nh = np[dn];
+      umul_ppmm (q0h, dummy, nh, dinv);
+      q0h += nh;
+#if OPERATION_sec_pi1_div_qr
+      qlp[i] = q0h;
+#endif
+      nh -= mpn_submul_1 (np, dp, dn, q0h);
+    }
+
+  /* 1st adjustment depends on extra high remainder limb.  */
+  cnd = nh != 0;				/* FIXME: cmp-to-int */
+#if OPERATION_sec_pi1_div_qr
+  qlp[0] += cnd;
+#endif
+  nh -= mpn_cnd_sub_n (cnd, np, np, dp, dn);
+
+  /* 2nd adjustment depends on remainder/divisor comparison as well as whether
+     extra remainder limb was nullified by previous subtract.  */
+  cy = mpn_sub_n (np, np, dp, dn);
+  cy = cy - nh;
+#if OPERATION_sec_pi1_div_qr
+  qlp[0] += 1 - cy;
+#endif
+  mpn_cnd_add_n (cy, np, np, dp, dn);
+
+  /* 3rd adjustment depends on remainder/divisor comparison.  */
+  cy = mpn_sub_n (np, np, dp, dn);
+#if OPERATION_sec_pi1_div_qr
+  qlp[0] += 1 - cy;
+#endif
+  mpn_cnd_add_n (cy, np, np, dp, dn);
+
+#if OPERATION_sec_pi1_div_qr
+  /* Combine quotient halves into final quotient.  */
+  qh = mpn_lshift (qhp, qhp, nn - dn, GMP_NUMB_BITS/2);
+  qh += mpn_add_n (qp, qhp, qlp, nn - dn);
+
+  return qh;
+#else
+  return;
+#endif
+}
diff --git a/third_party/gmp/mpn/generic/sec_powm.c b/third_party/gmp/mpn/generic/sec_powm.c
new file mode 100644
index 0000000..3a78c66
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_powm.c
@@ -0,0 +1,383 @@
+/* mpn_sec_powm -- Compute R = U^E mod M.  Secure variant, side-channel silent
+   under the assumption that the multiply instruction is side channel silent.
+
+   Contributed to the GNU project by Torbjörn Granlund.
+
+Copyright 2007-2009, 2011-2014, 2018-2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/*
+  BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd.
+
+  1. T <- (B^n * U) mod M; convert to REDC form
+
+  2. Compute table U^0, U^1, U^2... of floor(log(E))-dependent size
+
+  3. While there are more bits in E
+       W <- power left-to-right base-k
+
+  The article "Defeating modexp side-channel attacks with data-independent
+  execution traces", https://gmplib.org/~tege/modexp-silent.pdf, has details.
+
+
+  TODO:
+
+   * Make getbits a macro, thereby allowing it to update the index operand.
+     That will simplify the code using getbits.  (Perhaps make getbits' sibling
+     getbit then have similar form, for symmetry.)
+
+   * Choose window size without looping.  (Superoptimize or think(tm).)
+
+   * REDC_1_TO_REDC_2_THRESHOLD might actually represent the cutoff between
+     redc_1 and redc_n.  On such systems, we will switch to redc_2 causing
+     slowdown.
+*/
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#undef MPN_REDC_1_SEC
+#if HAVE_NATIVE_mpn_sbpi1_bdiv_r
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm)				\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm);			\
+    mpn_cnd_sub_n (cy, rp, up + n, mp, n);				\
+  } while (0)
+#else
+#define MPN_REDC_1_SEC(rp, up, mp, n, invm)				\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_redc_1 (rp, up, mp, n, invm);				\
+    mpn_cnd_sub_n (cy, rp, rp, mp, n);					\
+  } while (0)
+#endif
+
+#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
+#undef MPN_REDC_2_SEC
+#define MPN_REDC_2_SEC(rp, up, mp, n, mip)				\
+  do {									\
+    mp_limb_t cy;							\
+    cy = mpn_redc_2 (rp, up, mp, n, mip);				\
+    mpn_cnd_sub_n (cy, rp, rp, mp, n);					\
+  } while (0)
+#else
+#define MPN_REDC_2_SEC(rp, up, mp, n, mip) /* empty */
+#undef REDC_1_TO_REDC_2_THRESHOLD
+#define REDC_1_TO_REDC_2_THRESHOLD MP_SIZE_T_MAX
+#endif
+
+/* Define our own mpn squaring function.  We do this since we cannot use a
+   native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over
+   SQR_TOOM2_THRESHOLD.  This is so because of fixed size stack allocations
+   made inside mpn_sqr_basecase.  */
+
+#if ! HAVE_NATIVE_mpn_sqr_basecase
+/* The limit of the generic code is SQR_TOOM2_THRESHOLD.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+
+#if HAVE_NATIVE_mpn_sqr_basecase
+#ifdef TUNE_SQR_TOOM2_MAX
+/* We slightly abuse TUNE_SQR_TOOM2_MAX here.  If it is set for an assembly
+   mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly
+   file.  An assembly mpn_sqr_basecase that does not define it should allow
+   any size.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+#endif
+
+#ifdef WANT_FAT_BINARY
+/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from
+   __gmpn_cpuvec.  Perhaps any possible sqr_basecase.asm allows any size, and we
+   limit the use unnecessarily.  We cannot tell, so play it safe.  FIXME.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+
+#ifndef SQR_BASECASE_LIM
+/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand
+   size.  */
+#define SQR_BASECASE_LIM  MP_SIZE_T_MAX
+#endif
+
+#define mpn_local_sqr(rp,up,n)						\
+  do {									\
+    if (ABOVE_THRESHOLD (n, SQR_BASECASE_THRESHOLD)			\
+	&& BELOW_THRESHOLD (n, SQR_BASECASE_LIM))			\
+      mpn_sqr_basecase (rp, up, n);					\
+    else								\
+      mpn_mul_basecase(rp, up, n, up, n);				\
+  } while (0)
+
+#define getbit(p,bi) \
+  ((p[(bi - 1) / GMP_NUMB_BITS] >> (bi - 1) % GMP_NUMB_BITS) & 1)
+
+/* FIXME: Maybe some things would get simpler if all callers ensure
+   that bi >= nbits. As far as I understand, with the current code bi
+   < nbits can happen only for the final iteration. */
+static inline mp_limb_t
+getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits)
+{
+  int nbits_in_r;
+  mp_limb_t r;
+  mp_size_t i;
+
+  if (bi < nbits)
+    {
+      return p[0] & (((mp_limb_t) 1 << bi) - 1);
+    }
+  else
+    {
+      bi -= nbits;			/* bit index of low bit to extract */
+      i = bi / GMP_NUMB_BITS;		/* word index of low bit to extract */
+      bi %= GMP_NUMB_BITS;		/* bit index in low word */
+      r = p[i] >> bi;			/* extract (low) bits */
+      nbits_in_r = GMP_NUMB_BITS - bi;	/* number of bits now in r */
+      if (nbits_in_r < nbits)		/* did we get enough bits? */
+	r += p[i + 1] << nbits_in_r;	/* prepend bits from higher word */
+      return r & (((mp_limb_t ) 1 << nbits) - 1);
+    }
+}
+
+#ifndef POWM_SEC_TABLE
+#if GMP_NUMB_BITS < 50
+#define POWM_SEC_TABLE  2,33,96,780,2741
+#else
+#define POWM_SEC_TABLE  2,130,524,2578
+#endif
+#endif
+
+#if TUNE_PROGRAM_BUILD
+extern int win_size (mp_bitcnt_t);
+#else
+static inline int
+win_size (mp_bitcnt_t enb)
+{
+  int k;
+  /* Find k, such that x[k-1] < enb <= x[k].
+
+     We require that x[k] >= k, then it follows that enb > x[k-1] >=
+     k-1, which implies k <= enb.
+  */
+  static const mp_bitcnt_t x[] = {0,POWM_SEC_TABLE,~(mp_bitcnt_t)0};
+  for (k = 1; enb > x[k]; k++)
+    ;
+  ASSERT (k <= enb);
+  return k;
+}
+#endif
+
+/* Convert U to REDC form, U_r = B^n * U mod M.
+   Uses scratch space at tp of size 2un + n + 1.  */
+static void
+redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp)
+{
+  MPN_ZERO (tp, n);
+  MPN_COPY (tp + n, up, un);
+
+  mpn_sec_div_r (tp, un + n, mp, n, tp + un + n);
+  MPN_COPY (rp, tp, n);
+}
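+
+/* Sketch of why REDC form composes: with U_r = B^n * U mod M and
+   V_r = B^n * V mod M, one Montgomery reduction of the 2n-limb product gives
+   redc (U_r * V_r) = B^-n * U_r * V_r = B^n * (U * V) mod M, so the
+   representation is preserved across multiplications.  */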
+
+/* {rp, n} <-- {bp, bn} ^ {ep, en} mod {mp, n},
+   where en = ceil (enb / GMP_NUMB_BITS)
+   Requires that {mp, n} is odd (and hence also mp[0] odd).
+   Uses scratch space at tp as defined by mpn_sec_powm_itch.  */
+void
+mpn_sec_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn,
+	      mp_srcptr ep, mp_bitcnt_t enb,
+	      mp_srcptr mp, mp_size_t n, mp_ptr tp)
+{
+  mp_limb_t ip[2], *mip;
+  int windowsize, this_windowsize;
+  mp_limb_t expbits;
+  mp_ptr pp, this_pp, ps;
+  long i;
+  int cnd;
+
+  ASSERT (enb > 0);
+  ASSERT (n > 0);
+  /* The code works for bn = 0, but the defined scratch space is 2 limbs
+     greater than we supply when converting 1 to REDC form.  */
+  ASSERT (bn > 0);
+  ASSERT ((mp[0] & 1) != 0);
+
+  windowsize = win_size (enb);
+
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+      mip = ip;
+      binvert_limb (mip[0], mp[0]);
+      mip[0] = -mip[0];
+    }
+  else
+    {
+      mip = ip;
+      mpn_binvert (mip, mp, 2, tp);
+      mip[0] = -mip[0]; mip[1] = ~mip[1];
+    }
+
+  pp = tp;
+  tp += (n << windowsize);	/* put tp after power table */
+
+  /* Compute pp[0] table entry */
+  /* scratch: |   n   | 1 |   n+2    |  */
+  /*          | pp[0] | 1 | redcify  |  */
+  this_pp = pp;
+  this_pp[n] = 1;
+  redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1);
+  this_pp += n;
+
+  /* Compute pp[1] table entry.  To avoid excessive scratch usage in the
+     degenerate situation where B >> M, we let redcify use scratch space which
+     will later be used by the pp table (element 2 and up).  */
+  /* scratch: |   n   |   n   |  bn + n + 1  |  */
+  /*          | pp[0] | pp[1] |   redcify    |  */
+  redcify (this_pp, bp, bn, mp, n, this_pp + n);
+
+  /* Precompute powers of b and put them in the temporary area at pp.  */
+  /* scratch: |   n   |   n   | ...  |                    |   2n      |  */
+  /*          | pp[0] | pp[1] | ...  | pp[2^windowsize-1] |  product  |  */
+  ps = pp + n;		/* initially B^1 */
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+      for (i = (1 << windowsize) - 2; i > 0; i -= 2)
+	{
+	  mpn_local_sqr (tp, ps, n);
+	  ps += n;
+	  this_pp += n;
+	  MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
+
+	  mpn_mul_basecase (tp, this_pp, n, pp + n, n);
+	  this_pp += n;
+	  MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]);
+	}
+    }
+  else
+    {
+      for (i = (1 << windowsize) - 2; i > 0; i -= 2)
+	{
+	  mpn_local_sqr (tp, ps, n);
+	  ps += n;
+	  this_pp += n;
+	  MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);
+
+	  mpn_mul_basecase (tp, this_pp, n, pp + n, n);
+	  this_pp += n;
+	  MPN_REDC_2_SEC (this_pp, tp, mp, n, mip);
+	}
+    }
+
+  expbits = getbits (ep, enb, windowsize);
+  ASSERT_ALWAYS (enb >= windowsize);
+  enb -= windowsize;
+
+  mpn_sec_tabselect (rp, pp, n, 1 << windowsize, expbits);
+
+  /* Main exponentiation loop.  */
+  /* scratch: |   n   |   n   | ...  |                    |     3n-4n     |  */
+  /*          | pp[0] | pp[1] | ...  | pp[2^windowsize-1] |  loop scratch |  */
+
+#define INNERLOOP							\
+  while (enb != 0)							\
+    {									\
+      expbits = getbits (ep, enb, windowsize);				\
+      this_windowsize = windowsize;					\
+      if (enb < windowsize)						\
+	{								\
+	  this_windowsize -= windowsize - enb;				\
+	  enb = 0;							\
+	}								\
+      else								\
+	enb -= windowsize;						\
+									\
+      do								\
+	{								\
+	  mpn_local_sqr (tp, rp, n);					\
+	  MPN_REDUCE (rp, tp, mp, n, mip);				\
+	  this_windowsize--;						\
+	}								\
+      while (this_windowsize != 0);					\
+									\
+      mpn_sec_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits);	\
+      mpn_mul_basecase (tp, rp, n, tp + 2*n, n);			\
+									\
+      MPN_REDUCE (rp, tp, mp, n, mip);					\
+    }
+
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    {
+#undef MPN_REDUCE
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_1_SEC (rp, tp, mp, n, mip[0])
+      INNERLOOP;
+    }
+  else
+    {
+#undef MPN_REDUCE
+#define MPN_REDUCE(rp,tp,mp,n,mip)	MPN_REDC_2_SEC (rp, tp, mp, n, mip)
+      INNERLOOP;
+    }
+
+  MPN_COPY (tp, rp, n);
+  MPN_ZERO (tp + n, n);
+
+  if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD))
+    MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]);
+  else
+    MPN_REDC_2_SEC (rp, tp, mp, n, mip);
+
+  cnd = mpn_sub_n (tp, rp, mp, n);	/* we need just retval */
+  mpn_cnd_sub_n (!cnd, rp, rp, mp, n);
+}
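+
+/* Illustrative caller-side sketch (not part of the library): scratch is sized
+   with mpn_sec_powm_itch; the variable names are hypothetical.  */
+#if 0
+  {
+    mp_size_t itch = mpn_sec_powm_itch (bn, enb, n);
+    mp_ptr scratch;
+    TMP_DECL;
+    TMP_MARK;
+    scratch = TMP_ALLOC_LIMBS (itch);
+    mpn_sec_powm (rp, bp, bn, ep, enb, mp, n, scratch);	/* rp = bp^ep mod mp */
+    TMP_FREE;
+  }
+#endif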
+
+mp_size_t
+mpn_sec_powm_itch (mp_size_t bn, mp_bitcnt_t enb, mp_size_t n)
+{
+  int windowsize;
+  mp_size_t redcify_itch, itch;
+
+  /* FIXME: no more _local/_basecase difference. */
+  /* The top scratch usage will either be when reducing B in the 2nd redcify
+     call, or more typically n*2^windowsize + 3n or 4n, in the main loop.  (It
+     is 3n or 4n depending on whether we use mpn_local_sqr or a native
+     mpn_sqr_basecase.  We assume 4n always for now.) */
+
+  windowsize = win_size (enb);
+
+  /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call,
+     the (bn + n) term is due to redcify's own usage, and the rest is due to
+     mpn_sec_div_r's usage when called from redcify.  */
+  redcify_itch = (2 * n) + (bn + n) + ((bn + n) + 2 * n + 2);
+
+  /* The n * 2^windowsize term is due to the power table, the 4n term is due to
+     scratch needs of squaring/multiplication in the exponentiation loop.  */
+  itch = (n << windowsize) + (4 * n);
+
+  return MAX (itch, redcify_itch);
+}
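+
+/* Worked example (64-bit limbs, no nails): for n = bn = 16 and a 1024-bit
+   exponent, win_size gives windowsize = 4, so
+     itch         = (16 << 4) + 4*16 = 320 limbs, and
+     redcify_itch = 2*16 + (16+16) + ((16+16) + 2*16 + 2) = 130 limbs,
+   hence mpn_sec_powm_itch returns 320.  */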
diff --git a/third_party/gmp/mpn/generic/sec_sqr.c b/third_party/gmp/mpn/generic/sec_sqr.c
new file mode 100644
index 0000000..83fc7d9
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_sqr.c
@@ -0,0 +1,76 @@
+/* mpn_sec_sqr.
+
+   Contributed to the GNU project by Torbjörn Granlund.
+
+Copyright 2013, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if ! HAVE_NATIVE_mpn_sqr_basecase
+/* The limit of the generic code is SQR_TOOM2_THRESHOLD.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+
+#if HAVE_NATIVE_mpn_sqr_basecase
+#ifdef TUNE_SQR_TOOM2_MAX
+/* We slightly abuse TUNE_SQR_TOOM2_MAX here.  If it is set for an assembly
+   mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly
+   file.  An assembly mpn_sqr_basecase that does not define it should allow
+   any size.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+#endif
+
+#ifdef WANT_FAT_BINARY
+/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from
+   __gmpn_cpuvec.  Perhaps every possible sqr_basecase.asm allows any size, and
+   we limit its use unnecessarily.  We cannot tell, so play it safe.  FIXME.  */
+#define SQR_BASECASE_LIM  SQR_TOOM2_THRESHOLD
+#endif
+
+void
+mpn_sec_sqr (mp_ptr rp,
+	     mp_srcptr ap, mp_size_t an,
+	     mp_ptr tp)
+{
+#ifndef SQR_BASECASE_LIM
+/* If SQR_BASECASE_LIM was not defined above, mpn_sqr_basecase is safe for any
+   operand size.  */
+  mpn_sqr_basecase (rp, ap, an);
+#else
+/* Else use mpn_mul_basecase.  */
+  mpn_mul_basecase (rp, ap, an, ap, an);
+#endif
+}
+
+mp_size_t
+mpn_sec_sqr_itch (mp_size_t an)
+{
+  return 0;
+}
diff --git a/third_party/gmp/mpn/generic/sec_tabselect.c b/third_party/gmp/mpn/generic/sec_tabselect.c
new file mode 100644
index 0000000..5767e27
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sec_tabselect.c
@@ -0,0 +1,54 @@
+/* mpn_sec_tabselect.
+
+Copyright 2007-2009, 2011, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Select entry `which' from table `tab', which has nents entries, each `n'
+   limbs.  Store the selected entry at rp.  Reads the entire table, to avoid
+   leaking side-channel information about `which'.  O(n*nents).  */
+void
+mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *tab,
+		   mp_size_t n, mp_size_t nents, mp_size_t which)
+{
+  mp_size_t k, i;
+  mp_limb_t mask;
+  volatile const mp_limb_t *tp;
+
+  for (k = 0; k < nents; k++)
+    {
+      mask = -(mp_limb_t) (which == k);
+      tp = tab + n * k;
+      for (i = 0; i < n; i++)
+	{
+	  rp[i] = (rp[i] & ~mask) | (tp[i] & mask);
+	}
+    }
+}
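+
+/* Worked example: for k == which the mask is -(mp_limb_t) 1, i.e. all ones,
+   so rp[i] takes tp[i]; for every other k the mask is 0 and rp[i] is left
+   unchanged.  Every entry is read and the same instructions execute for any
+   `which', so the memory and branch pattern reveals nothing about it.  */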
diff --git a/third_party/gmp/mpn/generic/set_str.c b/third_party/gmp/mpn/generic/set_str.c
new file mode 100644
index 0000000..2bd584c
--- /dev/null
+++ b/third_party/gmp/mpn/generic/set_str.c
@@ -0,0 +1,290 @@
+/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base) --
+   Convert a STR_LEN long base BASE byte string pointed to by STR to a limb
+   vector pointed to by RES_PTR.  Return the number of limbs in RES_PTR.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE, EXCEPT mpn_set_str, ARE INTERNAL WITH MUTABLE
+   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
+   FUTURE GNU MP RELEASE.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* TODO:
+
+      Perhaps do not compute the highest power?
+      Instead, multiply twice by the 2nd highest power:
+
+	       _______
+	      |_______|  hp
+	      |_______|  pow
+       _______________
+      |_______________|  final result
+
+
+	       _______
+	      |_______|  hp
+		  |___|  pow[-1]
+	   ___________
+	  |___________|  intermediate result
+		  |___|  pow[-1]
+       _______________
+      |_______________|  final result
+
+      Generalizing that idea, perhaps we should make powtab contain successive
+      cubes, not squares.
+*/
+
+#include "gmp-impl.h"
+
+mp_size_t
+mpn_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base)
+{
+  if (POW2_P (base))
+    {
+      /* The base is a power of 2.  Read the input string from least to most
+	 significant character/digit.  */
+
+      const unsigned char *s;
+      int next_bitpos;
+      mp_limb_t res_digit;
+      mp_size_t size;
+      int bits_per_indigit = mp_bases[base].big_base;
+
+      size = 0;
+      res_digit = 0;
+      next_bitpos = 0;
+
+      for (s = str + str_len - 1; s >= str; s--)
+	{
+	  int inp_digit = *s;
+
+	  res_digit |= ((mp_limb_t) inp_digit << next_bitpos) & GMP_NUMB_MASK;
+	  next_bitpos += bits_per_indigit;
+	  if (next_bitpos >= GMP_NUMB_BITS)
+	    {
+	      rp[size++] = res_digit;
+	      next_bitpos -= GMP_NUMB_BITS;
+	      res_digit = inp_digit >> (bits_per_indigit - next_bitpos);
+	    }
+	}
+
+      if (res_digit != 0)
+	rp[size++] = res_digit;
+      return size;
+    }
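+
+  /* Worked example for the power-of-2 branch: base = 16 gives
+     bits_per_indigit = 4; the byte string {1, 15} (digit values, not ASCII,
+     i.e. "1f" in hex) is scanned from the end, packing 0xf at bit 0 and 0x1
+     at bit 4, so rp[0] = 0x1f and the returned size is 1.  */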
+
+  if (BELOW_THRESHOLD (str_len, SET_STR_PRECOMPUTE_THRESHOLD))
+    return mpn_bc_set_str (rp, str, str_len, base);
+  else
+    {
+      mp_ptr powtab_mem, tp;
+      powers_t powtab[GMP_LIMB_BITS];
+      int chars_per_limb;
+      mp_size_t size;
+      mp_size_t un;
+      TMP_DECL;
+
+      TMP_MARK;
+
+      chars_per_limb = mp_bases[base].chars_per_limb;
+
+      un = str_len / chars_per_limb + 1; /* FIXME: scalar integer division */
+
+      /* Allocate one large block for the powers of big_base.  */
+      powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un));
+
+      size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base);
+      powers_t *pt = powtab + n_pows;
+
+      tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
+      size = mpn_dc_set_str (rp, str, str_len, pt, tp);
+
+      TMP_FREE;
+      return size;
+    }
+}
+
+mp_size_t
+mpn_dc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len,
+		const powers_t *powtab, mp_ptr tp)
+{
+  size_t len_lo, len_hi;
+  mp_limb_t cy;
+  mp_size_t ln, hn, n, sn;
+
+  len_lo = powtab->digits_in_base;
+
+  if (str_len <= len_lo)
+    {
+      if (BELOW_THRESHOLD (str_len, SET_STR_DC_THRESHOLD))
+	return mpn_bc_set_str (rp, str, str_len, powtab->base);
+      else
+	return mpn_dc_set_str (rp, str, str_len, powtab - 1, tp);
+    }
+
+  len_hi = str_len - len_lo;
+  ASSERT (len_lo >= len_hi);
+
+  if (BELOW_THRESHOLD (len_hi, SET_STR_DC_THRESHOLD))
+    hn = mpn_bc_set_str (tp, str, len_hi, powtab->base);
+  else
+    hn = mpn_dc_set_str (tp, str, len_hi, powtab - 1, rp);
+
+  sn = powtab->shift;
+
+  if (hn == 0)
+    {
+      /* Zero +1 limb here, to avoid reading an allocated but uninitialised
+	 limb in mpn_incr_u below.  */
+      MPN_ZERO (rp, powtab->n + sn + 1);
+    }
+  else
+    {
+      if (powtab->n > hn)
+	mpn_mul (rp + sn, powtab->p, powtab->n, tp, hn);
+      else
+	mpn_mul (rp + sn, tp, hn, powtab->p, powtab->n);
+      MPN_ZERO (rp, sn);
+    }
+
+  str = str + str_len - len_lo;
+  if (BELOW_THRESHOLD (len_lo, SET_STR_DC_THRESHOLD))
+    ln = mpn_bc_set_str (tp, str, len_lo, powtab->base);
+  else
+    ln = mpn_dc_set_str (tp, str, len_lo, powtab - 1, tp + powtab->n + sn + 1);
+
+  if (ln != 0)
+    {
+      cy = mpn_add_n (rp, rp, tp, ln);
+      mpn_incr_u (rp + ln, cy);
+    }
+  n = hn + powtab->n + sn;
+  return n - (rp[n - 1] == 0);
+}
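+
+/* A rough picture of the recursion above: the string is split so that the low
+   part has exactly powtab->digits_in_base digits; the high part is converted
+   recursively into {tp,hn}, multiplied by the precomputed power
+   {powtab->p, powtab->n} (shifted by powtab->shift limbs), and the converted
+   low part is then added in.  */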
+
+mp_size_t
+mpn_bc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base)
+{
+  mp_size_t size;
+  size_t i;
+  long j;
+  mp_limb_t cy_limb;
+
+  mp_limb_t big_base;
+  int chars_per_limb;
+  mp_limb_t res_digit;
+
+  ASSERT (base >= 2);
+  ASSERT (base < numberof (mp_bases));
+  ASSERT (str_len >= 1);
+
+  big_base = mp_bases[base].big_base;
+  chars_per_limb = mp_bases[base].chars_per_limb;
+
+  size = 0;
+  for (i = chars_per_limb; i < str_len; i += chars_per_limb)
+    {
+      res_digit = *str++;
+      if (base == 10)
+	{ /* This is a common case.
+	     Help the compiler to avoid multiplication.  */
+	  for (j = MP_BASES_CHARS_PER_LIMB_10 - 1; j != 0; j--)
+	    res_digit = res_digit * 10 + *str++;
+	}
+      else
+	{
+	  for (j = chars_per_limb - 1; j != 0; j--)
+	    res_digit = res_digit * base + *str++;
+	}
+
+      if (size == 0)
+	{
+	  if (res_digit != 0)
+	    {
+	      rp[0] = res_digit;
+	      size = 1;
+	    }
+	}
+      else
+	{
+#if HAVE_NATIVE_mpn_mul_1c
+	  cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit);
+#else
+	  cy_limb = mpn_mul_1 (rp, rp, size, big_base);
+	  cy_limb += mpn_add_1 (rp, rp, size, res_digit);
+#endif
+	  if (cy_limb != 0)
+	    rp[size++] = cy_limb;
+	}
+    }
+
+  big_base = base;
+  res_digit = *str++;
+  if (base == 10)
+    { /* This is a common case.
+	 Help the compiler to avoid multiplication.  */
+      for (j = str_len - (i - MP_BASES_CHARS_PER_LIMB_10) - 1; j > 0; j--)
+	{
+	  res_digit = res_digit * 10 + *str++;
+	  big_base *= 10;
+	}
+    }
+  else
+    {
+      for (j = str_len - (i - chars_per_limb) - 1; j > 0; j--)
+	{
+	  res_digit = res_digit * base + *str++;
+	  big_base *= base;
+	}
+    }
+
+  if (size == 0)
+    {
+      if (res_digit != 0)
+	{
+	  rp[0] = res_digit;
+	  size = 1;
+	}
+    }
+  else
+    {
+#if HAVE_NATIVE_mpn_mul_1c
+      cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit);
+#else
+      cy_limb = mpn_mul_1 (rp, rp, size, big_base);
+      cy_limb += mpn_add_1 (rp, rp, size, res_digit);
+#endif
+      if (cy_limb != 0)
+	rp[size++] = cy_limb;
+    }
+  return size;
+}
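+
+/* Worked example (64-bit limbs): for base 10, chars_per_limb = 19 and
+   big_base = 10^19.  A 25-digit string is consumed as one group of 19 digits
+   inside the loop, then the trailing 6 digits in the tail code, where
+   big_base is rebuilt as 10^6 before the final mul_1/add_1 step.  */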
diff --git a/third_party/gmp/mpn/generic/sizeinbase.c b/third_party/gmp/mpn/generic/sizeinbase.c
new file mode 100644
index 0000000..faee947
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sizeinbase.c
@@ -0,0 +1,49 @@
+/* mpn_sizeinbase -- approximation to chars required for an mpn.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 1991, 1993-1995, 2001, 2002, 2011, 2012 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Same as mpz_sizeinbase, meaning exact for power-of-2 bases, and either
+   exact or 1 too big for other bases.  */
+
+size_t
+mpn_sizeinbase (mp_srcptr xp, mp_size_t xsize, int base)
+{
+  size_t  result;
+  MPN_SIZEINBASE (result, xp, xsize, base);
+  return result;
+}
diff --git a/third_party/gmp/mpn/generic/sqr.c b/third_party/gmp/mpn/generic/sqr.c
new file mode 100644
index 0000000..74fbff0
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sqr.c
@@ -0,0 +1,98 @@
+/* mpn_sqr -- square natural numbers.
+
+Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+void
+mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n));
+
+  if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD))
+    { /* mul_basecase is sometimes faster than sqr_basecase on small sizes */
+      mpn_mul_basecase (p, a, n, a, n);
+    }
+  else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))
+    {
+      mpn_sqr_basecase (p, a, n);
+    }
+  else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))
+    {
+      /* Allocate workspace of fixed size on stack: fast! */
+      mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)];
+      ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT);
+      mpn_toom2_sqr (p, a, n, ws);
+    }
+  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_SDECL;
+      TMP_SMARK;
+      ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n));
+      mpn_toom3_sqr (p, a, n, ws);
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_SDECL;
+      TMP_SMARK;
+      ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n));
+      mpn_toom4_sqr (p, a, n, ws);
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_SDECL;
+      TMP_SMARK;
+      ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n));
+      mpn_toom6_sqr (p, a, n, ws);
+      TMP_SFREE;
+    }
+  else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD))
+    {
+      mp_ptr ws;
+      TMP_DECL;
+      TMP_MARK;
+      ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n));
+      mpn_toom8_sqr (p, a, n, ws);
+      TMP_FREE;
+    }
+  else
+    {
+      /* The current FFT code allocates its own space.  That should probably
+	 change.  */
+      mpn_fft_mul (p, a, n, a, n);
+    }
+}
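+
+/* Example of the dispatch above, assuming the (platform-tuned, hypothetical)
+   values SQR_BASECASE_THRESHOLD = 0, SQR_TOOM2_THRESHOLD = 30 and
+   SQR_TOOM3_THRESHOLD = 100: n = 20 goes to mpn_sqr_basecase, n = 50 to
+   mpn_toom2_sqr, and n = 120 to mpn_toom3_sqr.  */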
diff --git a/third_party/gmp/mpn/generic/sqr_basecase.c b/third_party/gmp/mpn/generic/sqr_basecase.c
new file mode 100644
index 0000000..2645bad
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sqr_basecase.c
@@ -0,0 +1,361 @@
+/* mpn_sqr_basecase -- Internal routine to square a natural number
+   of length n.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2017 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if HAVE_NATIVE_mpn_sqr_diagonal
+#define MPN_SQR_DIAGONAL(rp, up, n)					\
+  mpn_sqr_diagonal (rp, up, n)
+#else
+#define MPN_SQR_DIAGONAL(rp, up, n)					\
+  do {									\
+    mp_size_t _i;							\
+    for (_i = 0; _i < (n); _i++)					\
+      {									\
+	mp_limb_t ul, lpl;						\
+	ul = (up)[_i];							\
+	umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS);	\
+	(rp)[2 * _i] = lpl >> GMP_NAIL_BITS;				\
+      }									\
+  } while (0)
+#endif
+
+#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
+  mpn_sqr_diag_addlsh1 (rp, tp, up, n)
+#else
+#if HAVE_NATIVE_mpn_addlsh1_n
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    mp_limb_t cy;							\
+    MPN_SQR_DIAGONAL (rp, up, n);					\
+    cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);			\
+    rp[2 * n - 1] += cy;						\
+  } while (0)
+#else
+#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    mp_limb_t cy;							\
+    MPN_SQR_DIAGONAL (rp, up, n);					\
+    cy = mpn_lshift (tp, tp, 2 * n - 2, 1);				\
+    cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);			\
+    rp[2 * n - 1] += cy;						\
+  } while (0)
+#endif
+#endif
+
+
+#undef READY_WITH_mpn_sqr_basecase
+
+
+#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2s
+void
+mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_size_t i;
+  mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
+  mp_ptr tp = tarr;
+  mp_limb_t cy;
+
+  /* must fit 2*n limbs in tarr */
+  ASSERT (n <= SQR_TOOM2_THRESHOLD);
+
+  if ((n & 1) != 0)
+    {
+      if (n == 1)
+	{
+	  mp_limb_t ul, lpl;
+	  ul = up[0];
+	  umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
+	  rp[0] = lpl >> GMP_NAIL_BITS;
+	  return;
+	}
+
+      MPN_ZERO (tp, n);
+
+      for (i = 0; i <= n - 2; i += 2)
+	{
+	  cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
+	  tp[n + i] = cy;
+	}
+    }
+  else
+    {
+      if (n == 2)
+	{
+#if HAVE_NATIVE_mpn_mul_2
+	  rp[3] = mpn_mul_2 (rp, up, 2, up);
+#else
+	  rp[0] = 0;
+	  rp[1] = 0;
+	  rp[3] = mpn_addmul_2 (rp, up, 2, up);
+#endif
+	  return;
+	}
+
+      MPN_ZERO (tp, n);
+
+      for (i = 0; i <= n - 4; i += 2)
+	{
+	  cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
+	  tp[n + i] = cy;
+	}
+      cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]);
+      tp[2 * n - 3] = cy;
+    }
+
+  MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
+}
+#define READY_WITH_mpn_sqr_basecase
+#endif
+
+
+#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2
+
+/* mpn_sqr_basecase using plain mpn_addmul_2.
+
+   This is tricky, since we have to let mpn_addmul_2 make some undesirable
+   multiplies, u[k]*u[k], that we would like to let mpn_sqr_diagonal handle.
+   This forces us to conditionally add or subtract the mpn_sqr_diagonal
+   results.  Examples of the product we form:
+
+   n = 4              n = 5		n = 6
+   u1u0 * u3u2u1      u1u0 * u4u3u2u1	u1u0 * u5u4u3u2u1
+   u2 * u3	      u3u2 * u4u3	u3u2 * u5u4u3
+					u4 * u5
+   add: u0 u2 u3      add: u0 u2 u4	add: u0 u2 u4 u5
+   sub: u1	      sub: u1 u3	sub: u1 u3
+*/
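+
+/* In other words: MPN_SQR_DIAGONAL first deposits every u[k]^2 at
+   rp[2k..2k+1]; the fixup loops below then negate the pairs rp[4j+2..4j+3]
+   (the squares of odd-indexed limbs, the "sub" rows in the table above),
+   propagating the borrow, before tp is doubled and added in.  */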
+
+void
+mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_size_t i;
+  mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
+  mp_ptr tp = tarr;
+  mp_limb_t cy;
+
+  /* must fit 2*n limbs in tarr */
+  ASSERT (n <= SQR_TOOM2_THRESHOLD);
+
+  if ((n & 1) != 0)
+    {
+      mp_limb_t x0, x1;
+
+      if (n == 1)
+	{
+	  mp_limb_t ul, lpl;
+	  ul = up[0];
+	  umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
+	  rp[0] = lpl >> GMP_NAIL_BITS;
+	  return;
+	}
+
+      /* The code below doesn't like unnormalized operands.  Since such
+	 operands are unusual, handle them with a dumb recursion.  */
+      if (up[n - 1] == 0)
+	{
+	  rp[2 * n - 2] = 0;
+	  rp[2 * n - 1] = 0;
+	  mpn_sqr_basecase (rp, up, n - 1);
+	  return;
+	}
+
+      MPN_ZERO (tp, n);
+
+      for (i = 0; i <= n - 2; i += 2)
+	{
+	  cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
+	  tp[n + i] = cy;
+	}
+
+      MPN_SQR_DIAGONAL (rp, up, n);
+
+      for (i = 2;; i += 4)
+	{
+	  x0 = rp[i + 0];
+	  rp[i + 0] = (-x0) & GMP_NUMB_MASK;
+	  x1 = rp[i + 1];
+	  rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK;
+	  __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0);
+	  if (i + 4 >= 2 * n)
+	    break;
+	  mpn_incr_u (rp + i + 4, cy);
+	}
+    }
+  else
+    {
+      mp_limb_t x0, x1;
+
+      if (n == 2)
+	{
+#if HAVE_NATIVE_mpn_mul_2
+	  rp[3] = mpn_mul_2 (rp, up, 2, up);
+#else
+	  rp[0] = 0;
+	  rp[1] = 0;
+	  rp[3] = mpn_addmul_2 (rp, up, 2, up);
+#endif
+	  return;
+	}
+
+      /* The code below doesn't like unnormalized operands.  Since such
+	 operands are unusual, handle them with a dumb recursion.  */
+      if (up[n - 1] == 0)
+	{
+	  rp[2 * n - 2] = 0;
+	  rp[2 * n - 1] = 0;
+	  mpn_sqr_basecase (rp, up, n - 1);
+	  return;
+	}
+
+      MPN_ZERO (tp, n);
+
+      for (i = 0; i <= n - 4; i += 2)
+	{
+	  cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
+	  tp[n + i] = cy;
+	}
+      cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]);
+      tp[2 * n - 3] = cy;
+
+      MPN_SQR_DIAGONAL (rp, up, n);
+
+      for (i = 2;; i += 4)
+	{
+	  x0 = rp[i + 0];
+	  rp[i + 0] = (-x0) & GMP_NUMB_MASK;
+	  x1 = rp[i + 1];
+	  rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK;
+	  if (i + 6 >= 2 * n)
+	    break;
+	  __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0);
+	  mpn_incr_u (rp + i + 4, cy);
+	}
+      mpn_decr_u (rp + i + 2, (x1 | x0) != 0);
+    }
+
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
+#else
+  cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+  cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
+#endif
+  rp[2 * n - 1] += cy;
+}
+#define READY_WITH_mpn_sqr_basecase
+#endif
+
+
+#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_sqr_diag_addlsh1
+
+/* mpn_sqr_basecase using mpn_addmul_1 and mpn_sqr_diag_addlsh1, avoiding stack
+   allocation.  */
+void
+mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  if (n == 1)
+    {
+      mp_limb_t ul, lpl;
+      ul = up[0];
+      umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
+      rp[0] = lpl >> GMP_NAIL_BITS;
+    }
+  else
+    {
+      mp_size_t i;
+      mp_ptr xp;
+
+      rp += 1;
+      rp[n - 1] = mpn_mul_1 (rp, up + 1, n - 1, up[0]);
+      for (i = n - 2; i != 0; i--)
+	{
+	  up += 1;
+	  rp += 2;
+	  rp[i] = mpn_addmul_1 (rp, up + 1, i, up[0]);
+	}
+
+      xp = rp - 2 * n + 3;
+      mpn_sqr_diag_addlsh1 (xp, xp + 1, up - n + 2, n);
+    }
+}
+#define READY_WITH_mpn_sqr_basecase
+#endif
+
+
+#if ! defined (READY_WITH_mpn_sqr_basecase)
+
+/* Default mpn_sqr_basecase using mpn_addmul_1.  */
+void
+mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_size_t i;
+
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));
+
+  if (n == 1)
+    {
+      mp_limb_t ul, lpl;
+      ul = up[0];
+      umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
+      rp[0] = lpl >> GMP_NAIL_BITS;
+    }
+  else
+    {
+      mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
+      mp_ptr tp = tarr;
+      mp_limb_t cy;
+
+      /* must fit 2*n limbs in tarr */
+      ASSERT (n <= SQR_TOOM2_THRESHOLD);
+
+      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+      tp[n - 1] = cy;
+      for (i = 2; i < n; i++)
+	{
+	  mp_limb_t cy;
+	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
+	  tp[n + i - 2] = cy;
+	}
+
+      MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
+    }
+}
+#define READY_WITH_mpn_sqr_basecase
+#endif
diff --git a/third_party/gmp/mpn/generic/sqrlo.c b/third_party/gmp/mpn/generic/sqrlo.c
new file mode 100644
index 0000000..71530b6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sqrlo.c
@@ -0,0 +1,239 @@
+/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
+   of the result.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
+   FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
+   THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_range_basecase 1
+#define MAYBE_range_toom22   1
+#else
+#define MAYBE_range_basecase                                           \
+  ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
+#define MAYBE_range_toom22                                             \
+  ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
+#endif
+
+/* THINK: The DC strategy uses different constants in the different Toom
+   ranges.  Something smoother?  */
+
+/*
+  Compute the least significant half of the square of {xp,n}, or
+  formally {rp,n} = {xp,n}^2 Mod (B^n).
+
+  Above the given threshold, the Divide and Conquer strategy is used.
+  The operand is split in two, and a full square plus a mullo
+  is used to obtain the final result. The more natural strategy is to
+  split in two halves, but this is far from optimal when a
+  sub-quadratic multiplication is used.
+
+  Mulders suggests an unbalanced split in favour of the full product,
+  split n = n1 + n2, where n1 = a*n <= n2 = (1-a)*n; i.e. 0 < a <= 1/2.
+
+  To compute the value of a, we assume that the cost of mullo for a
+  given size, ML(n), is a fraction k of the cost of a full product of
+  the same size, M(n), and that M(n)=n^e for some exponent 1 < e <= 2;
+  then we can write:
+
+  ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
+
+  Given a value for e, we want to minimise the value of k, i.e. the
+  function k=(1-a)^e/(1-2*a^e).
+
+  With e=2, the exponent for schoolbook multiplication, the minimum is
+  given by the values a=1-a=1/2.
+
+  With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
+  Mulders computes (1-a) = 0.694... and we approximate a with 11/36.
+
+  Other possible approximations follow:
+  e=log(5)/log(3) [Toom-3] -> a ~= 9/40
+  e=log(7)/log(4) [Toom-4] -> a ~= 7/39
+  e=log(11)/log(6) [Toom-6] -> a ~= 1/8
+  e=log(15)/log(8) [Toom-8] -> a ~= 1/10
+
+  The values above were obtained with the following trivial commands
+  in the gp-pari shell:
+
+fun(e,a)=(1-a)^e/(1-2*a^e)
+mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
+contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
+contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
+contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
+contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
+contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
+
+  ,
+  |\
+  | \
+  +----,
+  |    |
+  |    |
+  |    |\
+  |    | \
+  +----+--`
+  ^ n2 ^n1^
+
+  For an actual implementation the assumption that M(n)=n^e is
+  incorrect; as a consequence, the assumption that ML(n)=k*M(n)
+  with a constant k is wrong as well.
+
+  But theory suggests two things:
+  - the better the multiplication algorithm is (the lower e), the closer
+    k approaches 1, and a approaches 0.
+
+  - A value for a smaller than optimal is probably less bad than a
+    bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
+    value, and k(a)=0.808_ the mul/mullo speed ratio. We get
+    k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
+*/
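+
+/* Worked example of the split performed below: for an n in the toom22 range,
+   say n = 120, a is approximated by 11/36, so n1 = 120*11/36 = 36 and
+   n2 = 84; one full 84-limb square plus one 36-limb low product x1*x0
+   (doubled via the addlsh1) replace a 120-limb mullo.  */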
+
+static mp_size_t
+mpn_sqrlo_itch (mp_size_t n)
+{
+  return 2*n;
+}
+
+/*
+    mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
+    It accepts tp == rp.
+*/
+static void
+mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
+{
+  mp_size_t n2, n1;
+  ASSERT (n >= 2);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+  ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
+
+  /* Divide-and-conquer */
+
+  /* We need fractional approximation of the value 0 < a <= 1/2
+     giving the minimum in the function k=(1-a)^e/(1-2*a^e).
+  */
+  if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
+    n1 = n >> 1;
+  else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
+    n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
+  else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
+    n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
+  else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
+    n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
+  /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
+  else
+    n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
+
+  n2 = n - n1;
+
+  /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
+
+  /* x0 ^ 2 */
+  mpn_sqr (tp, xp, n2);
+  MPN_COPY (rp, tp, n2);
+
+  /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
+  if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
+    mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
+  else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
+    mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
+  else
+    mpn_mullo_n (tp + n, xp + n2, xp, n1);
+  /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
+#else
+  mpn_lshift (rp + n2, tp + n, n1, 1);
+  mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
+#endif
+}
+
+/* Avoid zero allocations when SQRLO_BASECASE_THRESHOLD_LIMIT is 0.  */
+#define SQR_BASECASE_ALLOC \
+ (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
+
+/* FIXME: This function should accept a temporary area; dc_sqrlo
+   accepts a pointer tp, and handle the case tp == rp, do the same here.
+*/
+
+void
+mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
+{
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
+
+  if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
+    {
+      /* FIXME: smarter criteria? */
+#if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
+      /* mullo computes as many products as sqr, but writes directly
+	 to the result area. */
+      mpn_mullo_basecase (rp, xp, xp, n);
+#else
+      /* Allocate workspace of fixed size on stack: fast! */
+      mp_limb_t tp[SQR_BASECASE_ALLOC];
+      mpn_sqr_basecase (tp, xp, n);
+      MPN_COPY (rp, tp, n);
+#endif
+    }
+  else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
+    {
+      mpn_sqrlo_basecase (rp, xp, n);
+    }
+  else
+    {
+      mp_ptr tp;
+      TMP_DECL;
+      TMP_MARK;
+      tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
+      if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
+	{
+	  mpn_dc_sqrlo (rp, xp, n, tp);
+	}
+      else
+	{
+	  /* For really large operands, compute a full square but throw away
+	     the upper n limbs of the result.  */
+#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
+	  mpn_fft_mul (tp, xp, n, xp, n);
+#else
+	  mpn_sqr (tp, xp, n);
+#endif
+	  MPN_COPY (rp, tp, n);
+	}
+      TMP_FREE;
+    }
+}
diff --git a/third_party/gmp/mpn/generic/sqrlo_basecase.c b/third_party/gmp/mpn/generic/sqrlo_basecase.c
new file mode 100644
index 0000000..3148609
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sqrlo_basecase.c
@@ -0,0 +1,194 @@
+/* mpn_sqrlo_basecase -- Internal routine to square a natural number
+   of length n.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+
+Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2015,
+2016 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#ifndef SQRLO_SHORTCUT_MULTIPLICATIONS
+#if HAVE_NATIVE_mpn_addmul_1
+#define SQRLO_SHORTCUT_MULTIPLICATIONS 0
+#else
+#define SQRLO_SHORTCUT_MULTIPLICATIONS 1
+#endif
+#endif
+
+#if HAVE_NATIVE_mpn_sqr_diagonal
+#define MPN_SQR_DIAGONAL(rp, up, n)					\
+  mpn_sqr_diagonal (rp, up, n)
+#else
+#define MPN_SQR_DIAGONAL(rp, up, n)					\
+  do {									\
+    mp_size_t _i;							\
+    for (_i = 0; _i < (n); _i++)					\
+      {									\
+	mp_limb_t ul, lpl;						\
+	ul = (up)[_i];							\
+	umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS);	\
+	(rp)[2 * _i] = lpl >> GMP_NAIL_BITS;				\
+      }									\
+  } while (0)
+#endif
+
+#define MPN_SQRLO_DIAGONAL(rp, up, n)					\
+  do {									\
+    mp_size_t nhalf;							\
+    nhalf = (n) >> 1;							\
+    MPN_SQR_DIAGONAL ((rp), (up), nhalf);				\
+    if (((n) & 1) != 0)							\
+      {									\
+	mp_limb_t op;							\
+	op = (up)[nhalf];						\
+	(rp)[(n) - 1] = (op * op) & GMP_NUMB_MASK;			\
+      }									\
+  } while (0)
+
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    MPN_SQRLO_DIAGONAL((rp), (up), (n));				\
+    mpn_addlsh1_n_ip1 ((rp) + 1, (tp), (n) - 1);			\
+  } while (0)
+#else
+#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n)				\
+  do {									\
+    MPN_SQRLO_DIAGONAL((rp), (up), (n));				\
+    mpn_lshift ((tp), (tp), (n) - 1, 1);				\
+    mpn_add_n ((rp) + 1, (rp) + 1, (tp), (n) - 1);			\
+  } while (0)
+#endif
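+
+/* The identity being used: mod B^n,
+     u^2 = sum_i u_i^2 * B^(2i) + 2 * sum_{i<j} u_i*u_j * B^(i+j);
+   tp holds the truncated cross-product sum, which MPN_SQRLO_DIAG_ADDLSH1
+   doubles (a left shift by 1) and adds onto the diagonal terms.  */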
+
+/* Avoid zero allocations when SQRLO_LO_THRESHOLD is 0 (this code not used). */
+#define SQRLO_BASECASE_ALLOC						\
+  (SQRLO_DC_THRESHOLD_LIMIT < 2 ? 1 : SQRLO_DC_THRESHOLD_LIMIT - 1)
+
+/* Default mpn_sqrlo_basecase using mpn_addmul_1.  */
+#ifndef SQRLO_SPECIAL_CASES
+#define SQRLO_SPECIAL_CASES 2
+#endif
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_special_cases 1
+#else
+#define MAYBE_special_cases \
+  ((SQRLO_BASECASE_THRESHOLD <= SQRLO_SPECIAL_CASES) && (SQRLO_DC_THRESHOLD != 0))
+#endif
+
+void
+mpn_sqrlo_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+{
+  mp_limb_t ul;
+
+  ASSERT (n >= 1);
+  ASSERT (! MPN_OVERLAP_P (rp, n, up, n));
+
+  ul = up[0];
+
+  if (MAYBE_special_cases && n <= SQRLO_SPECIAL_CASES)
+    {
+#if SQRLO_SPECIAL_CASES == 1
+      rp[0] = (ul * ul) & GMP_NUMB_MASK;
+#else
+      if (n == 1)
+	rp[0] = (ul * ul) & GMP_NUMB_MASK;
+      else
+	{
+	  mp_limb_t hi, lo, ul1;
+	  umul_ppmm (hi, lo, ul, ul << GMP_NAIL_BITS);
+	  rp[0] = lo >> GMP_NAIL_BITS;
+	  ul1 = up[1];
+#if SQRLO_SPECIAL_CASES == 2
+	  rp[1] = (hi + ul * ul1 * 2) & GMP_NUMB_MASK;
+#else
+	  if (n == 2)
+	    rp[1] = (hi + ul * ul1 * 2) & GMP_NUMB_MASK;
+	  else
+	    {
+	      mp_limb_t hi1;
+#if GMP_NAIL_BITS != 0
+	      ul <<= 1;
+#endif
+	      umul_ppmm (hi1, lo, ul1 << GMP_NAIL_BITS, ul);
+	      hi1 += ul * up[2];
+#if GMP_NAIL_BITS == 0
+	      hi1 = (hi1 << 1) | (lo >> (GMP_LIMB_BITS - 1));
+	      add_ssaaaa(rp[2], rp[1], hi1, lo << 1, ul1 * ul1, hi);
+#else
+	      hi += lo >> GMP_NAIL_BITS;
+	      rp[1] = hi & GMP_NUMB_MASK;
+	      rp[2] = (hi1 + ul1 * ul1 + (hi >> GMP_NUMB_BITS)) & GMP_NUMB_MASK;
+#endif
+	    }
+#endif
+	}
+#endif
+    }
+  else
+    {
+      mp_limb_t tp[SQRLO_BASECASE_ALLOC];
+      mp_size_t i;
+
+      /* must fit n-1 limbs in tp */
+      ASSERT (n <= SQRLO_DC_THRESHOLD_LIMIT);
+
+      --n;
+#if SQRLO_SHORTCUT_MULTIPLICATIONS
+      {
+	mp_limb_t cy;
+
+	cy = ul * up[n] + mpn_mul_1 (tp, up + 1, n - 1, ul);
+	for (i = 1; 2 * i + 1 < n; ++i)
+	  {
+	    ul = up[i];
+	    cy += ul * up[n - i] + mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 2 * i - 1, ul);
+	  }
+	tp[n - 1] = (cy + ((n & 1) ? up[i] * up[i + 1] : 0)) & GMP_NUMB_MASK;
+      }
+#else
+      mpn_mul_1 (tp, up + 1, n, ul);
+      for (i = 1; 2 * i < n; ++i)
+	mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 2 * i, up[i]);
+#endif
+
+      MPN_SQRLO_DIAG_ADDLSH1 (rp, tp, up, n + 1);
+    }
+}
+#undef SQRLO_SPECIAL_CASES
+#undef MAYBE_special_cases
+#undef SQRLO_BASECASE_ALLOC
+#undef SQRLO_SHORTCUT_MULTIPLICATIONS
+#undef MPN_SQR_DIAGONAL
+#undef MPN_SQRLO_DIAGONAL
+#undef MPN_SQRLO_DIAG_ADDLSH1
diff --git a/third_party/gmp/mpn/generic/sqrmod_bnm1.c b/third_party/gmp/mpn/generic/sqrmod_bnm1.c
new file mode 100644
index 0000000..0a27d7b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sqrmod_bnm1.c
@@ -0,0 +1,312 @@
+/* sqrmod_bnm1.c -- squaring mod B^n-1.
+
+   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
+   Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+/* Input is {ap,rn}; output is {rp,rn}, computation is
+   mod B^rn - 1, and values are semi-normalised; zero is represented
+   as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
+   tp==rp is allowed. */
+static void
+mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
+{
+  mp_limb_t cy;
+
+  ASSERT (0 < rn);
+
+  mpn_sqr (tp, ap, rn);
+  cy = mpn_add_n (rp, tp, tp + rn, rn);
+  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
+   * be no overflow when adding in the carry. */
+  MPN_INCR_U (rp, rn, cy);
+}
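+
+/* Worked analogue in decimal (B = 10, rn = 1, computation mod 9, i.e.
+   "casting out nines"): 7^2 = 49; folding the high digit into the low one
+   gives 9 + 4 = 13, i.e. digit 3 with carry 1, and adding the carry back
+   (B = 10 is congruent to 1 mod 9) yields 4 = 49 mod 9.  */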
+
+
+/* Input is {ap,rn+1}; output is {rp,rn+1}, in
+   semi-normalised representation, computation is mod B^rn + 1. Needs
+   a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
+   Output is normalised. */
+static void
+mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp)
+{
+  mp_limb_t cy;
+
+  ASSERT (0 < rn);
+
+  mpn_sqr (tp, ap, rn + 1);
+  ASSERT (tp[2*rn+1] == 0);
+  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
+  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
+  rp[rn] = 0;
+  MPN_INCR_U (rp, rn+1, cy);
+}
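+
+/* Decimal analogue (B = 10, rn = 1, computation mod 11): B is congruent to
+   -1 mod 11, so here the high digit is subtracted instead of added:
+   7^2 = 49 gives 9 - 4 = 5 = 49 mod 11.  */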
+
+
+/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1)
+ *
+ * The result is expected to be ZERO if and only if the operand
+ * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
+ * B^rn-1.
+ * It should not be a problem if sqrmod_bnm1 is used to
+ * compute the full square with 2*an <= rn, because this condition
+ * implies (B^an-1)^2 < (B^rn-1).
+ *
+ * Requires rn/4 < an <= rn
+ * Scratch need: rn/2 + (need for recursive call OR rn + 3). This gives
+ *
+ * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4
+ */
+void
+mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp)
+{
+  ASSERT (0 < an);
+  ASSERT (an <= rn);
+
+  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD))
+    {
+      if (UNLIKELY (an < rn))
+	{
+	  if (UNLIKELY (2*an <= rn))
+	    {
+	      mpn_sqr (rp, ap, an);
+	    }
+	  else
+	    {
+	      mp_limb_t cy;
+	      mpn_sqr (tp, ap, an);
+	      cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn);
+	      MPN_INCR_U (rp, rn, cy);
+	    }
+	}
+      else
+	mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp);
+    }
+  else
+    {
+      mp_size_t n;
+      mp_limb_t cy;
+      mp_limb_t hi;
+
+      n = rn >> 1;
+
+      ASSERT (2*an > n);
+
+      /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1)
+	 and crt together as
+
+	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
+      */
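+
+      /* Sanity check of the formula: mod B^n - 1 (where B^n = 1) it reads
+	 -xp + 2 * (xp + xm)/2 = xm, and mod B^n + 1 (where B^n = -1) it
+	 reads xp + 0 = xp, so x matches both residues as CRT requires.  */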
+
+#define a0 ap
+#define a1 (ap + n)
+
+#define xp  tp	/* 2n + 2 */
+      /* am1  maybe in {xp, n} */
+#define sp1 (tp + 2*n + 2)
+      /* ap1  maybe in {sp1, n + 1} */
+
+      {
+	mp_srcptr am1;
+	mp_size_t anm;
+	mp_ptr so;
+
+	if (LIKELY (an > n))
+	  {
+	    so = xp + n;
+	    am1 = xp;
+	    cy = mpn_add (xp, a0, n, a1, an - n);
+	    MPN_INCR_U (xp, n, cy);
+	    anm = n;
+	  }
+	else
+	  {
+	    so = xp;
+	    am1 = a0;
+	    anm = an;
+	  }
+
+	mpn_sqrmod_bnm1 (rp, n, am1, anm, so);
+      }
+
+      {
+	int       k;
+	mp_srcptr ap1;
+	mp_size_t anp;
+
+	if (LIKELY (an > n)) {
+	  ap1 = sp1;
+	  cy = mpn_sub (sp1, a0, n, a1, an - n);
+	  sp1[n] = 0;
+	  MPN_INCR_U (sp1, n + 1, cy);
+	  anp = n + ap1[n];
+	} else {
+	  ap1 = a0;
+	  anp = an;
+	}
+
+	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
+	  k = 0;
+	else
+	  {
+	    int mask;
+	    k = mpn_fft_best_k (n, 1);
+	    mask = (1 << k) - 1;
+	    while (n & mask) { k--; mask >>= 1; }
+	  }
+	if (k >= FFT_FIRST_K)
+	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k);
+	else if (UNLIKELY (ap1 == a0))
+	  {
+	    ASSERT (anp <= n);
+	    ASSERT (2*anp > n);
+	    mpn_sqr (xp, a0, an);
+	    anp = 2*an - n;
+	    cy = mpn_sub (xp, xp, n, xp + n, anp);
+	    xp[n] = 0;
+	    MPN_INCR_U (xp, n+1, cy);
+	  }
+	else
+	  mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp);
+      }
+
+      /* Here the CRT recomposition begins.
+
+	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
+	 Division by 2 is a bitwise rotation.
+
+	 Assumes xp normalised mod (B^n+1).
+
+	 The residue class [0] is represented by [B^n-1]; except when
+	 both input are ZERO.
+      */
+
+#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
+#if HAVE_NATIVE_mpn_rsh1add_nc
+      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
+      hi = cy << (GMP_NUMB_BITS - 1);
+      cy = 0;
+      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
+	 overflows, i.e. a further increment will not overflow again. */
+#else /* ! _nc */
+      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
+      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+      cy >>= 1;
+      /* cy = 1 only if xp[n] = 1, i.e. {xp,n} = ZERO; this implies that
+	 the rsh1add was a simple rshift, so the top bit is 0 and cy=1 => hi=0. */
+#endif
+#if GMP_NAIL_BITS == 0
+      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], CNST_LIMB(0), hi);
+#else
+      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
+      rp[n-1] ^= hi;
+#endif
+#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
+#if HAVE_NATIVE_mpn_add_nc
+      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
+#else /* ! _nc */
+      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
+#endif
+      cy += (rp[0]&1);
+      mpn_rshift(rp, rp, n, 1);
+      ASSERT (cy <= 2);
+      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
+      cy >>= 1;
+      /* We can have cy != 0 only if hi = 0... */
+      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
+      rp[n-1] |= hi;
+      /* ... rp[n-1] + cy cannot overflow, so the following INCR is correct. */
+#endif
+      ASSERT (cy <= 1);
+      /* The next increment cannot overflow; see the previous comments about cy. */
+      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
+      MPN_INCR_U(rp, n, cy);
+
+      /* Compute the highest half:
+	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
+       */
+      if (UNLIKELY (2*an < rn))
+	{
+	  /* Note that in this case, the only way the result can equal
+	     zero mod B^{rn} - 1 is if the input is zero, and
+	     then the output of both the recursive calls and this CRT
+	     reconstruction is zero, not B^{rn} - 1. */
+	  cy = mpn_sub_n (rp + n, rp, xp, 2*an - n);
+
+	  /* FIXME: This subtraction of the high parts is not really
+	     necessary, we do it to get the carry out, and for sanity
+	     checking. */
+	  cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n,
+				   xp + 2*an - n, rn - 2*an, cy);
+	  ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an));
+	  cy = mpn_sub_1 (rp, rp, 2*an, cy);
+	  ASSERT (cy == (xp + 2*an - n)[0]);
+	}
+      else
+	{
+	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
+	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
+	     DECR will affect _at most_ the lowest n limbs. */
+	  MPN_DECR_U (rp, 2*n, cy);
+	}
+#undef a0
+#undef a1
+#undef xp
+#undef sp1
+    }
+}
+
+mp_size_t
+mpn_sqrmod_bnm1_next_size (mp_size_t n)
+{
+  mp_size_t nh;
+
+  if (BELOW_THRESHOLD (n,     SQRMOD_BNM1_THRESHOLD))
+    return n;
+  if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
+    return (n + (2-1)) & (-2);
+  if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1))
+    return (n + (4-1)) & (-4);
+
+  nh = (n + 1) >> 1;
+
+  if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD))
+    return (n + (8-1)) & (-8);
+
+  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1));
+}
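+
+/* Worked example, assuming the hypothetical value SQRMOD_BNM1_THRESHOLD = 16:
+   n = 50 is below 4*15 + 1 = 61, so it is rounded up to a multiple of 2 and
+   stays 50; n = 70 is below 8*15 + 1 = 121, so it is rounded up to a
+   multiple of 4, giving 72.  */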
diff --git a/third_party/gmp/mpn/generic/sqrtrem.c b/third_party/gmp/mpn/generic/sqrtrem.c
new file mode 100644
index 0000000..cc6dd9c
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sqrtrem.c
@@ -0,0 +1,555 @@
+/* mpn_sqrtrem -- square root and remainder
+
+   Contributed to the GNU project by Paul Zimmermann (most code),
+   Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
+
+   THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
+   INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
+   IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
+   FUTURE GMP RELEASE.
+
+Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+/* See "Karatsuba Square Root", reference in gmp.texi.  */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gmp-impl.h"
+#include "longlong.h"
+#define USE_DIVAPPR_Q 1
+#define TRACE(x)
+
+static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
+{
+  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
+  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
+  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
+  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
+  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
+  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
+  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
+  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
+  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
+  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
+  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
+  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
+  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
+  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
+  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
+  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
+  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
+  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
+  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
+  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
+  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
+  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
+  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
+  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
+  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
+  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
+  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
+  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
+  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
+  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
+  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
+  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
+  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
+  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
+  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
+  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
+  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
+  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
+  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
+  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
+  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
+  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
+  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
+  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
+  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
+  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
+  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
+  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
+};
+
+/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
+
+#if GMP_NUMB_BITS > 32
+#define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
+#else
+#define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
+#endif
+
+static mp_limb_t
+mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
+{
+#if GMP_NUMB_BITS > 32
+  mp_limb_t a1;
+#endif
+  mp_limb_t x0, t2, t, x2;
+  unsigned abits;
+
+  ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
+  ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
+  ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
+
+  /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
+     since we can do the former without division.  As part of the last
+     iteration convert from 1/sqrt(a) to sqrt(a).  */
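+  /* The underlying Newton step for f(x) = 1/x^2 - a is
+       x' = x * (3 - a*x^2) / 2,
+     which roughly doubles the number of correct bits per iteration; the
+     fixed-point code below evaluates scaled variants of this step.  */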
+
+  abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
+  x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */
+
+  /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
+
+#if GMP_NUMB_BITS > 32
+  a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
+  t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
+  x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
+
+  /* x0 is now a 16 bits approximation of 1/sqrt(a0) */
+
+  t2 = x0 * (a0 >> (32-8));
+  t = t2 >> 25;
+  t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
+  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
+  x0 >>= 32;
+#else
+  t2 = x0 * (a0 >> (16-8));
+  t = t2 >> 13;
+  t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
+  x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
+  x0 >>= 16;
+#endif
+
+  /* x0 is now a full limb approximation of sqrt(a0) */
+
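+  /* Final correction: (x0 + 1)^2 <= a0 is equivalent to
+     x0^2 + 2*x0 <= a0 - 1, which is exactly the test below.  */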
+  x2 = x0 * x0;
+  if (x2 + 2*x0 <= a0 - 1)
+    {
+      x2 += 2*x0 + 1;
+      x0++;
+    }
+
+  *rp = a0 - x2;
+  return x0;
+}
+
+
+#define Prec (GMP_NUMB_BITS >> 1)
+#if ! defined(SQRTREM2_INPLACE)
+#define SQRTREM2_INPLACE 0
+#endif
+
+/* Same as mpn_sqrtrem, but for size=2 and {np, 2} normalized;
+   returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0].  */
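+/* Equivalently, with B = 2^GMP_NUMB_BITS and N = np[1]*B + np[0]:
+   sp[0] = floor(sqrt(N)) and N - sp[0]^2 = cc*B + rp[0].  */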
+#if SQRTREM2_INPLACE
+#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
+static mp_limb_t
+mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
+{
+  mp_srcptr np = rp;
+#else
+#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
+static mp_limb_t
+mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
+{
+#endif
+  mp_limb_t q, u, np0, sp0, rp0, q2;
+  int cc;
+
+  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
+
+  np0 = np[0];
+  sp0 = mpn_sqrtrem1 (rp, np[1]);
+  rp0 = rp[0];
+  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
+  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
+  q = rp0 / sp0;
+  /* q <= 2^Prec; if q = 2^Prec, reduce the overestimate. */
+  q -= q >> Prec;
+  /* now we have q < 2^Prec */
+  u = rp0 - q * sp0;
+  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
+  sp0 = (sp0 << Prec) | q;
+  cc = u >> (Prec - 1);
+  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
+  /* subtract q * q from rp */
+  q2 = q * q;
+  cc -= rp0 < q2;
+  rp0 -= q2;
+  if (cc < 0)
+    {
+      rp0 += sp0;
+      cc += rp0 < sp0;
+      --sp0;
+      rp0 += sp0;
+      cc += rp0 < sp0;
+    }
+
+  rp[0] = rp0;
+  sp[0] = sp0;
+  return cc;
+}
+
+/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
+   and in {np, n} the low n limbs of the remainder, returns the high
+   limb of the remainder (which is 0 or 1).
+   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
+   where B=2^GMP_NUMB_BITS.
+   Needs a scratch of n/2+1 limbs. */
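+/* This follows the "Karatsuba Square Root" scheme cited above: with
+   b = B^l, split N = a3*b^3 + a2*b^2 + a1*b + a0 and compute
+     (s', r') = SqrtRem (a3*b + a2),
+     (q, u)   = DivRem (r'*b + a1, 2*s'),
+     s = s'*b + q,   r = u*b + a0 - q^2,
+   correcting with  r += 2*s - 1, s -= 1  whenever r turns out negative.  */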
+static mp_limb_t
+mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
+{
+  mp_limb_t q;			/* carry out of {sp, n} */
+  int c, b;			/* carry out of remainder */
+  mp_size_t l, h;
+
+  ASSERT (n > 1);
+  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
+
+  l = n / 2;
+  h = n - l;
+  if (h == 1)
+    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
+  else
+    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
+  if (q != 0)
+    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
+  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
+  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
+  q += scratch[l];
+  c = scratch[0] & 1;
+  mpn_rshift (sp, scratch, l, 1);
+  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
+  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
+    return 1; /* Remainder is non-zero */
+  q >>= 1;
+  if (c != 0)
+    c = mpn_add_n (np + l, np + l, sp + l, h);
+  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
+  mpn_sqr (np + n, sp, l);
+  b = q + mpn_sub_n (np, np, np + n, 2 * l);
+  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
+
+  if (c < 0)
+    {
+      q = mpn_add_1 (sp + l, sp + l, h, q);
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
+      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
+#else
+      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
+#endif
+      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
+      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
+    }
+
+  return c;
+}
+
+#if USE_DIVAPPR_Q
+static void
+mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
+{
+  gmp_pi1_t inv;
+  mp_limb_t qh;
+  ASSERT (dn > 2);
+  ASSERT (nn >= dn);
+  ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
+
+  MPN_COPY (scratch, np, nn);
+  invert_pi1 (inv, dp[dn-1], dp[dn-2]);
+  if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
+    qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
+  else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
+    qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
+  else
+    {
+      mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
+      TMP_DECL;
+      TMP_MARK;
+      /* Sadly, scratch is too small. */
+      qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
+      TMP_FREE;
+    }
+  qp [nn - dn] = qh;
+}
+#endif
+
+/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
+   and returns zero if the operand was a perfect square, one otherwise.
+   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
+   where B=2^GMP_NUMB_BITS.
+   THINK: In the odd case, three more (dummy) limbs are taken into account;
+   when nsh is maximal, two limbs are discarded from the result of the
+   division.  Too much?  Is a single dummy limb enough? */
+static int
+mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
+{
+  mp_limb_t q;			/* carry out of {sp, n} */
+  int c;			/* carry out of remainder */
+  mp_size_t l, h;
+  mp_ptr qp, tp, scratch;
+  TMP_DECL;
+  TMP_MARK;
+
+  ASSERT (np[2 * n - 1 - odd] != 0);
+  ASSERT (n > 4);
+  ASSERT (nsh < GMP_NUMB_BITS / 2);
+
+  l = (n - 1) / 2;
+  h = n - l;
+  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
+  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
+  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
+  if (nsh != 0)
+    {
+      /* o is used to exactly set the lowest bits of the dividend; is it needed? */
+      int o = l > (1 + odd);
+      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
+    }
+  else
+    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
+  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
+  if (q != 0)
+    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
+  qp = tp + n + 1; /* l + 2 */
+  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
+#if USE_DIVAPPR_Q
+  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
+#else
+  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
+#endif
+  q += qp [l + 1];
+  c = 1;
+  if (q > 1)
+    {
+      /* FIXME: if s!=0 we will shift later, a noop on this area. */
+      MPN_FILL (sp, l, GMP_NUMB_MAX);
+    }
+  else
+    {
+      /* FIXME: if s!=0 we will shift again later, shift just once. */
+      mpn_rshift (sp, qp + 1, l, 1);
+      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
+      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
+	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
+	{
+	  mp_limb_t cy;
+	  /* The approximation is not good enough: the extra limb (+ nsh bits)
+	     is smaller than needed to absorb the possible error. */
+	  /* {qp + 1, l + 1} equals 2*{sp, l} */
+	  /* FIXME: use mullo or wrap-around, or directly evaluate
+	     remainder with a single sqrmod_bnm1. */
+	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
+	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
+	  /* Compute the remainder of the previous mpn_div(appr)_q. */
+	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
+#if USE_DIVAPPR_Q || WANT_ASSERT
+	  MPN_DECR_U (tp + 1 + h, l, cy);
+#if USE_DIVAPPR_Q
+	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
+	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
+	    {
+	      /* May happen only if div result was not exact. */
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
+	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
+#else
+	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
+#endif
+	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
+	      MPN_DECR_U (sp, l, 1);
+	    }
+	  /* Can the root be exact when a correction was needed? We
+	     did not find an example, but it depends on divappr
+	     internals, and we cannot assume it holds in general... */
+	  /* else */
+#else /* WANT_ASSERT */
+	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
+#endif
+#endif
+	  if (mpn_zero_p (tp + l + 1, h - l))
+	    {
+	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
+	      mpn_sqr (scratch, sp, l);
+	      c = mpn_cmp (tp + 1, scratch + l, l);
+	      if (c == 0)
+		{
+		  if (nsh != 0)
+		    {
+		      mpn_lshift (tp, np, l, 2 * nsh);
+		      np = tp;
+		    }
+		  c = mpn_cmp (np, scratch + odd, l - odd);
+		}
+	      if (c < 0)
+		{
+		  MPN_DECR_U (sp, l, 1);
+		  c = 1;
+		}
+	    }
+	}
+    }
+  TMP_FREE;
+
+  if ((odd | nsh) != 0)
+    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
+  return c;
+}
+
+
+mp_size_t
+mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
+{
+  mp_limb_t cc, high, rl;
+  int c;
+  mp_size_t rn, tn;
+  TMP_DECL;
+
+  ASSERT (nn > 0);
+  ASSERT_MPN (np, nn);
+
+  ASSERT (np[nn - 1] != 0);
+  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
+  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
+  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
+
+  high = np[nn - 1];
+  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
+    c = 0;
+  else
+    {
+      count_leading_zeros (c, high);
+      c -= GMP_NAIL_BITS;
+
+      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
+    }
+  if (nn == 1) {
+    if (c == 0)
+      {
+	sp[0] = mpn_sqrtrem1 (&rl, high);
+	if (rp != NULL)
+	  rp[0] = rl;
+      }
+    else
+      {
+	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
+	sp[0] = cc;
+	if (rp != NULL)
+	  rp[0] = rl = high - cc*cc;
+      }
+    return rl != 0;
+  }
+  if (nn == 2) {
+    mp_limb_t tp [2];
+    if (rp == NULL) rp = tp;
+    if (c == 0)
+      {
+#if SQRTREM2_INPLACE
+	rp[1] = high;
+	rp[0] = np[0];
+	cc = CALL_SQRTREM2_INPLACE (sp, rp);
+#else
+	cc = mpn_sqrtrem2 (sp, rp, np);
+#endif
+	rp[1] = cc;
+	return ((rp[0] | cc) != 0) + cc;
+      }
+    else
+      {
+	rl = np[0];
+	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
+	rp[0] = rl << (2*c);
+	CALL_SQRTREM2_INPLACE (sp, rp);
+	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
+	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
+	return rl != 0;
+      }
+  }
+  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
+
+  if ((rp == NULL) && (nn > 8))
+    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
+  TMP_MARK;
+  if (((nn & 1) | c) != 0)
+    {
+      mp_limb_t s0[1], mask;
+      mp_ptr tp, scratch;
+      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
+      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
+      if (c != 0)
+	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
+      else
+	MPN_COPY (tp + (nn & 1), np, nn);
+      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
+      mask = (CNST_LIMB (1) << c) - 1;
+      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
+      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
+	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
+      s0[0] = sp[0] & mask;	/* S mod 2^k */
+      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
+      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
+      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
+      mpn_rshift (sp, sp, tn, c);
+      tp[tn] = rl;
+      if (rp == NULL)
+	rp = tp;
+      c = c << 1;
+      if (c < GMP_NUMB_BITS)
+	tn++;
+      else
+	{
+	  tp++;
+	  c -= GMP_NUMB_BITS;
+	}
+      if (c != 0)
+	mpn_rshift (rp, tp, tn, c);
+      else
+	MPN_COPY_INCR (rp, tp, tn);
+      rn = tn;
+    }
+  else
+    {
+      if (rp != np)
+	{
+	  if (rp == NULL) /* nn <= 8 */
+	    rp = TMP_SALLOC_LIMBS (nn);
+	  MPN_COPY (rp, np, nn);
+	}
+      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
+    }
+
+  MPN_NORMALIZE (rp, rn);
+
+  TMP_FREE;
+  return rn;
+}
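+
+/* Usage sketch (illustrative, not from the original source): taking the
+   square root of N = 2^68 on a 64-bit-limb build; sp needs (nn + 1) / 2
+   limbs and rp (if non-NULL) nn limbs:
+
+     mp_limb_t np[2] = { 0, CNST_LIMB(1) << 4 };   // N = 2^68
+     mp_limb_t sp[1], rp[2];
+     mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);
+     // now sp[0] == CNST_LIMB(1) << 34 and rn == 0: a perfect square.
+*/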
diff --git a/third_party/gmp/mpn/generic/strongfibo.c b/third_party/gmp/mpn/generic/strongfibo.c
new file mode 100644
index 0000000..ffd038a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/strongfibo.c
@@ -0,0 +1,216 @@
+/* mpn_strongfibo -- strong Fibonacci test, modulo m.
+
+Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include "gmp-impl.h"
+
+/* Stores |{ap,n}-{bp,n}| in {rp,n},
+   returns the sign of {ap,n}-{bp,n}. */
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  mp_limb_t  x, y;
+  while (--n >= 0)
+    {
+      x = ap[n];
+      y = bp[n];
+      if (x != y)
+        {
+          ++n;
+          if (x > y)
+            {
+              ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n));
+              return 1;
+            }
+          else
+            {
+              ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n));
+              return -1;
+            }
+        }
+      rp[n] = 0;
+    }
+  return 0;
+}
+
+/* Computes at most count terms of the sequence needed by the
+   Lucas-Lehmer-Riesel test, indexing backward:
+   L_i = L_{i+1}^2 - 2
+
+   The sequence is computed modulo M = {mp, mn}.
+   The starting point is given in L_{count+1} = {lp, mn}.
+   The scratch area pointed to by sp needs space for at least 3 * mn + 1 limbs.
+
+   Returns the index i>0 if L_i = 0 (mod M) is found within the
+   computed count terms of the sequence.  Otherwise it returns zero.
+
+   Note: (+/-2)^2-2=2, (+/-1)^2-2=-1, 0^2-2=-2
+ */
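+
+/* Worked illustration: for M = 7 = 2^3 - 1 and the classical
+   Lucas-Lehmer start L = 4, a single step gives 4^2 - 2 = 14 = 0 (mod 7),
+   so the zero term is found at the first computed index.  */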
+
+static mp_bitcnt_t
+mpn_llriter (mp_ptr lp, mp_srcptr mp, mp_size_t mn, mp_bitcnt_t count, mp_ptr sp)
+{
+  do
+    {
+      mpn_sqr (sp, lp, mn);
+      mpn_tdiv_qr (sp + 2 * mn, lp, 0, sp, 2 * mn, mp, mn);
+      if (lp[0] < 5)
+	{
+	  /* If L^2 % M < 5, |L^2 % M - 2| <= 2 */
+	  if (mn == 1 || mpn_zero_p (lp + 1, mn - 1))
+	    return (lp[0] == 2) ? count : 0;
+	  else
+	    MPN_DECR_U (lp, mn, 2);
+	}
+      else
+	lp[0] -= 2;
+    } while (--count != 0);
+  return 0;
+}
+
+/* Store the Lucas number L[n] at lp (maybe; see below), computed modulo
+   m.  lp and scratch should have room for mn*2+1 limbs.
+
+   Normally, returns the size of L[n].
+
+   If F[n] is zero modulo m, or L[n] is, returns 0 and lp is
+   undefined.
+*/
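+
+/* The computation below uses the Lucas identity L[n] = F[n] + 2*F[n-1];
+   the return value of mpn_fib2m flags whether the two residues carry
+   opposite signs.  */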
+
+static mp_size_t
+mpn_lucm (mp_ptr lp, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn, mp_ptr scratch)
+{
+  int		neg;
+  mp_limb_t	cy;
+
+  ASSERT (! MPN_OVERLAP_P (lp, MAX(2*mn+1,5), scratch, MAX(2*mn+1,5)));
+  ASSERT (nn > 0);
+
+  neg = mpn_fib2m (lp, scratch, np, nn, mp, mn);
+
+  /* F[n] = +/-{lp, mn}, F[n-1] = +/-{scratch, mn} */
+  if (mpn_zero_p (lp, mn))
+    return 0;
+
+  if (neg) /* One sign is opposite, use sub instead of add. */
+    {
+#if HAVE_NATIVE_mpn_rsblsh1_n || HAVE_NATIVE_mpn_sublsh1_n
+#if HAVE_NATIVE_mpn_rsblsh1_n
+      cy = mpn_rsblsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */
+#else
+      cy = mpn_sublsh1_n (lp, lp, scratch, mn); /* L[n] = -/+(F[n]-(-2F[n-1])) */
+      if (cy != 0)
+	cy = mpn_add_n (lp, lp, mp, mn) - cy;
+#endif
+      if (cy > 1)
+	cy += mpn_add_n (lp, lp, mp, mn);
+#else
+      cy = mpn_lshift (scratch, scratch, mn, 1); /* 2F[n-1] */
+      if (UNLIKELY (cy))
+	cy -= mpn_sub_n (lp, scratch, lp, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */
+      else
+	abs_sub_n (lp, lp, scratch, mn);
+#endif
+      ASSERT (cy <= 1);
+    }
+  else
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n
+      cy = mpn_addlsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]+F[n])) */
+#else
+      cy = mpn_lshift (scratch, scratch, mn, 1);
+      cy+= mpn_add_n (lp, lp, scratch, mn);
+#endif
+      ASSERT (cy <= 2);
+    }
+  while (cy || mpn_cmp (lp, mp, mn) >= 0)
+    cy -= mpn_sub_n (lp, lp, mp, mn);
+  MPN_NORMALIZE (lp, mn);
+  return mn;
+}
+
+int
+mpn_strongfibo (mp_srcptr mp, mp_size_t mn, mp_ptr scratch)
+{
+  mp_ptr	lp, sp;
+  mp_size_t	en;
+  mp_bitcnt_t	b0;
+  TMP_DECL;
+
+#if GMP_NUMB_BITS % 4 == 0
+  b0 = mpn_scan0 (mp, 0);
+#else
+  {
+    mpz_t m = MPZ_ROINIT_N(mp, mn);
+    b0 = mpz_scan0 (m, 0);
+  }
+  if (UNLIKELY (b0 == mn * GMP_NUMB_BITS))
+    {
+      en = 1;
+      scratch [0] = 1;
+    }
+  else
+#endif
+    {
+      int cnt = b0 % GMP_NUMB_BITS;
+      en = b0 / GMP_NUMB_BITS;
+      if (LIKELY (cnt != 0))
+	mpn_rshift (scratch, mp + en, mn - en, cnt);
+      else
+	MPN_COPY (scratch, mp + en, mn - en);
+      en = mn - en;
+      scratch [0] |= 1;
+      en -= scratch [en - 1] == 0;
+    }
+  TMP_MARK;
+
+  lp = TMP_ALLOC_LIMBS (4 * mn + 6);
+  sp = lp + 2 * mn + 3;
+  en = mpn_lucm (sp, scratch, en, mp, mn, lp);
+  if (en != 0 && LIKELY (--b0 != 0))
+    {
+      mpn_sqr (lp, sp, en);
+      lp [0] |= 2; /* V^2 + 2 */
+      if (LIKELY (2 * en >= mn))
+	mpn_tdiv_qr (sp, lp, 0, lp, 2 * en, mp, mn);
+      else
+	MPN_ZERO (lp + 2 * en, mn - 2 * en);
+      if (! mpn_zero_p (lp, mn) && LIKELY (--b0 != 0))
+	b0 = mpn_llriter (lp, mp, mn, b0, lp + mn + 1);
+    }
+  TMP_FREE;
+  return (b0 != 0);
+}
diff --git a/third_party/gmp/mpn/generic/sub.c b/third_party/gmp/mpn/generic/sub.c
new file mode 100644
index 0000000..df0afd6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sub.c
@@ -0,0 +1,33 @@
+/* mpn_sub - subtract mpn from mpn.
+
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_FORCE_mpn_sub 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/generic/sub_1.c b/third_party/gmp/mpn/generic/sub_1.c
new file mode 100644
index 0000000..a20f191
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sub_1.c
@@ -0,0 +1,33 @@
+/* mpn_sub_1 - subtract limb from mpn.
+
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define __GMP_FORCE_mpn_sub_1 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/generic/sub_err1_n.c b/third_party/gmp/mpn/generic/sub_err1_n.c
new file mode 100644
index 0000000..beca57e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sub_err1_n.c
@@ -0,0 +1,100 @@
+/* mpn_sub_err1_n -- sub_n with one error term
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  Computes:
+
+  (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy,
+  return value is borrow out.
+
+  (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy).
+  Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep.
+
+  Requires n >= 1.
+
+  None of the outputs may overlap each other or any of the inputs, except
+  that {rp,n} may be equal to {up,n} or {vp,n}.
+*/
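+/* For instance (illustration): if the only borrow is the one out of limb
+   i, i.e. c[i+1] = 1 and all other c[j] = 0, then {ep,2} holds yp[n-1-i].  */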
+mp_limb_t
+mpn_sub_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+		mp_ptr ep, mp_srcptr yp,
+                mp_size_t n, mp_limb_t cy)
+{
+  mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, up, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n));
+
+  yp += n - 1;
+  el = eh = 0;
+
+  do
+    {
+      yl = *yp--;
+      ul = *up++;
+      vl = *vp++;
+
+      /* ordinary sub_n */
+      SUBC_LIMB (cy1, sl, ul, vl);
+      SUBC_LIMB (cy2, rl, sl, cy);
+      cy = cy1 | cy2;
+      *rp++ = rl;
+
+      /* update (eh:el) */
+      zl = (-cy) & yl;
+      el += zl;
+      eh += el < zl;
+    }
+  while (--n);
+
+#if GMP_NAIL_BITS != 0
+  eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS);
+  el &= GMP_NUMB_MASK;
+#endif
+
+  ep[0] = el;
+  ep[1] = eh;
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/sub_err2_n.c b/third_party/gmp/mpn/generic/sub_err2_n.c
new file mode 100644
index 0000000..1edf8d6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sub_err2_n.c
@@ -0,0 +1,116 @@
+/* mpn_sub_err2_n -- sub_n with two error terms
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  Computes:
+
+  (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy,
+  return value is borrow out.
+
+  (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy).
+  Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
+           c[1]*yp2[n-1] + ... + c[n]*yp2[0],
+  stores two-limb results at {ep,2} and {ep+2,2} respectively.
+
+  Requires n >= 1.
+
+  None of the outputs may overlap each other or any of the inputs, except
+  that {rp,n} may be equal to {up,n} or {vp,n}.
+*/
+mp_limb_t
+mpn_sub_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+                mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2,
+                mp_size_t n, mp_limb_t cy)
+{
+  mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, up, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n));
+
+  yp1 += n - 1;
+  yp2 += n - 1;
+  el1 = eh1 = 0;
+  el2 = eh2 = 0;
+
+  do
+    {
+      yl1 = *yp1--;
+      yl2 = *yp2--;
+      ul = *up++;
+      vl = *vp++;
+
+      /* ordinary sub_n */
+      SUBC_LIMB (cy1, sl, ul, vl);
+      SUBC_LIMB (cy2, rl, sl, cy);
+      cy = cy1 | cy2;
+      *rp++ = rl;
+
+      /* update (eh1:el1) */
+      zl1 = (-cy) & yl1;
+      el1 += zl1;
+      eh1 += el1 < zl1;
+
+      /* update (eh2:el2) */
+      zl2 = (-cy) & yl2;
+      el2 += zl2;
+      eh2 += el2 < zl2;
+    }
+  while (--n);
+
+#if GMP_NAIL_BITS != 0
+  eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
+  el1 &= GMP_NUMB_MASK;
+  eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
+  el2 &= GMP_NUMB_MASK;
+#endif
+
+  ep[0] = el1;
+  ep[1] = eh1;
+  ep[2] = el2;
+  ep[3] = eh2;
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/sub_err3_n.c b/third_party/gmp/mpn/generic/sub_err3_n.c
new file mode 100644
index 0000000..2db3c63
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sub_err3_n.c
@@ -0,0 +1,131 @@
+/* mpn_sub_err3_n -- sub_n with three error terms
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/*
+  Computes:
+
+  (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy,
+  return value is borrow out.
+
+  (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy).
+  Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0],
+           c[1]*yp2[n-1] + ... + c[n]*yp2[0],
+           c[1]*yp3[n-1] + ... + c[n]*yp3[0],
+  stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively.
+
+  Requires n >= 1.
+
+  None of the outputs may overlap each other or any of the inputs, except
+  that {rp,n} may be equal to {up,n} or {vp,n}.
+*/
+mp_limb_t
+mpn_sub_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+                mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3,
+                mp_size_t n, mp_limb_t cy)
+{
+  mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, up, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, yp2, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n));
+  ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n));
+
+  yp1 += n - 1;
+  yp2 += n - 1;
+  yp3 += n - 1;
+  el1 = eh1 = 0;
+  el2 = eh2 = 0;
+  el3 = eh3 = 0;
+
+  do
+    {
+      yl1 = *yp1--;
+      yl2 = *yp2--;
+      yl3 = *yp3--;
+      ul = *up++;
+      vl = *vp++;
+
+      /* ordinary sub_n */
+      SUBC_LIMB (cy1, sl, ul, vl);
+      SUBC_LIMB (cy2, rl, sl, cy);
+      cy = cy1 | cy2;
+      *rp++ = rl;
+
+      /* update (eh1:el1) */
+      zl1 = (-cy) & yl1;
+      el1 += zl1;
+      eh1 += el1 < zl1;
+
+      /* update (eh2:el2) */
+      zl2 = (-cy) & yl2;
+      el2 += zl2;
+      eh2 += el2 < zl2;
+
+      /* update (eh3:el3) */
+      zl3 = (-cy) & yl3;
+      el3 += zl3;
+      eh3 += el3 < zl3;
+    }
+  while (--n);
+
+#if GMP_NAIL_BITS != 0
+  eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS);
+  el1 &= GMP_NUMB_MASK;
+  eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS);
+  el2 &= GMP_NUMB_MASK;
+  eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS);
+  el3 &= GMP_NUMB_MASK;
+#endif
+
+  ep[0] = el1;
+  ep[1] = eh1;
+  ep[2] = el2;
+  ep[3] = eh2;
+  ep[4] = el3;
+  ep[5] = eh3;
+
+  return cy;
+}
diff --git a/third_party/gmp/mpn/generic/sub_n.c b/third_party/gmp/mpn/generic/sub_n.c
new file mode 100644
index 0000000..b192c96
--- /dev/null
+++ b/third_party/gmp/mpn/generic/sub_n.c
@@ -0,0 +1,89 @@
+/* mpn_sub_n -- Subtract equal length limb vectors.
+
+Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, sl, rl, cy, cy1, cy2;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+
+  cy = 0;
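+  /* Borrow detection without a carry flag: ul - vl wraps around exactly
+     when vl > ul, i.e. when sl > ul; likewise rl > sl detects the borrow
+     from subtracting the incoming cy.  */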
+  do
+    {
+      ul = *up++;
+      vl = *vp++;
+      sl = ul - vl;
+      cy1 = sl > ul;
+      rl = sl - cy;
+      cy2 = rl > sl;
+      cy = cy1 | cy2;
+      *rp++ = rl;
+    }
+  while (--n != 0);
+
+  return cy;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 1
+
+mp_limb_t
+mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+  mp_limb_t ul, vl, rl, cy;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P (rp, up, n));
+  ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n));
+
+  cy = 0;
+  do
+    {
+      ul = *up++;
+      vl = *vp++;
+      rl = ul - vl - cy;
+      cy = rl >> (GMP_LIMB_BITS - 1);
+      *rp++ = rl & GMP_NUMB_MASK;
+    }
+  while (--n != 0);
+
+  return cy;
+}
+
+#endif
diff --git a/third_party/gmp/mpn/generic/submul_1.c b/third_party/gmp/mpn/generic/submul_1.c
new file mode 100644
index 0000000..4744274
--- /dev/null
+++ b/third_party/gmp/mpn/generic/submul_1.c
@@ -0,0 +1,144 @@
+/* mpn_submul_1 -- multiply the N long limb vector pointed to by UP by VL,
+   subtract the N least significant limbs of the product from the limb
+   vector pointed to by RP.  Return the most significant limb of the
+   product, adjusted for carry-out from the subtraction.
+
+Copyright 1992-1994, 1996, 2000, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+#if GMP_NAIL_BITS == 0
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+{
+  mp_limb_t u0, crec, c, p1, p0, r0;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+
+  crec = 0;
+  do
+    {
+      u0 = *up++;
+      umul_ppmm (p1, p0, u0, v0);
+
+      r0 = *rp;
+
+      p0 = r0 - p0;
+      c = r0 < p0;
+
+      p1 = p1 + c;
+
+      r0 = p0 - crec;		/* cycle 0, 3, ... */
+      c = p0 < r0;		/* cycle 1, 4, ... */
+
+      crec = p1 + c;		/* cycle 2, 5, ... */
+
+      *rp++ = r0;
+    }
+  while (--n != 0);
+
+  return crec;
+}
+
+#endif
+
+#if GMP_NAIL_BITS == 1
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+{
+  mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, cl, xl, c1, c2, c3;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT_MPN (rp, n);
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (v0);
+
+  shifted_v0 = v0 << GMP_NAIL_BITS;
+  cl = 0;
+  prev_p1 = 0;
+  do
+    {
+      u0 = *up++;
+      r0 = *rp;
+      umul_ppmm (p1, p0, u0, shifted_v0);
+      p0 >>= GMP_NAIL_BITS;
+      SUBC_LIMB (c1, xl, r0, prev_p1);
+      SUBC_LIMB (c2, xl, xl, p0);
+      SUBC_LIMB (c3, xl, xl, cl);
+      cl = c1 + c2 + c3;
+      *rp++ = xl;
+      prev_p1 = p1;
+    }
+  while (--n != 0);
+
+  return prev_p1 + cl;
+}
+
+#endif
+
+#if GMP_NAIL_BITS >= 2
+
+mp_limb_t
+mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+{
+  mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, xw, cl, xl;
+
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n));
+  ASSERT_MPN (rp, n);
+  ASSERT_MPN (up, n);
+  ASSERT_LIMB (v0);
+
+  shifted_v0 = v0 << GMP_NAIL_BITS;
+  cl = 0;
+  prev_p1 = 0;
+  do
+    {
+      u0 = *up++;
+      r0 = *rp;
+      umul_ppmm (p1, p0, u0, shifted_v0);
+      p0 >>= GMP_NAIL_BITS;
+      xw = r0 - (prev_p1 + p0) + cl;
+      cl = (mp_limb_signed_t) xw >> GMP_NUMB_BITS; /* FIXME: non-portable */
+      xl = xw & GMP_NUMB_MASK;
+      *rp++ = xl;
+      prev_p1 = p1;
+    }
+  while (--n != 0);
+
+  return prev_p1 - cl;
+}
+
+#endif
diff --git a/third_party/gmp/mpn/generic/tdiv_qr.c b/third_party/gmp/mpn/generic/tdiv_qr.c
new file mode 100644
index 0000000..92ff33c
--- /dev/null
+++ b/third_party/gmp/mpn/generic/tdiv_qr.c
@@ -0,0 +1,386 @@
+/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and
+   write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp.  If
+   qxn is non-zero, generate that many fraction limbs and append them after the
+   other quotient limbs, and update the remainder accordingly.  The input
+   operands are unaffected.
+
+   Preconditions:
+   1. The most significant limb of the divisor must be non-zero.
+   2. nn >= dn, even if qxn is non-zero.  (??? relax this ???)
+
+   The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time
+   complexity of multiplication.
+
+Copyright 1997, 2000-2002, 2005, 2009, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+void
+mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn,
+	     mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn)
+{
+  ASSERT_ALWAYS (qxn == 0);
+
+  ASSERT (nn >= 0);
+  ASSERT (dn >= 0);
+  ASSERT (dn == 0 || dp[dn - 1] != 0);
+  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, np, nn));
+  ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, dp, dn));
+
+  switch (dn)
+    {
+    case 0:
+      DIVIDE_BY_ZERO;
+
+    case 1:
+      {
+	rp[0] = mpn_divrem_1 (qp, (mp_size_t) 0, np, nn, dp[0]);
+	return;
+      }
+
+    case 2:
+      {
+	mp_ptr n2p;
+	mp_limb_t qhl, cy;
+	TMP_DECL;
+	TMP_MARK;
+	if ((dp[1] & GMP_NUMB_HIGHBIT) == 0)
+	  {
+	    int cnt;
+	    mp_limb_t d2p[2];
+	    count_leading_zeros (cnt, dp[1]);
+	    cnt -= GMP_NAIL_BITS;
+	    d2p[1] = (dp[1] << cnt) | (dp[0] >> (GMP_NUMB_BITS - cnt));
+	    d2p[0] = (dp[0] << cnt) & GMP_NUMB_MASK;
+	    n2p = TMP_ALLOC_LIMBS (nn + 1);
+	    cy = mpn_lshift (n2p, np, nn, cnt);
+	    n2p[nn] = cy;
+	    qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p);
+	    if (cy == 0)
+	      qp[nn - 2] = qhl;	/* always store nn-2+1 quotient limbs */
+	    rp[0] = (n2p[0] >> cnt)
+	      | ((n2p[1] << (GMP_NUMB_BITS - cnt)) & GMP_NUMB_MASK);
+	    rp[1] = (n2p[1] >> cnt);
+	  }
+	else
+	  {
+	    n2p = TMP_ALLOC_LIMBS (nn);
+	    MPN_COPY (n2p, np, nn);
+	    qhl = mpn_divrem_2 (qp, 0L, n2p, nn, dp);
+	    qp[nn - 2] = qhl;	/* always store nn-2+1 quotient limbs */
+	    rp[0] = n2p[0];
+	    rp[1] = n2p[1];
+	  }
+	TMP_FREE;
+	return;
+      }
+
+    default:
+      {
+	int adjust;
+	gmp_pi1_t dinv;
+	TMP_DECL;
+	TMP_MARK;
+	adjust = np[nn - 1] >= dp[dn - 1];	/* conservative tests for quotient size */
+	if (nn + adjust >= 2 * dn)
+	  {
+	    mp_ptr n2p, d2p;
+	    mp_limb_t cy;
+	    int cnt;
+
+	    qp[nn - dn] = 0;			  /* zero high quotient limb */
+	    if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) /* normalize divisor */
+	      {
+		count_leading_zeros (cnt, dp[dn - 1]);
+		cnt -= GMP_NAIL_BITS;
+		d2p = TMP_ALLOC_LIMBS (dn);
+		mpn_lshift (d2p, dp, dn, cnt);
+		n2p = TMP_ALLOC_LIMBS (nn + 1);
+		cy = mpn_lshift (n2p, np, nn, cnt);
+		n2p[nn] = cy;
+		nn += adjust;
+	      }
+	    else
+	      {
+		cnt = 0;
+		d2p = (mp_ptr) dp;
+		n2p = TMP_ALLOC_LIMBS (nn + 1);
+		MPN_COPY (n2p, np, nn);
+		n2p[nn] = 0;
+		nn += adjust;
+	      }
+
+	    invert_pi1 (dinv, d2p[dn - 1], d2p[dn - 2]);
+	    if (BELOW_THRESHOLD (dn, DC_DIV_QR_THRESHOLD))
+	      mpn_sbpi1_div_qr (qp, n2p, nn, d2p, dn, dinv.inv32);
+	    else if (BELOW_THRESHOLD (dn, MUPI_DIV_QR_THRESHOLD) ||   /* fast condition */
+		     BELOW_THRESHOLD (nn, 2 * MU_DIV_QR_THRESHOLD) || /* fast condition */
+		     (double) (2 * (MU_DIV_QR_THRESHOLD - MUPI_DIV_QR_THRESHOLD)) * dn /* slow... */
+		     + (double) MUPI_DIV_QR_THRESHOLD * nn > (double) dn * nn)    /* ...condition */
+	      mpn_dcpi1_div_qr (qp, n2p, nn, d2p, dn, &dinv);
+	    else
+	      {
+		mp_size_t itch = mpn_mu_div_qr_itch (nn, dn, 0);
+		mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
+		mpn_mu_div_qr (qp, rp, n2p, nn, d2p, dn, scratch);
+		n2p = rp;
+	      }
+
+	    if (cnt != 0)
+	      mpn_rshift (rp, n2p, dn, cnt);
+	    else
+	      MPN_COPY (rp, n2p, dn);
+	    TMP_FREE;
+	    return;
+	  }
+
+	/* When we come here, the numerator/partial remainder is less
+	   than twice the size of the denominator.  */
+
+	  {
+	    /* Problem:
+
+	       Divide a numerator N with nn limbs by a denominator D with dn
+	       limbs forming a quotient of qn=nn-dn+1 limbs.  When qn is small
+	       compared to dn, conventional division algorithms perform poorly.
+	       We want an algorithm that has an expected running time that is
+	       dependent only on qn.
+
+	       Algorithm (very informally stated):
+
+	       1) Divide the 2 x qn most significant limbs from the numerator
+		  by the qn most significant limbs from the denominator.  Call
+		  the result qest.  This is either the correct quotient, or
+		  1 or 2 too large.  Compute the remainder from the
+		  division.  (This step is implemented by an mpn_divrem call.)
+
+	       2) Is the most significant limb from the remainder < p, where p
+		  is the product of the most significant limb from the quotient
+		  and the next(d)?  (Next(d) denotes the next ignored limb from
+		  the denominator.)  If it is, decrement qest, and adjust the
+		  remainder accordingly.
+
+	       3) Is the remainder >= qest?  If it is, qest is the desired
+		  quotient.  The algorithm terminates.
+
+	       4) Subtract qest x next(d) from the remainder.  If there is
+		  borrow out, decrement qest, and adjust the remainder
+		  accordingly.
+
+	       5) Skip one word from the denominator (i.e., let next(d) denote
+		  the next less significant limb).  */
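+
+	    /* Steps 2)-4) rely on the classical bound that a quotient
+	       estimated from a normalized, truncated divisor is never too
+	       small and is at most 2 too large.  */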
+
+	    mp_size_t qn;
+	    mp_ptr n2p, d2p;
+	    mp_ptr tp;
+	    mp_limb_t cy;
+	    mp_size_t in, rn;
+	    mp_limb_t quotient_too_large;
+	    unsigned int cnt;
+
+	    qn = nn - dn;
+	    qp[qn] = 0;				/* zero high quotient limb */
+	    qn += adjust;			/* qn cannot become bigger */
+
+	    if (qn == 0)
+	      {
+		MPN_COPY (rp, np, dn);
+		TMP_FREE;
+		return;
+	      }
+
+	    in = dn - qn;		/* (at least partially) ignored # of limbs in ops */
+	    /* Normalize denominator by shifting it to the left such that its
+	       most significant bit is set.  Then shift the numerator the same
+	       amount, to mathematically preserve quotient.  */
+	    if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0)
+	      {
+		count_leading_zeros (cnt, dp[dn - 1]);
+		cnt -= GMP_NAIL_BITS;
+
+		d2p = TMP_ALLOC_LIMBS (qn);
+		mpn_lshift (d2p, dp + in, qn, cnt);
+		d2p[0] |= dp[in - 1] >> (GMP_NUMB_BITS - cnt);
+
+		n2p = TMP_ALLOC_LIMBS (2 * qn + 1);
+		cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt);
+		if (adjust)
+		  {
+		    n2p[2 * qn] = cy;
+		    n2p++;
+		  }
+		else
+		  {
+		    n2p[0] |= np[nn - 2 * qn - 1] >> (GMP_NUMB_BITS - cnt);
+		  }
+	      }
+	    else
+	      {
+		cnt = 0;
+		d2p = (mp_ptr) dp + in;
+
+		n2p = TMP_ALLOC_LIMBS (2 * qn + 1);
+		MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn);
+		if (adjust)
+		  {
+		    n2p[2 * qn] = 0;
+		    n2p++;
+		  }
+	      }
+
+	    /* Get an approximate quotient using the extracted operands.  */
+	    if (qn == 1)
+	      {
+		mp_limb_t q0, r0;
+		udiv_qrnnd (q0, r0, n2p[1], n2p[0] << GMP_NAIL_BITS, d2p[0] << GMP_NAIL_BITS);
+		n2p[0] = r0 >> GMP_NAIL_BITS;
+		qp[0] = q0;
+	      }
+	    else if (qn == 2)
+	      mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); /* FIXME: obsolete function */
+	    else
+	      {
+		invert_pi1 (dinv, d2p[qn - 1], d2p[qn - 2]);
+		if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD))
+		  mpn_sbpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, dinv.inv32);
+		else if (BELOW_THRESHOLD (qn, MU_DIV_QR_THRESHOLD))
+		  mpn_dcpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, &dinv);
+		else
+		  {
+		    mp_size_t itch = mpn_mu_div_qr_itch (2 * qn, qn, 0);
+		    mp_ptr scratch = TMP_ALLOC_LIMBS (itch);
+		    mp_ptr r2p = rp;
+		    if (np == r2p)	/* If N and R share space, put ... */
+		      r2p += nn - qn;	/* intermediate remainder at N's upper end. */
+		    mpn_mu_div_qr (qp, r2p, n2p, 2 * qn, d2p, qn, scratch);
+		    MPN_COPY (n2p, r2p, qn);
+		  }
+	      }
+
+	    rn = qn;
+	    /* Multiply the first ignored divisor limb by the most significant
+	       quotient limb.  If that product is > the partial remainder's
+	       most significant limb, we know the quotient is too large.  This
+	       test quickly catches most cases where the quotient is too large;
+	       it catches all cases where the quotient is 2 too large.  */
+	    {
+	      mp_limb_t dl, x;
+	      mp_limb_t h, dummy;
+
+	      if (in - 2 < 0)
+		dl = 0;
+	      else
+		dl = dp[in - 2];
+
+#if GMP_NAIL_BITS == 0
+	      x = (dp[in - 1] << cnt) | ((dl >> 1) >> ((~cnt) % GMP_LIMB_BITS));
+#else
+	      x = (dp[in - 1] << cnt) & GMP_NUMB_MASK;
+	      if (cnt != 0)
+		x |= dl >> (GMP_NUMB_BITS - cnt);
+#endif
+	      umul_ppmm (h, dummy, x, qp[qn - 1] << GMP_NAIL_BITS);
+
+	      if (n2p[qn - 1] < h)
+		{
+		  mp_limb_t cy;
+
+		  mpn_decr_u (qp, (mp_limb_t) 1);
+		  cy = mpn_add_n (n2p, n2p, d2p, qn);
+		  if (cy)
+		    {
+		      /* The partial remainder is safely large.  */
+		      n2p[qn] = cy;
+		      ++rn;
+		    }
+		}
+	    }
+
+	    quotient_too_large = 0;
+	    if (cnt != 0)
+	      {
+		mp_limb_t cy1, cy2;
+
+		/* Append partially used numerator limb to partial remainder.  */
+		cy1 = mpn_lshift (n2p, n2p, rn, GMP_NUMB_BITS - cnt);
+		n2p[0] |= np[in - 1] & (GMP_NUMB_MASK >> cnt);
+
+		/* Update partial remainder with partially used divisor limb.  */
+		cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (GMP_NUMB_MASK >> cnt));
+		if (qn != rn)
+		  {
+		    ASSERT_ALWAYS (n2p[qn] >= cy2);
+		    n2p[qn] -= cy2;
+		  }
+		else
+		  {
+		    n2p[qn] = cy1 - cy2; /* & GMP_NUMB_MASK; */
+
+		    quotient_too_large = (cy1 < cy2);
+		    ++rn;
+		  }
+		--in;
+	      }
+	    /* True: partial remainder now is neutral, i.e., it is not shifted up.  */
+
+	    tp = TMP_ALLOC_LIMBS (dn);
+
+	    if (in < qn)
+	      {
+		if (in == 0)
+		  {
+		    MPN_COPY (rp, n2p, rn);
+		    ASSERT_ALWAYS (rn == dn);
+		    goto foo;
+		  }
+		mpn_mul (tp, qp, qn, dp, in);
+	      }
+	    else
+	      mpn_mul (tp, dp, in, qp, qn);
+
+	    cy = mpn_sub (n2p, n2p, rn, tp + in, qn);
+	    MPN_COPY (rp + in, n2p, dn - in);
+	    quotient_too_large |= cy;
+	    cy = mpn_sub_n (rp, np, tp, in);
+	    cy = mpn_sub_1 (rp + in, rp + in, rn, cy);
+	    quotient_too_large |= cy;
+	  foo:
+	    if (quotient_too_large)
+	      {
+		mpn_decr_u (qp, (mp_limb_t) 1);
+		mpn_add_n (rp, rp, dp, dn);
+	      }
+	  }
+	TMP_FREE;
+	return;
+      }
+    }
+}
diff --git a/third_party/gmp/mpn/generic/toom22_mul.c b/third_party/gmp/mpn/generic/toom22_mul.c
new file mode 100644
index 0000000..64f024a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom22_mul.c
@@ -0,0 +1,221 @@
+/* mpn_toom22_mul -- Multiply {ap,an} and {bp,bn} where an >= bn.  Or more
+   accurately, bn <= an < 2bn.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2010, 2012, 2014, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -1, 0, +inf
+
+  <-s--><--n-->
+   ____ ______
+  |_a1_|___a0_|
+   |b1_|___b0_|
+   <-t-><--n-->
+
+  v0  =  a0     * b0       #   A(0)*B(0)
+  vm1 = (a0- a1)*(b0- b1)  #  A(-1)*B(-1)
+  vinf=      a1 *     b1   # A(inf)*B(inf)
+*/
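+
+/* Recomposition, with x = B^n:
+     A*B = vinf*x^2 + (v0 + vinf - vm1)*x + v0,
+   where vm1 enters with the sign recorded in vm1_neg.  */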
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_mul_toom22   1
+#else
+#define MAYBE_mul_toom22						\
+  (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
+#endif
+
+#define TOOM22_MUL_N_REC(p, a, b, n, ws)				\
+  do {									\
+    if (! MAYBE_mul_toom22						\
+	|| BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
+      mpn_mul_basecase (p, a, n, b, n);					\
+    else								\
+      mpn_toom22_mul (p, a, n, b, n, ws);				\
+  } while (0)
+
+/* Normally, this calls mul_basecase or toom22_mul.  But when the fraction
+   MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD is large, an initially small
+   relative unbalance will become a larger and larger relative unbalance with
+   each recursion (the difference s-t will be invariant over recursive calls).
+   Therefore, we need to call toom32_mul.  FIXME: Suppress depending on
+   MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD and on MUL_TOOM22_THRESHOLD.  */
+#define TOOM22_MUL_REC(p, a, an, b, bn, ws)				\
+  do {									\
+    if (! MAYBE_mul_toom22						\
+	|| BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD))			\
+      mpn_mul_basecase (p, a, an, b, bn);				\
+    else if (4 * an < 5 * bn)						\
+      mpn_toom22_mul (p, a, an, b, bn, ws);				\
+    else								\
+      mpn_toom32_mul (p, a, an, b, bn, ws);				\
+  } while (0)
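+
+/* Why s - t stays invariant (a short derivation from the splitting used
+   below): mpn_toom22_mul sets s = floor(an/2), n = ceil(an/2) and
+   t = bn - n, so
+
+     s - t = floor(an/2) - (bn - ceil(an/2)) = an - bn.
+
+   The absolute imbalance an - bn is thus handed unchanged to the
+   {a1,s} x {b1,t} recursion while the operand sizes halve, so the
+   relative imbalance roughly doubles per level, until the 4*an >= 5*bn
+   test above routes the call to mpn_toom32_mul instead.  */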
+
+void
+mpn_toom22_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  const int __gmpn_cpuvec_initialized = 1;
+  mp_size_t n, s, t;
+  int vm1_neg;
+  mp_limb_t cy, cy2;
+  mp_ptr asm1;
+  mp_ptr bsm1;
+
+#define a0  ap
+#define a1  (ap + n)
+#define b0  bp
+#define b1  (bp + n)
+
+  s = an >> 1;
+  n = an - s;
+  t = bn - n;
+
+  ASSERT (an >= bn);
+
+  ASSERT (0 < s && s <= n && s >= n - 1);
+  ASSERT (0 < t && t <= s);
+
+  asm1 = pp;
+  bsm1 = pp + n;
+
+  vm1_neg = 0;
+
+  /* Compute asm1.  */
+  if (s == n)
+    {
+      if (mpn_cmp (a0, a1, n) < 0)
+	{
+	  mpn_sub_n (asm1, a1, a0, n);
+	  vm1_neg = 1;
+	}
+      else
+	{
+	  mpn_sub_n (asm1, a0, a1, n);
+	}
+    }
+  else /* n - s == 1 */
+    {
+      if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0)
+	{
+	  mpn_sub_n (asm1, a1, a0, s);
+	  asm1[s] = 0;
+	  vm1_neg = 1;
+	}
+      else
+	{
+	  asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s);
+	}
+    }
+
+  /* Compute bsm1.  */
+  if (t == n)
+    {
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, n);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  mpn_sub_n (bsm1, b0, b1, n);
+	}
+    }
+  else
+    {
+      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, t);
+	  MPN_ZERO (bsm1 + t, n - t);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  mpn_sub (bsm1, b0, n, b1, t);
+	}
+    }
+
+#define v0	pp				/* 2n */
+#define vinf	(pp + 2 * n)			/* s+t */
+#define vm1	scratch				/* 2n */
+#define scratch_out	scratch + 2 * n
+
+  /* vm1, 2n limbs */
+  TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
+
+  if (s > t)  TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out);
+  else        TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out);
+
+  /* v0, 2n limbs */
+  TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out);
+
+  /* H(v0) + L(vinf) */
+  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);
+
+  /* L(v0) + H(v0) */
+  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);
+
+  /* L(vinf) + H(vinf) */
+  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n);
+
+  if (vm1_neg)
+    cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n);
+  else {
+    cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);
+    if (UNLIKELY (cy + 1 == 0)) { /* cy is negative */
+      /* The total contribution of v0+vinf-vm1 can not be negative. */
+#if WANT_ASSERT
+      /* The borrow in cy stops the propagation of the carry cy2, */
+      ASSERT (cy2 == 1);
+      cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2);
+      ASSERT (cy == 0);
+#else
+      /* we simply fill the area with zeros. */
+      MPN_FILL (pp + 2 * n, n, 0);
+#endif
+      return;
+    }
+  }
+
+  ASSERT (cy  <= 2);
+  ASSERT (cy2 <= 2);
+
+  MPN_INCR_U (pp + 2 * n, s + t, cy2);
+  /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */
+  MPN_INCR_U (pp + 3 * n, s + t - n, cy);
+}
diff --git a/third_party/gmp/mpn/generic/toom2_sqr.c b/third_party/gmp/mpn/generic/toom2_sqr.c
new file mode 100644
index 0000000..4eaa141
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom2_sqr.c
@@ -0,0 +1,155 @@
+/* mpn_toom2_sqr -- Square {ap,an}.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2010, 2012, 2014, 2018 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -1, 0, +inf
+
+  <-s--><--n-->
+   ____ ______
+  |_a1_|___a0_|
+
+  v0  =  a0     ^2  #   A(0)^2
+  vm1 = (a0- a1)^2  #  A(-1)^2
+  vinf=      a1 ^2  # A(inf)^2
+*/
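+
+/* The same recombination as in mpn_toom22_mul, specialized to squaring;
+   a worked example in base B = 10 (illustration only): with a1 = 3,
+   a0 = 5,
+
+     v0   = 5^2     = 25
+     vm1  = (5-3)^2 = 4     (a square, so never negative -- no sign flag)
+     vinf = 3^2     = 9
+
+   and v0 + vinf - vm1 = 30 = 2*a1*a0, giving
+   35^2 = 9*B^2 + 30*B + 25 = 1225.  */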
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_sqr_toom2   1
+#else
+#define MAYBE_sqr_toom2							\
+  (SQR_TOOM3_THRESHOLD >= 2 * SQR_TOOM2_THRESHOLD)
+#endif
+
+#define TOOM2_SQR_REC(p, a, n, ws)					\
+  do {									\
+    if (! MAYBE_sqr_toom2						\
+	|| BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
+      mpn_sqr_basecase (p, a, n);					\
+    else								\
+      mpn_toom2_sqr (p, a, n, ws);					\
+  } while (0)
+
+void
+mpn_toom2_sqr (mp_ptr pp,
+	       mp_srcptr ap, mp_size_t an,
+	       mp_ptr scratch)
+{
+  const int __gmpn_cpuvec_initialized = 1;
+  mp_size_t n, s;
+  mp_limb_t cy, cy2;
+  mp_ptr asm1;
+
+#define a0  ap
+#define a1  (ap + n)
+
+  s = an >> 1;
+  n = an - s;
+
+  ASSERT (0 < s && s <= n && s >= n - 1);
+
+  asm1 = pp;
+
+  /* Compute asm1.  */
+  if (s == n)
+    {
+      if (mpn_cmp (a0, a1, n) < 0)
+	{
+	  mpn_sub_n (asm1, a1, a0, n);
+	}
+      else
+	{
+	  mpn_sub_n (asm1, a0, a1, n);
+	}
+    }
+  else /* n - s == 1 */
+    {
+      if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0)
+	{
+	  mpn_sub_n (asm1, a1, a0, s);
+	  asm1[s] = 0;
+	}
+      else
+	{
+	  asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s);
+	}
+    }
+
+#define v0	pp				/* 2n */
+#define vinf	(pp + 2 * n)			/* s+s */
+#define vm1	scratch				/* 2n */
+#define scratch_out	scratch + 2 * n
+
+  /* vm1, 2n limbs */
+  TOOM2_SQR_REC (vm1, asm1, n, scratch_out);
+
+  /* vinf, s+s limbs */
+  TOOM2_SQR_REC (vinf, a1, s, scratch_out);
+
+  /* v0, 2n limbs */
+  TOOM2_SQR_REC (v0, ap, n, scratch_out);
+
+  /* H(v0) + L(vinf) */
+  cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n);
+
+  /* L(v0) + H(v0) */
+  cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n);
+
+  /* L(vinf) + H(vinf) */
+  cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n);
+
+  cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n);
+
+  ASSERT (cy + 1 <= 3);
+  ASSERT (cy2 <= 2);
+
+  if (LIKELY (cy <= 2)) {
+    MPN_INCR_U (pp + 2 * n, s + s, cy2);
+    MPN_INCR_U (pp + 3 * n, s + s - n, cy);
+  } else { /* cy is negative */
+    /* The total contribution of v0+vinf-vm1 can not be negative. */
+#if WANT_ASSERT
+    /* The borrow in cy stops the propagation of the carry cy2, */
+    ASSERT (cy2 == 1);
+    cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2);
+    ASSERT (cy == 0);
+#else
+    /* we simply fill the area with zeros. */
+    MPN_FILL (pp + 2 * n, n, 0);
+#endif
+  }
+}
diff --git a/third_party/gmp/mpn/generic/toom32_mul.c b/third_party/gmp/mpn/generic/toom32_mul.c
new file mode 100644
index 0000000..f03ba56
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom32_mul.c
@@ -0,0 +1,322 @@
+/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
+   times as large as bn.  Or more accurately, bn < an < 3bn.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Improvements by Marco Bodrato and Niels Möller.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -1, 0, +1, +inf
+
+  <-s-><--n--><--n-->
+   ___ ______ ______
+  |a2_|___a1_|___a0_|
+	|_b1_|___b0_|
+	<-t--><--n-->
+
+  v0  =  a0         * b0      #   A(0)*B(0)
+  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
+  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
+  vinf=          a2 *     b1  # A(inf)*B(inf)
+*/
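+
+/* How the four product coefficients are separated (a sketch of the
+   algebra used further down): writing A(X)*B(X) = x0 + x1*X + x2*X^2
+   + x3*X^3, with x1 = a0*b1 + a1*b0 and x2 = a1*b1 + a2*b0,
+
+     v1  = x0 + x1 + x2 + x3
+     vm1 = x0 - x1 + x2 - x3    (sign tracked in vm1_neg)
+
+   so (v1 + vm1)/2 = x0 + x2 and (v1 - vm1)/2 = x1 + x3; together with
+   v0 = x0 and vinf = x3 this pins down all four coefficients, which is
+   what the interpolation code below carries out limb-wise.  */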
+
+#define TOOM32_MUL_N_REC(p, a, b, n, ws)				\
+  do {									\
+    mpn_mul_n (p, a, b, n);						\
+  } while (0)
+
+void
+mpn_toom32_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int vm1_neg;
+  mp_limb_t cy;
+  mp_limb_signed_t hi;
+  mp_limb_t ap1_hi, bp1_hi;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2 * n)
+#define b0  bp
+#define b1  (bp + n)
+
+  /* Required, to ensure that s + t >= n. */
+  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);
+
+  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);
+
+  s = an - 2 * n;
+  t = bn - n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  ASSERT (s + t >= n);
+
+  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
+#define ap1 (pp)		/* n, most significant limb in ap1_hi */
+#define bp1 (pp + n)		/* n, most significant bit in bp1_hi */
+#define am1 (pp + 2*n)		/* n, most significant bit in hi */
+#define bm1 (pp + 3*n)		/* n */
+#define v1 (scratch)		/* 2n + 1 */
+#define vm1 (pp)		/* 2n + 1 */
+#define scratch_out (scratch + 2*n + 1) /* Currently unused. */
+
+  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */
+
+  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */
+
+  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */
+  ap1_hi = mpn_add (ap1, a0, n, a2, s);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
+    {
+      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
+      hi = 0;
+      vm1_neg = 1;
+    }
+  else
+    {
+      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
+      hi = ap1_hi - (cy & 1);
+      ap1_hi += (cy >> 1);
+      vm1_neg = 0;
+    }
+#else
+  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
+    {
+      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
+      hi = 0;
+      vm1_neg = 1;
+    }
+  else
+    {
+      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
+      vm1_neg = 0;
+    }
+  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
+#endif
+
+  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
+  if (t == n)
+    {
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
+	}
+      bp1_hi = cy >> 1;
+#else
+      bp1_hi = mpn_add_n (bp1, b0, b1, n);
+
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
+	}
+#endif
+    }
+  else
+    {
+      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
+      bp1_hi = mpn_add (bp1, b0, n, b1, t);
+
+      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
+	{
+	  ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
+	  MPN_ZERO (bm1 + t, n - t);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
+	}
+    }
+
+  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
+  if (ap1_hi == 1)
+    {
+      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
+    }
+  else if (ap1_hi == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n
+      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
+#else
+      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
+#endif
+    }
+  else
+    cy = 0;
+  if (bp1_hi != 0)
+    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
+  v1[2 * n] = cy;
+
+  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
+  if (hi)
+    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);
+
+  vm1[2*n] = hi;
+
+  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
+  if (vm1_neg)
+    {
+#if HAVE_NATIVE_mpn_rsh1sub_n
+      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
+#else
+      mpn_sub_n (v1, v1, vm1, 2*n+1);
+      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
+#endif
+    }
+  else
+    {
+#if HAVE_NATIVE_mpn_rsh1add_n
+      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
+#else
+      mpn_add_n (v1, v1, vm1, 2*n+1);
+      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
+#endif
+    }
+
+  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence
+
+     y = x1 + x3 + (x0 + x2) * B
+       = (x0 + x2) * B + (x0 + x2) - vm1.
+
+     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
+     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
+     (already in place, except for carry propagation).
+
+     We thus add
+
+   B^3  B^2   B    1
+    |    |    |    |
+   +-----+----+
+ + |  x0 + x2 |
+   +----+-----+----+
+ +      |  x0 + x2 |
+	+----------+
+ -      |  vm1     |
+ --+----++----+----+-
+   | y2  | y1 | y0 |
+   +-----+----+----+
+
+  Since we store y0 at the same location as the low half of x0 + x2, we
+  need to do the middle sum first. */
+
+  hi = vm1[2*n];
+  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
+  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);
+
+  /* FIXME: Can we get rid of this second vm1_neg conditional by
+     swapping the location of +1 and -1 values? */
+  if (vm1_neg)
+    {
+      cy = mpn_add_n (v1, v1, vm1, n);
+      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
+      MPN_INCR_U (v1 + n, n+1, hi);
+    }
+  else
+    {
+      cy = mpn_sub_n (v1, v1, vm1, n);
+      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
+      MPN_DECR_U (v1 + n, n+1, hi);
+    }
+
+  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
+  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
+  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
+  else        mpn_mul (pp+3*n, b1, t, a2, s);
+
+  /* Remaining interpolation.
+
+     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
+     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
+     = y0 B + y1 B^2 + y3 B^3 + Lx0 + H x0 B
+       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
+     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
+       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4
+
+	  B^4       B^3       B^2        B         1
+ |         |         |         |         |         |
+   +-------+                   +---------+---------+
+   |  Hx3  |                   | Hx0-Lx3 |    Lx0  |
+   +------+----------+---------+---------+---------+
+	  |    y2    |  y1     |   y0    |
+	  ++---------+---------+---------+
+	  -| Hx0-Lx3 | - Lx0   |
+	   +---------+---------+
+		      | - Hx3  |
+		      +--------+
+
+    We must take into account the carry from Hx0 - Lx3.
+  */
+
+  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
+  hi = scratch[2*n] + cy;
+
+  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
+  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);
+
+  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
+
+  /* FIXME: Is support for s + t == n needed? */
+  if (LIKELY (s + t > n))
+    {
+      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);
+
+      if (hi < 0)
+	MPN_DECR_U (pp + 4*n, s+t-n, -hi);
+      else
+	MPN_INCR_U (pp + 4*n, s+t-n, hi);
+    }
+  else
+    ASSERT (hi == 0);
+}
diff --git a/third_party/gmp/mpn/generic/toom33_mul.c b/third_party/gmp/mpn/generic/toom33_mul.c
new file mode 100644
index 0000000..8f49f42
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom33_mul.c
@@ -0,0 +1,315 @@
+/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
+   size.  Or more accurately, bn <= an < (3/2)bn.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Additional improvements by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -1, 0, +1, +2, +inf
+
+  <-s--><--n--><--n-->
+   ____ ______ ______
+  |_a2_|___a1_|___a0_|
+   |b2_|___b1_|___b0_|
+   <-t-><--n--><--n-->
+
+  v0  =  a0         * b0          #   A(0)*B(0)
+  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
+  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
+  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
+  vinf=          a2 *         b2  # A(inf)*B(inf)
+*/
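+
+/* For reference, one way to solve the resulting 5x5 linear system for
+   A(X)*B(X) = c0 + c1*X + c2*X^2 + c3*X^3 + c4*X^4 (a sketch only;
+   mpn_toom_interpolate_5pts performs an equivalent carry-aware sequence
+   of limb operations):
+
+     c0 = v0
+     c4 = vinf
+     c2 = (v1 + vm1)/2 - c0 - c4
+     t  = (v1 - vm1)/2                          (= c1 + c3)
+     c3 = ((v2 - c0 - 4*c2 - 16*c4)/2 - t) / 3
+     c1 = t - c3
+
+   All divisions here are exact.  */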
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_mul_basecase 1
+#define MAYBE_mul_toom33   1
+#else
+#define MAYBE_mul_basecase						\
+  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
+#define MAYBE_mul_toom33						\
+  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
+#endif
+
+/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
+   multiplication at the infinity point. We may have
+   MAYBE_mul_basecase == 0, and still get s just below
+   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
+   s == 1 and mpn_toom22_mul will crash.
+*/
+
+#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
+  do {									\
+    if (MAYBE_mul_basecase						\
+	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
+      mpn_mul_basecase (p, a, n, b, n);					\
+    else if (! MAYBE_mul_toom33						\
+	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
+      mpn_toom22_mul (p, a, n, b, n, ws);				\
+    else								\
+      mpn_toom33_mul (p, a, n, b, n, ws);				\
+  } while (0)
+
+void
+mpn_toom33_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  const int __gmpn_cpuvec_initialized = 1;
+  mp_size_t n, s, t;
+  int vm1_neg;
+  mp_limb_t cy, vinf0;
+  mp_ptr gp;
+  mp_ptr as1, asm1, as2;
+  mp_ptr bs1, bsm1, bs2;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define b0  bp
+#define b1  (bp + n)
+#define b2  (bp + 2*n)
+
+  n = (an + 2) / (size_t) 3;
+
+  s = an - 2 * n;
+  t = bn - 2 * n;
+
+  ASSERT (an >= bn);
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  as1  = scratch + 4 * n + 4;
+  asm1 = scratch + 2 * n + 2;
+  as2 = pp + n + 1;
+
+  bs1 = pp;
+  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
+  bs2 = pp + 2 * n + 2;
+
+  gp = scratch;
+
+  vm1_neg = 0;
+
+  /* Compute as1 and asm1.  */
+  cy = mpn_add (gp, a0, n, a2, s);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
+    {
+      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
+      as1[n] = cy >> 1;
+      asm1[n] = 0;
+      vm1_neg = 1;
+    }
+  else
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
+      as1[n] = cy + (cy2 >> 1);
+      asm1[n] = cy - (cy2 & 1);
+    }
+#else
+  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
+  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
+    {
+      mpn_sub_n (asm1, a1, gp, n);
+      asm1[n] = 0;
+      vm1_neg = 1;
+    }
+  else
+    {
+      cy -= mpn_sub_n (asm1, gp, a1, n);
+      asm1[n] = cy;
+    }
+#endif
+
+  /* Compute as2.  */
+#if HAVE_NATIVE_mpn_rsblsh1_n
+  cy = mpn_add_n (as2, a2, as1, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
+  cy += as1[n];
+  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
+#else
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy  = mpn_addlsh1_n (as2, a1, a2, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
+  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+  cy = mpn_add_n (as2, a2, as1, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
+  cy += as1[n];
+  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+  cy -= mpn_sub_n (as2, as2, a0, n);
+#endif
+#endif
+  as2[n] = cy;
+
+  /* Compute bs1 and bsm1.  */
+  cy = mpn_add (gp, b0, n, b2, t);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
+    {
+      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
+      bs1[n] = cy >> 1;
+      bsm1[n] = 0;
+      vm1_neg ^= 1;
+    }
+  else
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
+      bs1[n] = cy + (cy2 >> 1);
+      bsm1[n] = cy - (cy2 & 1);
+    }
+#else
+  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
+  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
+    {
+      mpn_sub_n (bsm1, b1, gp, n);
+      bsm1[n] = 0;
+      vm1_neg ^= 1;
+    }
+  else
+    {
+      cy -= mpn_sub_n (bsm1, gp, b1, n);
+      bsm1[n] = cy;
+    }
+#endif
+
+  /* Compute bs2.  */
+#if HAVE_NATIVE_mpn_rsblsh1_n
+  cy = mpn_add_n (bs2, b2, bs1, t);
+  if (t != n)
+    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
+  cy += bs1[n];
+  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
+#else
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
+  if (t != n)
+    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
+  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
+#else
+  cy  = mpn_add_n (bs2, bs1, b2, t);
+  if (t != n)
+    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
+  cy += bs1[n];
+  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
+  cy -= mpn_sub_n (bs2, bs2, b0, n);
+#endif
+#endif
+  bs2[n] = cy;
+
+  ASSERT (as1[n] <= 2);
+  ASSERT (bs1[n] <= 2);
+  ASSERT (asm1[n] <= 1);
+  ASSERT (bsm1[n] <= 1);
+  ASSERT (as2[n] <= 6);
+  ASSERT (bs2[n] <= 6);
+
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 4 * n)			/* s+t */
+#define vm1   scratch				/* 2n+1 */
+#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
+#define scratch_out  (scratch + 5 * n + 5)
+
+  /* vm1, 2n+1 limbs */
+#ifdef SMALLER_RECURSION
+  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
+  cy = 0;
+  if (asm1[n] != 0)
+    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
+  if (bsm1[n] != 0)
+    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
+  vm1[2 * n] = cy;
+#else
+  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
+#endif
+
+  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
+
+  /* vinf, s+t limbs */
+  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
+  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
+
+  vinf0 = vinf[0];				/* v1 overlaps with this */
+
+#ifdef SMALLER_RECURSION
+  /* v1, 2n+1 limbs */
+  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
+  if (as1[n] == 1)
+    {
+      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
+    }
+  else if (as1[n] != 0)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
+#else
+      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
+#endif
+    }
+  else
+    cy = 0;
+  if (bs1[n] == 1)
+    {
+      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
+    }
+  else if (bs1[n] != 0)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
+#else
+      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
+#endif
+    }
+  v1[2 * n] = cy;
+#else
+  cy = vinf[1];
+  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
+  vinf[1] = cy;
+#endif
+
+  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */
+
+  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
+}
diff --git a/third_party/gmp/mpn/generic/toom3_sqr.c b/third_party/gmp/mpn/generic/toom3_sqr.c
new file mode 100644
index 0000000..7be15bf
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom3_sqr.c
@@ -0,0 +1,225 @@
+/* mpn_toom3_sqr -- Square {ap,an}.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Additional improvements by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -1, 0, +1, +2, +inf
+
+  <-s--><--n--><--n-->
+   ____ ______ ______
+  |_a2_|___a1_|___a0_|
+
+  v0  =  a0         ^2 #   A(0)^2
+  v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
+  vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
+  v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
+  vinf=          a2 ^2 # A(inf)^2
+*/
+
+#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
+#define MAYBE_sqr_basecase 1
+#define MAYBE_sqr_toom3   1
+#else
+#define MAYBE_sqr_basecase						\
+  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_toom3							\
+  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
+#endif
+
+#define TOOM3_SQR_REC(p, a, n, ws)					\
+  do {									\
+    if (MAYBE_sqr_basecase						\
+	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
+      mpn_sqr_basecase (p, a, n);					\
+    else if (! MAYBE_sqr_toom3						\
+	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
+      mpn_toom2_sqr (p, a, n, ws);					\
+    else								\
+      mpn_toom3_sqr (p, a, n, ws);					\
+  } while (0)
+
+void
+mpn_toom3_sqr (mp_ptr pp,
+	       mp_srcptr ap, mp_size_t an,
+	       mp_ptr scratch)
+{
+  const int __gmpn_cpuvec_initialized = 1;
+  mp_size_t n, s;
+  mp_limb_t cy, vinf0;
+  mp_ptr gp;
+  mp_ptr as1, asm1, as2;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+
+  n = (an + 2) / (size_t) 3;
+
+  s = an - 2 * n;
+
+  ASSERT (0 < s && s <= n);
+
+  as1 = scratch + 4 * n + 4;
+  asm1 = scratch + 2 * n + 2;
+  as2 = pp + n + 1;
+
+  gp = scratch;
+
+  /* Compute as1 and asm1.  */
+  cy = mpn_add (gp, a0, n, a2, s);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
+    {
+      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
+      as1[n] = cy >> 1;
+      asm1[n] = 0;
+    }
+  else
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
+      as1[n] = cy + (cy2 >> 1);
+      asm1[n] = cy - (cy2 & 1);
+    }
+#else
+  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
+  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
+    {
+      mpn_sub_n (asm1, a1, gp, n);
+      asm1[n] = 0;
+    }
+  else
+    {
+      cy -= mpn_sub_n (asm1, gp, a1, n);
+      asm1[n] = cy;
+    }
+#endif
+
+  /* Compute as2.  */
+#if HAVE_NATIVE_mpn_rsblsh1_n
+  cy = mpn_add_n (as2, a2, as1, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
+  cy += as1[n];
+  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
+#else
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy  = mpn_addlsh1_n (as2, a1, a2, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
+  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+  cy = mpn_add_n (as2, a2, as1, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
+  cy += as1[n];
+  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+  cy -= mpn_sub_n (as2, as2, a0, n);
+#endif
+#endif
+  as2[n] = cy;
+
+  ASSERT (as1[n] <= 2);
+  ASSERT (asm1[n] <= 1);
+
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 4 * n)			/* s+s */
+#define vm1   scratch				/* 2n+1 */
+#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
+#define scratch_out  (scratch + 5 * n + 5)
+
+  /* vm1, 2n+1 limbs */
+#ifdef SMALLER_RECURSION
+  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
+  cy = 0;
+  if (asm1[n] != 0)
+    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
+  if (asm1[n] != 0)
+    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
+  vm1[2 * n] = cy;
+#else
+  TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
+#endif
+
+  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
+
+  TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */
+
+  vinf0 = vinf[0];				/* v1 overlaps with this */
+
+#ifdef SMALLER_RECURSION
+  /* v1, 2n+1 limbs */
+  TOOM3_SQR_REC (v1, as1, n, scratch_out);
+  if (as1[n] == 1)
+    {
+      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
+    }
+  else if (as1[n] != 0)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy = 2 * as1[n] + mpn_addlsh1_n_ip1 (v1 + n, as1, n);
+#else
+      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
+#endif
+    }
+  else
+    cy = 0;
+  if (as1[n] == 1)
+    {
+      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
+    }
+  else if (as1[n] != 0)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
+#else
+      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
+#endif
+    }
+  v1[2 * n] = cy;
+#else
+  cy = vinf[1];
+  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
+  vinf[1] = cy;
+#endif
+
+  TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */
+
+  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
+}
diff --git a/third_party/gmp/mpn/generic/toom42_mul.c b/third_party/gmp/mpn/generic/toom42_mul.c
new file mode 100644
index 0000000..2dfba9b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom42_mul.c
@@ -0,0 +1,234 @@
+/* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
+   as large as bn.  Or more accurately, (3/2)bn < an < 4bn.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Additional improvements by Marco Bodrato.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -1, 0, +1, +2, +inf
+
+  <-s-><--n--><--n--><--n-->
+   ___ ______ ______ ______
+  |a3_|___a2_|___a1_|___a0_|
+	       |_b1_|___b0_|
+	       <-t--><--n-->
+
+  v0  =  a0             * b0      #   A(0)*B(0)
+  v1  = (a0+ a1+ a2+ a3)*(b0+ b1) #   A(1)*B(1)      ah  <= 3  bh <= 1
+  vm1 = (a0- a1+ a2- a3)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
+  v2  = (a0+2a1+4a2+8a3)*(b0+2b1) #   A(2)*B(2)      ah  <= 14 bh <= 2
+  vinf=              a3 *     b1  # A(inf)*B(inf)
+*/
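+
+/* The "ah"/"bh" annotations above bound the extra (n-th) limb of each
+   evaluated operand: as1 = a0+a1+a2+a3 is a sum of four n-limb values,
+   so its carry limb is at most 3, while the alternating sum asm1 fits in
+   n limbs up to sign, so its top limb is at most 1.  The ASSERTs further
+   down check exactly these bounds.  */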
+
+#define TOOM42_MUL_N_REC(p, a, b, n, ws)				\
+  do {									\
+    mpn_mul_n (p, a, b, n);						\
+  } while (0)
+
+void
+mpn_toom42_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int vm1_neg;
+  mp_limb_t cy, vinf0;
+  mp_ptr a0_a2;
+  mp_ptr as1, asm1, as2;
+  mp_ptr bs1, bsm1, bs2;
+  mp_ptr tmp;
+  TMP_DECL;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define a3  (ap + 3*n)
+#define b0  bp
+#define b1  (bp + n)
+
+  n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1;
+
+  s = an - 3 * n;
+  t = bn - n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  TMP_MARK;
+
+  tmp = TMP_ALLOC_LIMBS (6 * n + 5);
+  as1  = tmp; tmp += n + 1;
+  asm1 = tmp; tmp += n + 1;
+  as2  = tmp; tmp += n + 1;
+  bs1  = tmp; tmp += n + 1;
+  bsm1 = tmp; tmp += n;
+  bs2  = tmp; tmp += n + 1;
+
+  a0_a2 = pp;
+
+  /* Compute as1 and asm1.  */
+  vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1;
+
+  /* Compute as2.  */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy  = mpn_addlsh1_n (as2, a2, a3, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
+  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
+  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
+#else
+  cy  = mpn_lshift (as2, a3, s, 1);
+  cy += mpn_add_n (as2, a2, as2, s);
+  if (s != n)
+    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
+  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+  cy += mpn_add_n (as2, a1, as2, n);
+  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
+  cy += mpn_add_n (as2, a0, as2, n);
+#endif
+  as2[n] = cy;
+
+  /* Compute bs1 and bsm1.  */
+  if (t == n)
+    {
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
+	}
+      bs1[n] = cy >> 1;
+#else
+      bs1[n] = mpn_add_n (bs1, b0, b1, n);
+
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, n);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  mpn_sub_n (bsm1, b0, b1, n);
+	}
+#endif
+    }
+  else
+    {
+      bs1[n] = mpn_add (bs1, b0, n, b1, t);
+
+      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, t);
+	  MPN_ZERO (bsm1 + t, n - t);
+	  vm1_neg ^= 1;
+	}
+      else
+	{
+	  mpn_sub (bsm1, b0, n, b1, t);
+	}
+    }
+
+  /* Compute bs2, recycling bs1. bs2=bs1+b1  */
+  mpn_add (bs2, bs1, n + 1, b1, t);
+
+  ASSERT (as1[n] <= 3);
+  ASSERT (bs1[n] <= 1);
+  ASSERT (asm1[n] <= 1);
+/*ASSERT (bsm1[n] == 0);*/
+  ASSERT (as2[n] <= 14);
+  ASSERT (bs2[n] <= 2);
+
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 4 * n)			/* s+t */
+#define vm1   scratch				/* 2n+1 */
+#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
+#define scratch_out	scratch + 4 * n + 4	/* Currently unused. */
+
+  /* vm1, 2n+1 limbs */
+  TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
+  cy = 0;
+  if (asm1[n] != 0)
+    cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
+  vm1[2 * n] = cy;
+
+  TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
+
+  /* vinf, s+t limbs */
+  if (s > t)  mpn_mul (vinf, a3, s, b1, t);
+  else        mpn_mul (vinf, b1, t, a3, s);
+
+  vinf0 = vinf[0];				/* v1 overlaps with this */
+
+  /* v1, 2n+1 limbs */
+  TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out);
+  if (as1[n] == 1)
+    {
+      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
+    }
+  else if (as1[n] == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n
+      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
+#else
+      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
+#endif
+    }
+  else if (as1[n] == 3)
+    {
+      cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3));
+    }
+  else
+    cy = 0;
+  if (bs1[n] != 0)
+    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
+  v1[2 * n] = cy;
+
+  TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */
+
+  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/toom42_mulmid.c b/third_party/gmp/mpn/generic/toom42_mulmid.c
new file mode 100644
index 0000000..f581b10
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom42_mulmid.c
@@ -0,0 +1,237 @@
+/* mpn_toom42_mulmid -- toom42 middle product
+
+   Contributed by David Harvey.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+
+/*
+  Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
+
+  Neither ap nor bp may overlap rp.
+
+  Must have n >= 4.
+
+  Amount of scratch space required is given by mpn_toom42_mulmid_itch().
+
+  FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
+  requirements should be clarified.
+*/
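+
+/* For orientation, a naive reference for the same operation (a sketch
+   under the conventions above, not the algorithm used here): the middle
+   product keeps the diagonals i+j = n-1 .. 2n-2 of the full product,
+   i.e. {rp,n+2} is the carry-correct evaluation of
+
+     for (k = 0; k < n; k++)
+       for (j = 0; j < n; j++)
+         add the two-limb product ap[k+j] * bp[n-1-j] at limb position k;
+
+   with the two extra output limbs collecting the carries.  The code below
+   gets the same result from three half-size middle products plus O(n)
+   correction terms, mirroring the Karatsuba/toom22 trade.  */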
+void
+mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
+                   mp_ptr scratch)
+{
+  mp_limb_t cy, e[12], zh, zl;
+  mp_size_t m;
+  int neg;
+
+  ASSERT (n >= 4);
+  ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
+  ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
+
+  ap += n & 1;   /* handle odd row and diagonal later */
+  m = n / 2;
+
+  /* (e0h:e0l) etc are correction terms, in 2's complement */
+#define e0l (e[0])
+#define e0h (e[1])
+#define e1l (e[2])
+#define e1h (e[3])
+#define e2l (e[4])
+#define e2h (e[5])
+#define e3l (e[6])
+#define e3h (e[7])
+#define e4l (e[8])
+#define e4h (e[9])
+#define e5l (e[10])
+#define e5h (e[11])
+
+#define s (scratch + 2)
+#define t (rp + m + 2)
+#define p0 rp
+#define p1 scratch
+#define p2 (rp + m)
+#define next_scratch (scratch + 3*m + 1)
+
+  /*
+            rp                            scratch
+  |---------|-----------|    |---------|---------|----------|
+  0         m         2m+2   0         m         2m        3m+1
+            <----p2---->       <-------------s------------->
+  <----p0----><---t---->     <----p1---->
+  */
+
+  /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
+  cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
+  cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
+		       bp + m, bp, m, cy);
+  mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
+
+  /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
+  if (mpn_cmp (bp + m, bp, m) < 0)
+    {
+      ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
+				      ap + m - 1, ap + 2*m - 1, m, 0));
+      neg = 1;
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
+				      ap + m - 1, ap + 2*m - 1, m, 0));
+      neg = 0;
+    }
+
+  /* recursive middle products. The picture is:
+
+      b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
+      ...       -   A   A   A   B   B   B   -   -   -   -
+      b[m]      -   -   A   A   A   B   B   B   -   -   -
+      b[m-1]    -   -   -   C   C   C   D   D   D   -   -
+      ...       -   -   -   -   C   C   C   D   D   D   -
+      b[0]      -   -   -   -   -   C   C   C   D   D   D
+               a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
+  */
+
+  if (m < MULMID_TOOM42_THRESHOLD)
+    {
+      /* A + B */
+      mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
+      /* accumulate high limbs of p0 into e1 */
+      ADDC_LIMB (cy, e1l, e1l, p0[m]);
+      e1h += p0[m + 1] + cy;
+      /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
+      mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
+      /* C + D   (overwrites t) */
+      mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
+    }
+  else
+    {
+      /* as above, but use toom42 instead */
+      mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
+      ADDC_LIMB (cy, e1l, e1l, p0[m]);
+      e1h += p0[m + 1] + cy;
+      mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
+      mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
+    }
+
+  /* apply error terms */
+
+  /* -e0 at rp[0] */
+  SUBC_LIMB (cy, rp[0], rp[0], e0l);
+  SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
+  if (UNLIKELY (cy))
+    {
+      cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
+      SUBC_LIMB (cy, e1l, e1l, cy);
+      e1h -= cy;
+    }
+
+  /* z = e1 - e2 + high(p0) */
+  SUBC_LIMB (cy, zl, e1l, e2l);
+  zh = e1h - e2h - cy;
+
+  /* z at rp[m] */
+  ADDC_LIMB (cy, rp[m], rp[m], zl);
+  zh = (zh + cy) & GMP_NUMB_MASK;
+  ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
+  cy -= (zh >> (GMP_NUMB_BITS - 1));
+  if (UNLIKELY (cy))
+    {
+      if (cy == 1)
+	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
+      else /* cy == -1 */
+	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
+    }
+
+  /* e3 at rp[2*m] */
+  ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
+  rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
+
+  /* e4 at p1[0] */
+  ADDC_LIMB (cy, p1[0], p1[0], e4l);
+  ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
+  if (UNLIKELY (cy))
+    mpn_add_1 (p1 + 2, p1 + 2, m, 1);
+
+  /* -e5 at p1[m] */
+  SUBC_LIMB (cy, p1[m], p1[m], e5l);
+  p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
+
+  /* adjustment if p1 ends up negative */
+  cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
+
+  /* add (-1)^neg * (p1 - B^m * p1) to output */
+  if (neg)
+    {
+      mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
+      mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
+      mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
+    }
+  else
+    {
+      mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
+      mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
+      mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
+    }
+
+  /* odd row and diagonal */
+  if (n & 1)
+    {
+      /*
+        Products marked E are already done. We need to do products marked O.
+
+        OOOOO----
+        -EEEEO---
+        --EEEEO--
+        ---EEEEO-
+        ----EEEEO
+       */
+
+      /* first row of O's */
+      cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
+      ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
+
+      /* O's on diagonal */
+      /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
+         that can handle the sum below. Currently we're relying on
+         mulmid_basecase being pretty fast for a diagonal sum like this,
+	 which is true at least for the K8 asm version, but surely false
+	 for the generic version. */
+      mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
+      mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
+    }
+}
diff --git a/third_party/gmp/mpn/generic/toom43_mul.c b/third_party/gmp/mpn/generic/toom43_mul.c
new file mode 100644
index 0000000..0650138
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom43_mul.c
@@ -0,0 +1,233 @@
+/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
+   times as large as bn.  Or more accurately, bn < an < 2 bn.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -2, -1, 0, +1, +2, +inf
+
+  <-s-><--n--><--n--><--n-->
+   ___ ______ ______ ______
+  |a3_|___a2_|___a1_|___a0_|
+	|_b2_|___b1_|___b0_|
+	<-t--><--n--><--n-->
+
+  v0  =  a0             * b0          #   A(0)*B(0)
+  v1  = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 3  bh <= 2
+  vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh|<= 1
+  v2  = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 14 bh <= 6
+  vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) #  A(-2)*B(-2)    |ah| <= 9 |bh|<= 4
+  vinf=              a3 *         b2  # A(inf)*B(inf)
+*/
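+
+/* Degree count: A(X) has degree 3 and B(X) degree 2, so the product has
+   degree 5 and six unknown coefficients, which is why exactly six
+   evaluation points are taken before mpn_toom_interpolate_6pts recovers
+   the coefficients.  */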
+
+void
+mpn_toom43_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  enum toom6_flags flags;
+  mp_limb_t cy;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2 * n)
+#define a3  (ap + 3 * n)
+#define b0  bp
+#define b1  (bp + n)
+#define b2  (bp + 2 * n)
+
+  n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
+
+  s = an - 3 * n;
+  t = bn - 2 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  /* This is true whenever an >= 25 or bn >= 19, I think. It
+     guarantees that we can fit 5 values of size n+1 in the product
+     area. */
+  ASSERT (s+t >= 5);
+
+#define v0    pp				/* 2n */
+#define vm1   (scratch)				/* 2n+1 */
+#define v1    (pp + 2*n)			/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vinf  (pp + 5 * n)			/* s+t */
+#define bs1    pp				/* n+1 */
+#define bsm1  (scratch + 2 * n + 2)		/* n+1 */
+#define asm1  (scratch + 3 * n + 3)		/* n+1 */
+#define asm2  (scratch + 4 * n + 4)		/* n+1 */
+#define bsm2  (pp + n + 1)			/* n+1 */
+#define bs2   (pp + 2 * n + 2)			/* n+1 */
+#define as2   (pp + 3 * n + 3)			/* n+1 */
+#define as1   (pp + 4 * n + 4)			/* n+1 */
+
+  /* Total scratch need is 6 * n + 3 + 1; we allocate one extra
+     limb, because products will overwrite 2n+2 limbs. */
+
+#define a0a2  scratch
+#define b0b2  scratch
+#define a1a3  asm1
+#define b1d   bsm1
+
+  /* Compute as2 and asm2.  */
+  flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
+
+  /* Compute bs2 and bsm2.  */
+  b1d[n] = mpn_lshift (b1d, b1, n, 1);			/*       2b1      */
+  cy  = mpn_lshift (b0b2, b2, t, 2);			/*  4b2           */
+  cy += mpn_add_n (b0b2, b0b2, b0, t);			/*  4b2      + b0 */
+  if (t != n)
+    cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
+  b0b2[n] = cy;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (mpn_cmp (b0b2, b1d, n+1) < 0)
+    {
+      mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
+      flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
+    }
+  else
+    {
+      mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
+    }
+#else
+  mpn_add_n (bs2, b0b2, b1d, n+1);
+  if (mpn_cmp (b0b2, b1d, n+1) < 0)
+    {
+      mpn_sub_n (bsm2, b1d, b0b2, n+1);
+      flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
+    }
+  else
+    {
+      mpn_sub_n (bsm2, b0b2, b1d, n+1);
+    }
+#endif
+
+  /* Compute as1 and asm1.  */
+  flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2)));
+
+  /* Compute bs1 and bsm1.  */
+  bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
+    {
+      cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
+      bs1[n] = cy >> 1;
+      flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+    }
+  else
+    {
+      cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
+      bs1[n] = bsm1[n] + (cy >> 1);
+      bsm1[n]-= cy & 1;
+    }
+#else
+  bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
+  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
+    {
+      mpn_sub_n (bsm1, b1, bsm1, n);
+      flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+    }
+  else
+    {
+      bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
+    }
+#endif
+
+  ASSERT (as1[n] <= 3);
+  ASSERT (bs1[n] <= 2);
+  ASSERT (asm1[n] <= 1);
+  ASSERT (bsm1[n] <= 1);
+  ASSERT (as2[n] <=14);
+  ASSERT (bs2[n] <= 6);
+  ASSERT (asm2[n] <= 9);
+  ASSERT (bsm2[n] <= 4);
+
+  /* vm1, 2n+1 limbs */
+  mpn_mul_n (vm1, asm1, bsm1, n+1);  /* W4 */
+
+  /* vm2, 2n+1 limbs */
+  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */
+
+  /* v2, 2n+1 limbs */
+  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */
+
+  /* v1, 2n+1 limbs */
+  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */
+
+  /* vinf, s+t limbs */   /* W0 */
+  if (s > t)  mpn_mul (vinf, a3, s, b2, t);
+  else        mpn_mul (vinf, b2, t, a3, s);
+
+  /* v0, 2n limbs */
+  mpn_mul_n (v0, ap, bp, n);  /* W5 */
+
+  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
+
+#undef v0
+#undef vm1
+#undef v1
+#undef vm2
+#undef v2
+#undef vinf
+#undef bs1
+#undef bs2
+#undef bsm1
+#undef bsm2
+#undef asm1
+#undef asm2
+/* #undef as1 */
+/* #undef as2 */
+#undef a0a2
+#undef b0b2
+#undef a1a3
+#undef b1d
+#undef a0
+#undef a1
+#undef a2
+#undef a3
+#undef b0
+#undef b1
+#undef b2
+}
diff --git a/third_party/gmp/mpn/generic/toom44_mul.c b/third_party/gmp/mpn/generic/toom44_mul.c
new file mode 100644
index 0000000..77d5083
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom44_mul.c
@@ -0,0 +1,235 @@
+/* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
+   size.  Or more accurately, bn <= an < (4/3)bn.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2008, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
+
+  <-s--><--n--><--n--><--n-->
+   ____ ______ ______ ______
+  |_a3_|___a2_|___a1_|___a0_|
+   |b3_|___b2_|___b1_|___b0_|
+   <-t-><--n--><--n--><--n-->
+
+  v0  =   a0             *  b0              #    A(0)*B(0)
+  v1  = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) #    A(1)*B(1)      ah  <= 3   bh  <= 3
+  vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) #   A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
+  v2  = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) #    A(2)*B(2)      ah  <= 14  bh  <= 14
+  vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) #   A(-2)*B(-2)   |ah| <= 9  |bh| <= 9
+  vh  = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) #  A(1/2)*B(1/2)    ah  <= 14  bh  <= 14
+  vinf=               a3 *          b3      #  A(inf)*B(inf)
+*/
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_mul_basecase 1
+#define MAYBE_mul_toom22   1
+#define MAYBE_mul_toom44   1
+#else
+#define MAYBE_mul_basecase						\
+  (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
+#define MAYBE_mul_toom22						\
+  (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
+#define MAYBE_mul_toom44						\
+  (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
+#endif
+
+#define TOOM44_MUL_N_REC(p, a, b, n, ws)				\
+  do {									\
+    if (MAYBE_mul_basecase						\
+	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
+      mpn_mul_basecase (p, a, n, b, n);					\
+    else if (MAYBE_mul_toom22						\
+	     && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
+      mpn_toom22_mul (p, a, n, b, n, ws);				\
+    else if (! MAYBE_mul_toom44						\
+	     || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))		\
+      mpn_toom33_mul (p, a, n, b, n, ws);				\
+    else								\
+      mpn_toom44_mul (p, a, n, b, n, ws);				\
+  } while (0)
+
+/* Use of scratch space. In the product area, we store
+
+      ___________________
+     |vinf|____|_v1_|_v0_|
+      s+t  2n-1 2n+1  2n
+
+   The other recursive products, vm1, v2, vm2, vh are stored in the
+   scratch area. When computing them, we use the product area for
+   intermediate values.
+
+   Next, we compute v1. We can store the intermediate factors at v0
+   and at vh + 2n + 2.
+
+   Finally, for v0 and vinf, factors are parts of the input operands,
+   and we need scratch space only for the recursive multiplication.
+
+   In all, if S(an) is the scratch need, the needed space is bounded by
+
+     S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)
+
+   which should give S(n) = 8 n/3 + c log(n) for some constant c.
+*/
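+
+/* Worked instance of the bound (illustrative): an = 400 gives n = 100, so
+   the top level needs 4*(2*100 + 1) + 1 = 805 limbs -- exactly the
+   8*n + 5 layout defined below -- plus S(101) for the recursive calls. */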
+
+void
+mpn_toom44_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  mp_limb_t cy;
+  enum toom7_flags flags;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define a3  (ap + 3*n)
+#define b0  bp
+#define b1  (bp + n)
+#define b2  (bp + 2*n)
+#define b3  (bp + 3*n)
+
+  ASSERT (an >= bn);
+
+  n = (an + 3) >> 2;
+
+  s = an - 3 * n;
+  t = bn - 3 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  ASSERT (s >= t);
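+
+  /* Example (illustrative): an = 14, bn = 13 give n = (14+3)>>2 = 4,
+     s = 14 - 12 = 2 and t = 13 - 12 = 1, satisfying the asserts above:
+     three full n-limb parts plus a short top part per operand. */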
+
+  /* NOTE: The multiplications into v2, vm2, vh and vm1 overwrite the
+   * following limb, so these must be computed in order, and we need a
+   * one-limb gap before tp. */
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 6 * n)			/* s+t */
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
+#define tp (scratch + 8*n + 5)
+
+  /* apx and bpx must not overlap with v1 */
+#define apx   pp				/* n+1 */
+#define amx   (pp + n + 1)			/* n+1 */
+#define bmx   (pp + 2*n + 2)			/* n+1 */
+#define bpx   (pp + 4*n + 2)			/* n+1 */
+
+  /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
+     gives roughly 32 n/3 + log term. */
+
+  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
+  flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp));
+
+  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
+  flags = (enum toom7_flags) (flags ^ (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp)));
+
+  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2,  2n+1 limbs */
+  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2,  2n+1 limbs */
+
+  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (apx, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
+  if (s < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
+      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
+      MPN_INCR_U (apx + s, n+1-s, cy2);
+    }
+  else
+    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+#else
+  cy = mpn_lshift (apx, a0, n, 1);
+  cy += mpn_add_n (apx, apx, a1, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  cy += mpn_add_n (apx, apx, a2, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+#endif
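+
+  /* Sanity check of the Horner form above (illustrative numbers):
+     a0,a1,a2,a3 = 1,2,3,4 gives 8*1 + 4*2 + 2*3 + 4 = 26, and
+     ((2*1 + 2)*2 + 3)*2 + 4 = 26 as well. */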
+
+  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = ((2*b0 + b1) * 2 + b2) * 2 + b3 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (bpx, b1, b0, n);
+  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
+  if (t < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
+      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
+      MPN_INCR_U (bpx + t, n+1-t, cy2);
+    }
+  else
+    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
+#else
+  cy = mpn_lshift (bpx, b0, n, 1);
+  cy += mpn_add_n (bpx, bpx, b1, n);
+  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
+  cy += mpn_add_n (bpx, bpx, b2, n);
+  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
+  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
+#endif
+
+  ASSERT (apx[n] < 15);
+  ASSERT (bpx[n] < 15);
+
+  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh,  2n+1 limbs */
+
+  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
+  flags = (enum toom7_flags) (flags | (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp)));
+
+  /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3.  */
+  flags = (enum toom7_flags) (flags ^ (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp)));
+
+  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1,  2n+1 limbs */
+  /* Clobbers amx, bmx. */
+  TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1,  2n+1 limbs */
+
+  TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
+  if (s > t)
+    mpn_mul (vinf, a3, s, b3, t);
+  else
+    TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */
+
+  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
+}
diff --git a/third_party/gmp/mpn/generic/toom4_sqr.c b/third_party/gmp/mpn/generic/toom4_sqr.c
new file mode 100644
index 0000000..aec84c1
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom4_sqr.c
@@ -0,0 +1,163 @@
+/* mpn_toom4_sqr -- Square {ap,an}.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2010, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
+
+  <-s--><--n--><--n--><--n-->
+   ____ ______ ______ ______
+  |_a3_|___a2_|___a1_|___a0_|
+
+  v0  =   a0             ^2 #    A(0)^2
+  v1  = ( a0+ a1+ a2+ a3)^2 #    A(1)^2   ah  <= 3
+  vm1 = ( a0- a1+ a2- a3)^2 #   A(-1)^2  |ah| <= 1
+  v2  = ( a0+2a1+4a2+8a3)^2 #    A(2)^2   ah  <= 14
+  vm2 = ( a0-2a1+4a2-8a3)^2 #   A(-2)^2  |ah| <= 9
+  vh  = (8a0+4a1+2a2+ a3)^2 #  A(1/2)^2   ah  <= 14
+  vinf=               a3 ^2 #  A(inf)^2
+*/
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_sqr_basecase 1
+#define MAYBE_sqr_toom2   1
+#define MAYBE_sqr_toom4   1
+#else
+#define MAYBE_sqr_basecase						\
+  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_toom2							\
+  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
+#define MAYBE_sqr_toom4							\
+  (SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
+#endif
+
+#define TOOM4_SQR_REC(p, a, n, ws)					\
+  do {									\
+    if (MAYBE_sqr_basecase						\
+	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
+      mpn_sqr_basecase (p, a, n);					\
+    else if (MAYBE_sqr_toom2						\
+	     && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
+      mpn_toom2_sqr (p, a, n, ws);					\
+    else if (! MAYBE_sqr_toom4						\
+	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))		\
+      mpn_toom3_sqr (p, a, n, ws);					\
+    else								\
+      mpn_toom4_sqr (p, a, n, ws);					\
+  } while (0)
+
+void
+mpn_toom4_sqr (mp_ptr pp,
+	       mp_srcptr ap, mp_size_t an,
+	       mp_ptr scratch)
+{
+  mp_size_t n, s;
+  mp_limb_t cy;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define a3  (ap + 3*n)
+
+  n = (an + 3) >> 2;
+
+  s = an - 3 * n;
+
+  ASSERT (0 < s && s <= n);
+
+  /* NOTE: The multiplications into v2, vm2, vh and vm1 overwrite the
+   * following limb, so these must be computed in order, and we need a
+   * one-limb gap before tp. */
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 6 * n)			/* 2s */
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
+#define tp (scratch + 8*n + 5)
+
+  /* No overlap with v1 */
+#define apx   pp				/* n+1 */
+#define amx   (pp + 4*n + 2)			/* n+1 */
+
+  /* Total scratch need: 8*n + 5 + scratch for recursive calls. This
+     gives roughly 32 n/3 + log term. */
+
+  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.  */
+  mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);
+
+  TOOM4_SQR_REC (v2, apx, n + 1, tp);	/* v2,  2n+1 limbs */
+  TOOM4_SQR_REC (vm2, amx, n + 1, tp);	/* vm2,  2n+1 limbs */
+
+  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3 */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (apx, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
+  if (s < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
+      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
+      MPN_INCR_U (apx + s, n+1-s, cy2);
+    }
+  else
+    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
+#else
+  cy = mpn_lshift (apx, a0, n, 1);
+  cy += mpn_add_n (apx, apx, a1, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  cy += mpn_add_n (apx, apx, a2, n);
+  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
+  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
+#endif
+
+  ASSERT (apx[n] < 15);
+
+  TOOM4_SQR_REC (vh, apx, n + 1, tp);	/* vh,  2n+1 limbs */
+
+  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
+  mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);
+
+  TOOM4_SQR_REC (v1, apx, n + 1, tp);	/* v1,  2n+1 limbs */
+  TOOM4_SQR_REC (vm1, amx, n + 1, tp);	/* vm1,  2n+1 limbs */
+
+  TOOM4_SQR_REC (v0, a0, n, tp);
+  TOOM4_SQR_REC (vinf, a3, s, tp);	/* vinf, 2s limbs */
+
+  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) 0, vm2, vm1, v2, vh, 2*s, tp);
+}
diff --git a/third_party/gmp/mpn/generic/toom52_mul.c b/third_party/gmp/mpn/generic/toom52_mul.c
new file mode 100644
index 0000000..974059b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom52_mul.c
@@ -0,0 +1,256 @@
+/* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/2
+   times as large as bn.  Or more accurately, 2bn < an < 5bn.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: -2, -1, 0, +1, +2, +inf
+
+  <-s-><--n--><--n--><--n--><--n-->
+   ___ ______ ______ ______ ______
+  |a4_|___a3_|___a2_|___a1_|___a0_|
+			|b1|___b0_|
+			<t-><--n-->
+
+  v0  =  a0                  * b0      #   A(0)*B(0)
+  v1  = (a0+ a1+ a2+ a3+  a4)*(b0+ b1) #   A(1)*B(1)      ah  <= 4   bh <= 1
+  vm1 = (a0- a1+ a2- a3+  a4)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
+  v2  = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) #   A(2)*B(2)      ah  <= 30  bh <= 2
+  vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) #  A(-2)*B(-2)    |ah| <= 20 |bh|<= 1
+  vinf=                   a4 *     b1  # A(inf)*B(inf)
+
+  Some slight optimizations in evaluation are taken from the paper:
+  "Towards Optimal Toom-Cook Multiplication for Univariate and
+  Multivariate Polynomials in Characteristic 2 and 0."
+*/
+
+void
+mpn_toom52_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  enum toom6_flags flags;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2 * n)
+#define a3  (ap + 3 * n)
+#define a4  (ap + 4 * n)
+#define b0  bp
+#define b1  (bp + n)
+
+  n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1);
+
+  s = an - 4 * n;
+  t = bn - n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  /* Ensures that 5 values of n+1 limbs each fit in the product area.
+     Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9,
+     n = 8. */
+  ASSERT (s+t >= 5);
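+
+  /* Checking the first borderline case (illustrative): an = 32, bn = 8
+     give n = 1 + (32-1)/5 = 7, s = 32 - 28 = 4 and t = 8 - 7 = 1, so
+     s + t = 5 holds with no slack. */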
+
+#define v0    pp				/* 2n */
+#define vm1   (scratch)				/* 2n+1 */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vinf  (pp + 5 * n)			/* s+t */
+#define bs1    pp				/* n+1 */
+#define bsm1  (scratch + 2 * n + 2)		/* n   */
+#define asm1  (scratch + 3 * n + 3)		/* n+1 */
+#define asm2  (scratch + 4 * n + 4)		/* n+1 */
+#define bsm2  (pp + n + 1)			/* n+1 */
+#define bs2   (pp + 2 * n + 2)			/* n+1 */
+#define as2   (pp + 3 * n + 3)			/* n+1 */
+#define as1   (pp + 4 * n + 4)			/* n+1 */
+
+  /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because
+     products will overwrite 2n+2 limbs. */
+
+#define a0a2  scratch
+#define a1a3  asm1
+
+  /* Compute as2 and asm2.  */
+  flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3));
+
+  /* Compute bs1 and bsm1.  */
+  if (t == n)
+    {
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      mp_limb_t cy;
+
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
+	  flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+	}
+      else
+	{
+	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
+	}
+      bs1[n] = cy >> 1;
+#else
+      bs1[n] = mpn_add_n (bs1, b0, b1, n);
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, n);
+	  flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+	}
+      else
+	{
+	  mpn_sub_n (bsm1, b0, b1, n);
+	}
+#endif
+    }
+  else
+    {
+      bs1[n] = mpn_add (bs1, b0, n, b1, t);
+      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, t);
+	  MPN_ZERO (bsm1 + t, n - t);
+	  flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
+	}
+      else
+	{
+	  mpn_sub (bsm1, b0, n, b1, t);
+	}
+    }
+
+  /* Compute bs2 and bsm2, recycling bs1 and bsm1. bs2=bs1+b1; bsm2=bsm1-b1  */
+  mpn_add (bs2, bs1, n+1, b1, t);
+  if (flags & toom6_vm1_neg)
+    {
+      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
+      flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
+    }
+  else
+    {
+      bsm2[n] = 0;
+      if (t == n)
+	{
+	  if (mpn_cmp (bsm1, b1, n) < 0)
+	    {
+	      mpn_sub_n (bsm2, b1, bsm1, n);
+	      flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
+	    }
+	  else
+	    {
+	      mpn_sub_n (bsm2, bsm1, b1, n);
+	    }
+	}
+      else
+	{
+	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
+	    {
+	      mpn_sub_n (bsm2, b1, bsm1, t);
+	      MPN_ZERO (bsm2 + t, n - t);
+	      flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
+	    }
+	  else
+	    {
+	      mpn_sub (bsm2, bsm1, n, b1, t);
+	    }
+	}
+    }
+
+  /* Compute as1 and asm1.  */
+  flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2)));
+
+  ASSERT (as1[n] <= 4);
+  ASSERT (bs1[n] <= 1);
+  ASSERT (asm1[n] <= 2);
+/*   ASSERT (bsm1[n] <= 1); */
+  ASSERT (as2[n] <=30);
+  ASSERT (bs2[n] <= 2);
+  ASSERT (asm2[n] <= 20);
+  ASSERT (bsm2[n] <= 1);
+
+  /* vm1, 2n+1 limbs */
+  mpn_mul (vm1, asm1, n+1, bsm1, n);  /* W4 */
+
+  /* vm2, 2n+1 limbs */
+  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */
+
+  /* v2, 2n+1 limbs */
+  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */
+
+  /* v1, 2n+1 limbs */
+  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */
+
+  /* vinf, s+t limbs */   /* W0 */
+  if (s > t)  mpn_mul (vinf, a4, s, b1, t);
+  else        mpn_mul (vinf, b1, t, a4, s);
+
+  /* v0, 2n limbs */
+  mpn_mul_n (v0, ap, bp, n);  /* W5 */
+
+  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
+
+#undef v0
+#undef vm1
+#undef v1
+#undef vm2
+#undef v2
+#undef vinf
+#undef bs1
+#undef bs2
+#undef bsm1
+#undef bsm2
+#undef asm1
+#undef asm2
+#undef as1
+#undef as2
+#undef a0a2
+#undef b0b2
+#undef a1a3
+#undef a0
+#undef a1
+#undef a2
+#undef a3
+#undef b0
+#undef b1
+#undef b2
+
+}
diff --git a/third_party/gmp/mpn/generic/toom53_mul.c b/third_party/gmp/mpn/generic/toom53_mul.c
new file mode 100644
index 0000000..c934297
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom53_mul.c
@@ -0,0 +1,331 @@
+/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3
+   times as large as bn.  Or more accurately, (4/3)bn < an < (5/2)bn.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2008, 2012, 2014, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf
+
+  <-s-><--n--><--n--><--n--><--n-->
+   ___ ______ ______ ______ ______
+  |a4_|___a3_|___a2_|___a1_|___a0_|
+	       |__b2|___b1_|___b0_|
+	       <-t--><--n--><--n-->
+
+  v0  =    a0                  *  b0          #    A(0)*B(0)
+  v1  = (  a0+ a1+ a2+ a3+  a4)*( b0+ b1+ b2) #    A(1)*B(1)      ah  <= 4   bh <= 2
+  vm1 = (  a0- a1+ a2- a3+  a4)*( b0- b1+ b2) #   A(-1)*B(-1)    |ah| <= 2   bh <= 1
+  v2  = (  a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) #    A(2)*B(2)      ah  <= 30  bh <= 6
+  vm2 = (  a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) #   A(-2)*B(-2)   -9<=ah<=20 -1<=bh<=4
+  vh  = (16a0+8a1+4a2+2a3+  a4)*(4b0+2b1+ b2) #  A(1/2)*B(1/2)    ah  <= 30  bh <= 6
+  vinf=                     a4 *          b2  #  A(inf)*B(inf)
+*/
+
+void
+mpn_toom53_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  mp_limb_t cy;
+  mp_ptr gp;
+  mp_ptr as1, asm1, as2, asm2, ash;
+  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
+  mp_ptr tmp;
+  enum toom7_flags flags;
+  TMP_DECL;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define a3  (ap + 3*n)
+#define a4  (ap + 4*n)
+#define b0  bp
+#define b1  (bp + n)
+#define b2  (bp + 2*n)
+
+  n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3);
+
+  s = an - 4 * n;
+  t = bn - 2 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  TMP_MARK;
+
+  tmp = TMP_ALLOC_LIMBS (10 * (n + 1));
+  as1  = tmp; tmp += n + 1;
+  asm1 = tmp; tmp += n + 1;
+  as2  = tmp; tmp += n + 1;
+  asm2 = tmp; tmp += n + 1;
+  ash  = tmp; tmp += n + 1;
+  bs1  = tmp; tmp += n + 1;
+  bsm1 = tmp; tmp += n + 1;
+  bs2  = tmp; tmp += n + 1;
+  bsm2 = tmp; tmp += n + 1;
+  bsh  = tmp; tmp += n + 1;
+
+  gp = pp;
+
+  /* Compute as1 and asm1.  */
+  flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp));
+
+  /* Compute as2 and asm2. */
+  flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp)));
+
+  /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4
+     = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4  */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (ash, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
+  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
+  if (s < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (ash, a4, ash, s);
+      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
+      MPN_INCR_U (ash + s, n+1-s, cy2);
+    }
+  else
+    ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
+#else
+  cy = mpn_lshift (ash, a0, n, 1);
+  cy += mpn_add_n (ash, ash, a1, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  cy += mpn_add_n (ash, ash, a2, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  cy += mpn_add_n (ash, ash, a3, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  ash[n] = cy + mpn_add (ash, ash, n, a4, s);
+#endif
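+
+  /* Sanity check of the Horner form above (illustrative numbers):
+     a0,...,a4 = 1,2,3,4,5 gives 16*1 + 8*2 + 4*3 + 2*4 + 5 = 57, and
+     2*(2*(2*(2*1 + 2) + 3) + 4) + 5 = 57 as well. */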
+
+  /* Compute bs1 and bsm1.  */
+  bs1[n] = mpn_add (bs1, b0, n, b2, t);		/* b0 + b2 */
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
+    {
+      bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1;
+      bsm1[n] = 0;
+      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
+    }
+  else
+    {
+      cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n);
+      bsm1[n] = bs1[n] - (cy & 1);
+      bs1[n] += (cy >> 1);
+    }
+#else
+  if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0)
+    {
+      mpn_sub_n (bsm1, b1, bs1, n);
+      bsm1[n] = 0;
+      flags = (enum toom7_flags) (flags ^ toom7_w3_neg);
+    }
+  else
+    {
+      bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n);
+    }
+  bs1[n] += mpn_add_n (bs1, bs1, b1, n);  /* b0+b1+b2 */
+#endif
+
+  /* Compute bs2 and bsm2. */
+#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
+#if HAVE_NATIVE_mpn_addlsh2_n
+  cy = mpn_addlsh2_n (bs2, b0, b2, t);
+#else /* HAVE_NATIVE_mpn_addlsh_n */
+  cy = mpn_addlsh_n (bs2, b0, b2, t, 2);
+#endif
+  if (t < n)
+    cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy);
+  bs2[n] = cy;
+#else
+  cy = mpn_lshift (gp, b2, t, 2);
+  bs2[n] = mpn_add (bs2, b0, n, gp, t);
+  MPN_INCR_U (bs2 + t, n+1-t, cy);
+#endif
+
+  gp[n] = mpn_lshift (gp, b1, n, 1);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (mpn_cmp (bs2, gp, n+1) < 0)
+    {
+      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1));
+      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1));
+    }
+#else
+  if (mpn_cmp (bs2, gp, n+1) < 0)
+    {
+      ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1));
+      flags = (enum toom7_flags) (flags ^ toom7_w1_neg);
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1));
+    }
+  mpn_add_n (bs2, bs2, gp, n+1);
+#endif
+
+  /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1)+b2.  */
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (bsh, b1, b0, n);
+  if (t < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (bsh, b2, bsh, t);
+      bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1);
+      MPN_INCR_U (bsh + t, n+1-t, cy2);
+    }
+  else
+    bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n);
+#else
+  cy = mpn_lshift (bsh, b0, n, 1);
+  cy += mpn_add_n (bsh, bsh, b1, n);
+  cy = 2*cy + mpn_lshift (bsh, bsh, n, 1);
+  bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t);
+#endif
+
+  ASSERT (as1[n] <= 4);
+  ASSERT (bs1[n] <= 2);
+  ASSERT (asm1[n] <= 2);
+  ASSERT (bsm1[n] <= 1);
+  ASSERT (as2[n] <= 30);
+  ASSERT (bs2[n] <= 6);
+  ASSERT (asm2[n] <= 20);
+  ASSERT (bsm2[n] <= 4);
+  ASSERT (ash[n] <= 30);
+  ASSERT (bsh[n] <= 6);
+
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 6 * n)			/* s+t */
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
+#define scratch_out (scratch + 8 * n + 4)		/* 2n+1 */
+  /* Total scratch need: 10*n+5 */
+
+  /* Must be in allocation order, as they overwrite one limb beyond
+   * 2n+1. */
+  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
+  mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
+  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */
+
+  /* vm1, 2n+1 limbs */
+#ifdef SMALLER_RECURSION
+  mpn_mul_n (vm1, asm1, bsm1, n);
+  if (asm1[n] == 1)
+    {
+      cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
+    }
+  else if (asm1[n] == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy = 2 * bsm1[n] + mpn_addlsh1_n_ip1 (vm1 + n, bsm1, n);
+#else
+      cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
+#endif
+    }
+  else
+    cy = 0;
+  if (bsm1[n] != 0)
+    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
+  vm1[2 * n] = cy;
+#else /* SMALLER_RECURSION */
+  vm1[2 * n] = 0;
+  mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0));
+#endif /* SMALLER_RECURSION */
+
+  /* v1, 2n+1 limbs */
+#ifdef SMALLER_RECURSION
+  mpn_mul_n (v1, as1, bs1, n);
+  if (as1[n] == 1)
+    {
+      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
+    }
+  else if (as1[n] == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
+#else
+      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
+#endif
+    }
+  else if (as1[n] != 0)
+    {
+      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
+    }
+  else
+    cy = 0;
+  if (bs1[n] == 1)
+    {
+      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
+    }
+  else if (bs1[n] == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n_ip1
+      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
+#else
+      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
+#endif
+    }
+  v1[2 * n] = cy;
+#else /* SMALLER_RECURSION */
+  v1[2 * n] = 0;
+  mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0));
+#endif /* SMALLER_RECURSION */
+
+  mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */
+
+  /* vinf, s+t limbs */
+  if (s > t)  mpn_mul (vinf, a4, s, b2, t);
+  else        mpn_mul (vinf, b2, t, a4, s);
+
+  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t,
+			     scratch_out);
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/toom54_mul.c b/third_party/gmp/mpn/generic/toom54_mul.c
new file mode 100644
index 0000000..343b02e
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom54_mul.c
@@ -0,0 +1,142 @@
+/* Implementation of the algorithm for Toom-Cook 4.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+
+/* Toom-4.5, the splitting 5x4 unbalanced version.
+   Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0.
+
+  <--s-><--n--><--n--><--n--><--n-->
+   ____ ______ ______ ______ ______
+  |_a4_|__a3__|__a2__|__a1__|__a0__|
+	  |b3_|__b2__|__b1__|__b0__|
+	  <-t-><--n--><--n--><--n-->
+
+*/
+#define TOOM_54_MUL_N_REC(p, a, b, n, ws)		\
+  do {	mpn_mul_n (p, a, b, n);				\
+  } while (0)
+
+#define TOOM_54_MUL_REC(p, a, na, b, nb, ws)		\
+  do {	mpn_mul (p, a, na, b, nb);			\
+  } while (0)
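+
+/* Note (added remark): unlike the threshold-dispatching TOOM44/TOOM4
+   macros, these two wrappers always call mpn_mul_n / mpn_mul directly and
+   ignore the ws argument; they only keep the call sites below uniform
+   with the other toom implementations. */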
+
+void
+mpn_toom54_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int sign;
+
+  /***************************** decomposition *******************************/
+#define a4  (ap + 4 * n)
+#define b3  (bp + 3 * n)
+
+  ASSERT (an >= bn);
+  n = 1 + (4 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 4);
+
+  s = an - 4 * n;
+  t = bn - 3 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  /* Required by mpn_toom_interpolate_8pts. */
+  ASSERT ( s + t >= n );
+  ASSERT ( s + t > 4);
+  ASSERT ( n > 2);
+
+#define   r8    pp				/* 2n   */
+#define   r7    scratch				/* 3n+1 */
+#define   r5    (pp + 3*n)			/* 3n+1 */
+#define   v0    (pp + 3*n)			/* n+1 */
+#define   v1    (pp + 4*n+1)			/* n+1 */
+#define   v2    (pp + 5*n+2)			/* n+1 */
+#define   v3    (pp + 6*n+3)			/* n+1 */
+#define   r3    (scratch + 3 * n + 1)		/* 3n+1 */
+#define   r1    (pp + 7*n)			/* s+t <= 2*n */
+#define   ws    (scratch + 6 * n + 2)		/* ??? */
+
+  /* Also allocate 3n+1 limbs for ws...  mpn_toom_interpolate_8pts may
+     need all of them, when DO_mpn_sublsh_n uses a scratch area.  */
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm4$ */
+  sign = mpn_toom_eval_pm2exp (v2, v0, 4, ap, n, s, 2, pp)
+       ^ mpn_toom_eval_pm2exp (v3, v1, 3, bp, n, t, 2, pp);
+  TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */
+  TOOM_54_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */
+  mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4);
+
+  /* $\pm1$ */
+  sign = mpn_toom_eval_pm1 (v2, v0, 4, ap, n, s,    pp)
+       ^ mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t,    pp);
+  TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */
+  TOOM_54_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */
+  mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0);
+
+  /* $\pm2$ */
+  sign = mpn_toom_eval_pm2 (v2, v0, 4, ap, n, s, pp)
+       ^ mpn_toom_eval_dgr3_pm2 (v3, v1, bp, n, t, pp);
+  TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */
+  TOOM_54_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */
+  mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2);
+
+  /* A(0)*B(0) */
+  TOOM_54_MUL_N_REC(pp, ap, bp, n, ws);
+
+  /* Infinity */
+  if (s > t) {
+    TOOM_54_MUL_REC(r1, a4, s, b3, t, ws);
+  } else {
+    TOOM_54_MUL_REC(r1, b3, t, a4, s, ws);
+  }
+
+  mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws);
+
+#undef a4
+#undef b3
+#undef r1
+#undef r3
+#undef r5
+#undef v0
+#undef v1
+#undef v2
+#undef v3
+#undef r7
+#undef r8
+#undef ws
+}
diff --git a/third_party/gmp/mpn/generic/toom62_mul.c b/third_party/gmp/mpn/generic/toom62_mul.c
new file mode 100644
index 0000000..d971cc0
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom62_mul.c
@@ -0,0 +1,310 @@
+/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
+   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.
+
+   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+   The idea of applying toom to unbalanced multiplication is due to Marco
+   Bodrato and Alberto Zanoni.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006-2008, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluate in:
+   0, +1, -1, +2, -2, 1/2, +inf
+
+  <-s-><--n--><--n--><--n--><--n--><--n-->
+   ___ ______ ______ ______ ______ ______
+  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
+			     |_b1_|___b0_|
+			     <-t--><--n-->
+
+  v0  =    a0                       *   b0      #    A(0)*B(0)
+  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1) #    A(1)*B(1)      ah  <= 5   bh <= 1
+  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1) #   A(-1)*B(-1)    |ah| <= 2   bh  = 0
+  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) #    A(2)*B(2)      ah  <= 62  bh <= 2
+  vm2 = (  a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) #   A(-2)*B(-2)    -41<=ah<=20 -1<=bh<=0
+  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1) #  A(1/2)*B(1/2)    ah  <= 62  bh <= 2
+  vinf=                           a5 *      b1  #  A(inf)*B(inf)
+*/
+
+void
+mpn_toom62_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn,
+		mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  mp_limb_t cy;
+  mp_ptr as1, asm1, as2, asm2, ash;
+  mp_ptr bs1, bsm1, bs2, bsm2, bsh;
+  mp_ptr gp;
+  enum toom7_flags aflags, bflags;
+  TMP_DECL;
+
+#define a0  ap
+#define a1  (ap + n)
+#define a2  (ap + 2*n)
+#define a3  (ap + 3*n)
+#define a4  (ap + 4*n)
+#define a5  (ap + 5*n)
+#define b0  bp
+#define b1  (bp + n)
+
+  n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1);
+
+  s = an - 5 * n;
+  t = bn - n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+
+  TMP_MARK;
+
+  as1 = TMP_SALLOC_LIMBS (n + 1);
+  asm1 = TMP_SALLOC_LIMBS (n + 1);
+  as2 = TMP_SALLOC_LIMBS (n + 1);
+  asm2 = TMP_SALLOC_LIMBS (n + 1);
+  ash = TMP_SALLOC_LIMBS (n + 1);
+
+  bs1 = TMP_SALLOC_LIMBS (n + 1);
+  bsm1 = TMP_SALLOC_LIMBS (n);
+  bs2 = TMP_SALLOC_LIMBS (n + 1);
+  bsm2 = TMP_SALLOC_LIMBS (n + 1);
+  bsh = TMP_SALLOC_LIMBS (n + 1);
+
+  gp = pp;
+
+  /* Compute as1 and asm1.  */
+  aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp));
+
+  /* Compute as2 and asm2. */
+  aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp)));
+
+  /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5
+     = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5  */
+
+#if HAVE_NATIVE_mpn_addlsh1_n
+  cy = mpn_addlsh1_n (ash, a1, a0, n);
+  cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n);
+  cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n);
+  cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n);
+  if (s < n)
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_addlsh1_n (ash, a5, ash, s);
+      ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1);
+      MPN_INCR_U (ash + s, n+1-s, cy2);
+    }
+  else
+    ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n);
+#else
+  cy = mpn_lshift (ash, a0, n, 1);
+  cy += mpn_add_n (ash, ash, a1, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  cy += mpn_add_n (ash, ash, a2, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  cy += mpn_add_n (ash, ash, a3, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  cy += mpn_add_n (ash, ash, a4, n);
+  cy = 2*cy + mpn_lshift (ash, ash, n, 1);
+  ash[n] = cy + mpn_add (ash, ash, n, a5, s);
+#endif
+
+  /* Compute bs1 and bsm1.  */
+  if (t == n)
+    {
+#if HAVE_NATIVE_mpn_add_n_sub_n
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
+	  bflags = toom7_w3_neg;
+	}
+      else
+	{
+	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
+	  bflags = (enum toom7_flags) 0;
+	}
+      bs1[n] = cy >> 1;
+#else
+      bs1[n] = mpn_add_n (bs1, b0, b1, n);
+      if (mpn_cmp (b0, b1, n) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, n);
+	  bflags = toom7_w3_neg;
+	}
+      else
+	{
+	  mpn_sub_n (bsm1, b0, b1, n);
+	  bflags = (enum toom7_flags) 0;
+	}
+#endif
+    }
+  else
+    {
+      bs1[n] = mpn_add (bs1, b0, n, b1, t);
+      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
+	{
+	  mpn_sub_n (bsm1, b1, b0, t);
+	  MPN_ZERO (bsm1 + t, n - t);
+	  bflags = toom7_w3_neg;
+	}
+      else
+	{
+	  mpn_sub (bsm1, b0, n, b1, t);
+	  bflags = (enum toom7_flags) 0;
+	}
+    }
+
+  /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 =
+     bsm1 - b1 */
+  mpn_add (bs2, bs1, n + 1, b1, t);
+  if (bflags & toom7_w3_neg)
+    {
+      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
+      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
+    }
+  else
+    {
+      /* FIXME: Simplify this logic? */
+      if (t < n)
+	{
+	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
+	    {
+	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t));
+	      MPN_ZERO (bsm2 + t, n + 1 - t);
+	      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
+	    }
+	  else
+	    {
+	      ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t));
+	      bsm2[n] = 0;
+	    }
+	}
+      else
+	{
+	  if (mpn_cmp (bsm1, b1, n) < 0)
+	    {
+	      ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n));
+	      bflags = (enum toom7_flags) (bflags | toom7_w1_neg);
+	    }
+	  else
+	    {
+	      ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n));
+	    }
+	  bsm2[n] = 0;
+	}
+    }
+
+  /* Compute bsh, recycling bs1. bsh=bs1+b0;  */
+  bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n);
+
+  ASSERT (as1[n] <= 5);
+  ASSERT (bs1[n] <= 1);
+  ASSERT (asm1[n] <= 2);
+  ASSERT (as2[n] <= 62);
+  ASSERT (bs2[n] <= 2);
+  ASSERT (asm2[n] <= 41);
+  ASSERT (bsm2[n] <= 1);
+  ASSERT (ash[n] <= 62);
+  ASSERT (bsh[n] <= 2);
+
+#define v0    pp				/* 2n */
+#define v1    (pp + 2 * n)			/* 2n+1 */
+#define vinf  (pp + 6 * n)			/* s+t */
+#define v2    scratch				/* 2n+1 */
+#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
+#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
+#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
+#define scratch_out (scratch + 8 * n + 4)		/* 2n+1 */
+  /* Total scratch need: 10*n+5 */
+
+  /* Must be in allocation order, as they overwrite one limb beyond
+   * 2n+1. */
+  mpn_mul_n (v2, as2, bs2, n + 1);		/* v2, 2n+1 limbs */
+  mpn_mul_n (vm2, asm2, bsm2, n + 1);		/* vm2, 2n+1 limbs */
+  mpn_mul_n (vh, ash, bsh, n + 1);		/* vh, 2n+1 limbs */
+
+  /* vm1, 2n+1 limbs */
+  mpn_mul_n (vm1, asm1, bsm1, n);
+  cy = 0;
+  if (asm1[n] == 1)
+    {
+      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
+    }
+  else if (asm1[n] == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n
+      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
+#else
+      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
+#endif
+    }
+  vm1[2 * n] = cy;
+
+  /* v1, 2n+1 limbs */
+  mpn_mul_n (v1, as1, bs1, n);
+  if (as1[n] == 1)
+    {
+      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
+    }
+  else if (as1[n] == 2)
+    {
+#if HAVE_NATIVE_mpn_addlsh1_n
+      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
+#else
+      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
+#endif
+    }
+  else if (as1[n] != 0)
+    {
+      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
+    }
+  else
+    cy = 0;
+  if (bs1[n] != 0)
+    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
+  v1[2 * n] = cy;
+
+  mpn_mul_n (v0, a0, b0, n);			/* v0, 2n limbs */
+
+  /* vinf, s+t limbs */
+  if (s > t)  mpn_mul (vinf, a5, s, b1, t);
+  else        mpn_mul (vinf, b1, t, a5, s);
+
+  mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags),
+			     vm2, vm1, v2, vh, s + t, scratch_out);
+
+  TMP_FREE;
+}
diff --git a/third_party/gmp/mpn/generic/toom63_mul.c b/third_party/gmp/mpn/generic/toom63_mul.c
new file mode 100644
index 0000000..181996d
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom63_mul.c
@@ -0,0 +1,231 @@
+/* Implementation of the algorithm for Toom-Cook 4.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Stores |{ap,n}-{bp,n}| in {rp,n}, returns the sign. */
+static int
+abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
+{
+  mp_limb_t  x, y;
+  while (--n >= 0)
+    {
+      x = ap[n];
+      y = bp[n];
+      if (x != y)
+	{
+	  n++;
+	  if (x > y)
+	    {
+	      mpn_sub_n (rp, ap, bp, n);
+	      return 0;
+	    }
+	  else
+	    {
+	      mpn_sub_n (rp, bp, ap, n);
+	      return ~0;
+	    }
+	}
+      rp[n] = 0;
+    }
+  return 0;
+}
+
+static int
+abs_sub_add_n (mp_ptr rm, mp_ptr rp, mp_srcptr rs, mp_size_t n) {
+  int result;
+  result = abs_sub_n (rm, rp, rs, n);
+  ASSERT_NOCARRY(mpn_add_n (rp, rp, rs, n));
+  return result;
+}
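+
+/* Usage note (added remark): abs_sub_n returns 0 when {ap,n} >= {bp,n}
+   and ~0 otherwise, so the result can be xor-ed into a running sign mask.
+   abs_sub_add_n combines the two operations needed for a +-k evaluation
+   pair: on return {rp,n} holds the sum rp + rs, {rm,n} the absolute
+   difference |rp - rs|, and the sign mask is the return value. */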
+
+
+/* Toom-4.5, the splitting 6x3 unbalanced version.
+   Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0.
+
+  <--s-><--n--><--n--><--n--><--n--><--n-->
+   ____ ______ ______ ______ ______ ______
+  |_a5_|__a4__|__a3__|__a2__|__a1__|__a0__|
+			|b2_|__b1__|__b0__|
+			<-t-><--n--><--n-->
+
+*/
+#define TOOM_63_MUL_N_REC(p, a, b, n, ws)		\
+  do {	mpn_mul_n (p, a, b, n);				\
+  } while (0)
+
+#define TOOM_63_MUL_REC(p, a, na, b, nb, ws)		\
+  do {	mpn_mul (p, a, na, b, nb);			\
+  } while (0)
+
+void
+mpn_toom63_mul (mp_ptr pp,
+		mp_srcptr ap, mp_size_t an,
+		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  mp_limb_t cy;
+  int sign;
+
+  /***************************** decomposition *******************************/
+#define a5  (ap + 5 * n)
+#define b0  (bp + 0 * n)
+#define b1  (bp + 1 * n)
+#define b2  (bp + 2 * n)
+
+  ASSERT (an >= bn);
+  n = 1 + (an >= 2 * bn ? (an - 1) / (size_t) 6 : (bn - 1) / (size_t) 3);
+
+  s = an - 5 * n;
+  t = bn - 2 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  /* WARNING! it assumes s+t>=n */
+  ASSERT ( s + t >= n );
+  ASSERT ( s + t > 4);
+  /* WARNING! it assumes n > 2 */
+  ASSERT ( n > 2);
+
+#define   r8    pp				/* 2n   */
+#define   r7    scratch				/* 3n+1 */
+#define   r5    (pp + 3*n)			/* 3n+1 */
+#define   v0    (pp + 3*n)			/* n+1 */
+#define   v1    (pp + 4*n+1)			/* n+1 */
+#define   v2    (pp + 5*n+2)			/* n+1 */
+#define   v3    (pp + 6*n+3)			/* n+1 */
+#define   r3    (scratch + 3 * n + 1)		/* 3n+1 */
+#define   r1    (pp + 7*n)			/* s+t <= 2*n */
+#define   ws    (scratch + 6 * n + 2)		/* ??? */
+
+  /* Also allocate 3n+1 limbs for ws...  mpn_toom_interpolate_8pts may
+     need all of them, when DO_mpn_sublsh_n uses a scratch area.  */
+/*   if (scratch == NULL) scratch = TMP_SALLOC_LIMBS (9 * n + 3); */
+
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm4$ */
+  sign = mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp);
+  pp[n] = mpn_lshift (pp, b1, n, 2); /* 4b1 */
+  /* FIXME: use addlsh */
+  v3[t] = mpn_lshift (v3, b2, t, 4);/* 16b2 */
+  if ( n == t )
+    v3[n]+= mpn_add_n (v3, v3, b0, n); /* 16b2+b0 */
+  else
+    v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 16b2+b0 */
+  sign ^= abs_sub_add_n (v1, v3, pp, n + 1);
+  TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */
+  TOOM_63_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */
+  mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4);
+
+  /* $\pm1$ */
+  sign = mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s,    pp);
+  /* Compute bs1 and bsm1. Code taken from toom33 */
+  cy = mpn_add (ws, b0, n, b2, t);
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (cy == 0 && mpn_cmp (ws, b1, n) < 0)
+    {
+      cy = mpn_add_n_sub_n (v3, v1, b1, ws, n);
+      v3[n] = cy >> 1;
+      v1[n] = 0;
+      sign = ~sign;
+    }
+  else
+    {
+      mp_limb_t cy2;
+      cy2 = mpn_add_n_sub_n (v3, v1, ws, b1, n);
+      v3[n] = cy + (cy2 >> 1);
+      v1[n] = cy - (cy2 & 1);
+    }
+#else
+  v3[n] = cy + mpn_add_n (v3, ws, b1, n);
+  if (cy == 0 && mpn_cmp (ws, b1, n) < 0)
+    {
+      mpn_sub_n (v1, b1, ws, n);
+      v1[n] = 0;
+      sign = ~sign;
+    }
+  else
+    {
+      cy -= mpn_sub_n (v1, ws, b1, n);
+      v1[n] = cy;
+    }
+#endif
+  TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */
+  TOOM_63_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */
+  mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0);
+
+  /* $\pm2$ */
+  sign = mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp);
+  pp[n] = mpn_lshift (pp, b1, n, 1); /* 2b1 */
+  /* FIXME: use addlsh or addlsh2 */
+  v3[t] = mpn_lshift (v3, b2, t, 2);/* 4b2 */
+  if ( n == t )
+    v3[n]+= mpn_add_n (v3, v3, b0, n); /* 4b2+b0 */
+  else
+    v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 4b2+b0 */
+  sign ^= abs_sub_add_n (v1, v3, pp, n + 1);
+  TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */
+  TOOM_63_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */
+  mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2);
+
+  /* A(0)*B(0) */
+  TOOM_63_MUL_N_REC(pp, ap, bp, n, ws);
+
+  /* Infinity */
+  if (s > t) {
+    TOOM_63_MUL_REC(r1, a5, s, b2, t, ws);
+  } else {
+    TOOM_63_MUL_REC(r1, b2, t, a5, s, ws);
+  }
+
+  mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws);
+
+#undef a5
+#undef b0
+#undef b1
+#undef b2
+#undef r1
+#undef r3
+#undef r5
+#undef v0
+#undef v1
+#undef v2
+#undef v3
+#undef r7
+#undef r8
+#undef ws
+}
diff --git a/third_party/gmp/mpn/generic/toom6_sqr.c b/third_party/gmp/mpn/generic/toom6_sqr.c
new file mode 100644
index 0000000..336eef9
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom6_sqr.c
@@ -0,0 +1,181 @@
+/* Implementation of the squaring algorithm with Toom-Cook 6.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+#if GMP_NUMB_BITS < 21
+#error Not implemented.
+#endif
+
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_sqr_basecase 1
+#define MAYBE_sqr_above_basecase   1
+#define MAYBE_sqr_toom2   1
+#define MAYBE_sqr_above_toom2   1
+#define MAYBE_sqr_toom3   1
+#define MAYBE_sqr_above_toom3   1
+#define MAYBE_sqr_above_toom4   1
+#else
+#ifdef  SQR_TOOM8_THRESHOLD
+#define SQR_TOOM6_MAX ((SQR_TOOM8_THRESHOLD+6*2-1+5)/6)
+#else
+#define SQR_TOOM6_MAX					\
+  ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (6*2-1+5)) ?	\
+   ((SQR_FFT_THRESHOLD+6*2-1+5)/6)			\
+   : MP_SIZE_T_MAX )
+#endif
+#define MAYBE_sqr_basecase					\
+  (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_above_basecase				\
+  (SQR_TOOM6_MAX >=  SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_toom2						\
+  (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD)
+#define MAYBE_sqr_above_toom2					\
+  (SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD)
+#define MAYBE_sqr_toom3						\
+  (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD)
+#define MAYBE_sqr_above_toom3					\
+  (SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD)
+#define MAYBE_sqr_above_toom4					\
+  (SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD)
+#endif
+
+#define TOOM6_SQR_REC(p, a, n, ws)					\
+  do {									\
+    if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase		\
+	|| BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)))			\
+      mpn_sqr_basecase (p, a, n);					\
+    else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2		\
+	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)))		\
+      mpn_toom2_sqr (p, a, n, ws);					\
+    else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3		\
+	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)))		\
+      mpn_toom3_sqr (p, a, n, ws);					\
+    else if (! MAYBE_sqr_above_toom4					\
+	     || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))		\
+      mpn_toom4_sqr (p, a, n, ws);					\
+    else								\
+      mpn_toom6_sqr (p, a, n, ws);					\
+  } while (0)
+
+void
+mpn_toom6_sqr  (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
+{
+  mp_size_t n, s;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT( an >= 18 );
+
+  n = 1 + (an - 1) / (size_t) 6;
+
+  s = an - 5 * n;
+
+  ASSERT (0 < s && s <= n);
+
+#define   r4    (pp + 3 * n)			/* 3n+1 */
+#define   r2    (pp + 7 * n)			/* 3n+1 */
+#define   r0    (pp +11 * n)			/* 2s <= 2*n */
+#define   r5    (scratch)			/* 3n+1 */
+#define   r3    (scratch + 3 * n + 1)		/* 3n+1 */
+#define   r1    (scratch + 6 * n + 2)		/* 3n+1 */
+#define   v0    (pp + 7 * n)			/* n+1 */
+#define   v2    (pp + 9 * n+2)			/* n+1 */
+#define   wse   (scratch + 9 * n + 3)		/* 3n+1 */
+
+  /* Also allocate 3n+1 limbs for ws...  toom_interpolate_12pts may
+     need all of them, when DO_mpn_sublsh_n uses a scratch area.  */
+/*   if (scratch== NULL) */
+/*     scratch = TMP_SALLOC_LIMBS (12 * n + 6); */
+
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm1/2$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 1, pp);
+  TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/2)^2*2^. */
+  TOOM6_SQR_REC(r5, v2, n + 1, wse); /* A(+1/2)^2*2^. */
+  mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 1, 0);
+
+  /* $\pm1$ */
+  mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s,    pp);
+  TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1)^2 */
+  TOOM6_SQR_REC(r3, v2, n + 1, wse); /* A(1)^2 */
+  mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 0, 0);
+
+  /* $\pm4$ */
+  mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp);
+  TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-4)^2 */
+  TOOM6_SQR_REC(r1, v2, n + 1, wse); /* A(+4)^2 */
+  mpn_toom_couple_handling (r1, 2 * n + 1, pp, 0, n, 2, 4);
+
+  /* $\pm1/4$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 2, pp);
+  TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/4)^2*4^. */
+  TOOM6_SQR_REC(r4, v2, n + 1, wse); /* A(+1/4)^2*4^. */
+  mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 2, 0);
+
+  /* $\pm2$ */
+  mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp);
+  TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-2)^2 */
+  TOOM6_SQR_REC(r2, v2, n + 1, wse); /* A(+2)^2 */
+  mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 1, 2);
+
+#undef v0
+#undef v2
+
+  /* A(0)*B(0) */
+  TOOM6_SQR_REC(pp, ap, n, wse);
+
+  mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, 2 * s, 0, wse);
+
+#undef r0
+#undef r1
+#undef r2
+#undef r3
+#undef r4
+#undef r5
+
+}
+#undef TOOM6_SQR_REC
+#undef MAYBE_sqr_basecase
+#undef MAYBE_sqr_above_basecase
+#undef MAYBE_sqr_toom2
+#undef MAYBE_sqr_above_toom2
+#undef MAYBE_sqr_toom3
+#undef MAYBE_sqr_above_toom3
+#undef MAYBE_sqr_above_toom4
diff --git a/third_party/gmp/mpn/generic/toom6h_mul.c b/third_party/gmp/mpn/generic/toom6h_mul.c
new file mode 100644
index 0000000..637f2a5
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom6h_mul.c
@@ -0,0 +1,262 @@
+/* Implementation of the multiplication algorithm for Toom-Cook 6.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+#if GMP_NUMB_BITS < 21
+#error Not implemented.
+#endif
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_mul_basecase 1
+#define MAYBE_mul_toom22   1
+#define MAYBE_mul_toom33   1
+#define MAYBE_mul_toom6h   1
+#else
+#define MAYBE_mul_basecase						\
+  (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM22_THRESHOLD)
+#define MAYBE_mul_toom22						\
+  (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM33_THRESHOLD)
+#define MAYBE_mul_toom33						\
+  (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM44_THRESHOLD)
+#define MAYBE_mul_toom6h						\
+  (MUL_FFT_THRESHOLD >= 6 * MUL_TOOM6H_THRESHOLD)
+#endif
+
+#define TOOM6H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws)			\
+  do {									\
+    if (MAYBE_mul_basecase						\
+	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) {			\
+      mpn_mul_basecase (p, a, n, b, n);					\
+      if (f)								\
+	mpn_mul_basecase (p2, a2, n, b2, n);				\
+    } else if (MAYBE_mul_toom22						\
+	       && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) {		\
+      mpn_toom22_mul (p, a, n, b, n, ws);				\
+      if (f)								\
+	mpn_toom22_mul (p2, a2, n, b2, n, ws);				\
+    } else if (MAYBE_mul_toom33						\
+	       && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) {		\
+      mpn_toom33_mul (p, a, n, b, n, ws);				\
+      if (f)								\
+	mpn_toom33_mul (p2, a2, n, b2, n, ws);				\
+    } else if (! MAYBE_mul_toom6h					\
+	       || BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) {		\
+      mpn_toom44_mul (p, a, n, b, n, ws);				\
+      if (f)								\
+	mpn_toom44_mul (p2, a2, n, b2, n, ws);				\
+    } else {								\
+      mpn_toom6h_mul (p, a, n, b, n, ws);				\
+      if (f)								\
+	mpn_toom6h_mul (p2, a2, n, b2, n, ws);				\
+    }									\
+  } while (0)
+
+#define TOOM6H_MUL_REC(p, a, na, b, nb, ws)		\
+  do { mpn_mul (p, a, na, b, nb);			\
+  } while (0)
+
+/* Toom-6.5, compute the product {pp,an+bn} <- {ap,an} * {bp,bn}
+   With: an >= bn >= 46, an*6 <  bn * 17.
+   It _may_ work with bn<=46 and bn*17 < an*6 < bn*18
+
+   Evaluate in: infinity, +4, -4, +2, -2, +1, -1, +1/2, -1/2, +1/4, -1/4, 0.
+*/
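+/* The first operand is split into p+1 (6 to 9) parts and the second into
+   q+1 (4 to 6) parts, with p+q always 10 or 11, so the degree-(p+q)
+   product is determined by 11 or 12 of the points above; the infinity
+   point is needed only when p+q = 11, i.e. when half is set below.  */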
+/* Estimate on needed scratch:
+   S(n) <= (n+5)\6*10+4+MAX(S((n+5)\6),1+2*(n+5)\6),
+   since n>42; S(n) <= ceil(log(n)/log(6))*(10+4)+n*12\6 < n*2 + lg2(n)*6
+ */
+
+void
+mpn_toom6h_mul   (mp_ptr pp,
+		  mp_srcptr ap, mp_size_t an,
+		  mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int p, q, half;
+  int sign;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT (an >= bn);
+  /* Cannot handle too small operands */
+  ASSERT (bn >= 42);
+  /* Cannot handle too much imbalance */
+  ASSERT ((an*3 <  bn * 8) || (bn >= 46 && an * 6 <  bn * 17));
+
+  /* Limit num/den is a rational number between
+     (12/11)^(log(4)/log(2*4-1)) and (12/11)^(log(6)/log(2*6-1))             */
+#define LIMIT_numerator (18)
+#define LIMIT_denominat (17)
+
+  if (LIKELY (an * LIMIT_denominat < LIMIT_numerator * bn)) /* is 6*... < 6*... */
+    {
+      n = 1 + (an - 1) / (size_t) 6;
+      p = q = 5;
+      half = 0;
+
+      s = an - 5 * n;
+      t = bn - 5 * n;
+    }
+  else {
+    if (an * 5 * LIMIT_numerator < LIMIT_denominat * 7 * bn)
+      { p = 7; q = 6; }
+    else if (an * 5 * LIMIT_denominat < LIMIT_numerator * 7 * bn)
+      { p = 7; q = 5; }
+    else if (an * LIMIT_numerator < LIMIT_denominat * 2 * bn)  /* is 4*... < 8*... */
+      { p = 8; q = 5; }
+    else if (an * LIMIT_denominat < LIMIT_numerator * 2 * bn)  /* is 4*... < 8*... */
+      { p = 8; q = 4; }
+    else
+      { p = 9; q = 4; }
+
+    half = (p ^ q) & 1;
+    n = 1 + (q * an >= p * bn ? (an - 1) / (size_t) p : (bn - 1) / (size_t) q);
+    p--; q--;
+
+    s = an - p * n;
+    t = bn - q * n;
+
+    /* With LIMIT = 16/15, the following recovery is needed only if bn <= 73 */
+    if (half) { /* Recover from badly chosen splitting */
+      if (UNLIKELY (s<1)) {p--; s+=n; half=0;}
+      else if (UNLIKELY (t<1)) {q--; t+=n; half=0;}
+    }
+  }
+#undef LIMIT_numerator
+#undef LIMIT_denominat
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  ASSERT (half || s + t > 3);
+  ASSERT (n > 2);
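+  /* Worked example of the splitting (illustrative values): an = 500,
+     bn = 300 takes the p = 8, q = 5 branch above, so half = 1,
+     n = 1 + (500 - 1) / 8 = 63, and after p--, q--:
+     s = 500 - 7 * 63 = 59, t = 300 - 4 * 63 = 48.  */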
+
+#define   r4    (pp + 3 * n)			/* 3n+1 */
+#define   r2    (pp + 7 * n)			/* 3n+1 */
+#define   r0    (pp +11 * n)			/* s+t <= 2*n */
+#define   r5    (scratch)			/* 3n+1 */
+#define   r3    (scratch + 3 * n + 1)		/* 3n+1 */
+#define   r1    (scratch + 6 * n + 2)		/* 3n+1 */
+#define   v0    (pp + 7 * n)			/* n+1 */
+#define   v1    (pp + 8 * n+1)			/* n+1 */
+#define   v2    (pp + 9 * n+2)			/* n+1 */
+#define   v3    (scratch + 9 * n + 3)		/* n+1 */
+#define   wsi   (scratch + 9 * n + 3)		/* 3n+1 */
+#define   wse   (scratch +10 * n + 4)		/* 2n+1 */
+
+  /* Alloc also 3n+1 limbs for wsi... toom_interpolate_12pts may
+     need all of them  */
+/*   if (scratch == NULL) */
+/*     scratch = TMP_SALLOC_LIMBS(mpn_toom6_sqr_itch(n * 6)); */
+  ASSERT (12 * n + 6 <= mpn_toom6h_mul_itch(an,bn));
+  ASSERT (12 * n + 6 <= mpn_toom6_sqr_itch(n * 6));
+
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm1/2$ */
+  sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^
+	 mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp);
+  /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
+  TOOM6H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 1+half , half);
+
+  /* $\pm1$ */
+  sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s,    pp);
+  if (UNLIKELY (q == 3))
+    sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t,    pp);
+  else
+    sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t,    pp);
+  /* A(-1)*B(-1) */ /* A(1)*B(1) */
+  TOOM6H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 0, 0);
+
+  /* $\pm4$ */
+  sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^
+	 mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp);
+  /* A(-4)*B(-4) */
+  TOOM6H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse); /* A(+4)*B(+4) */
+  mpn_toom_couple_handling (r1, 2 * n + 1, pp, sign, n, 2, 4);
+
+  /* $\pm1/4$ */
+  sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^
+	 mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp);
+  /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
+  TOOM6H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half));
+
+  /* $\pm2$ */
+  sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^
+	 mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp);
+  /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
+  TOOM6H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 1, 2);
+
+#undef v0
+#undef v1
+#undef v2
+#undef v3
+#undef wse
+
+  /* A(0)*B(0) */
+  TOOM6H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi);
+
+  /* Infinity */
+  if (UNLIKELY (half != 0)) {
+    if (s > t) {
+      TOOM6H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi);
+    } else {
+      TOOM6H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi);
+    }
+  }
+
+  mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, s+t, half, wsi);
+
+#undef r0
+#undef r1
+#undef r2
+#undef r3
+#undef r4
+#undef r5
+#undef wsi
+}
+
+#undef TOOM6H_MUL_N_REC
+#undef TOOM6H_MUL_REC
+#undef MAYBE_mul_basecase
+#undef MAYBE_mul_toom22
+#undef MAYBE_mul_toom33
+#undef MAYBE_mul_toom6h
diff --git a/third_party/gmp/mpn/generic/toom8_sqr.c b/third_party/gmp/mpn/generic/toom8_sqr.c
new file mode 100644
index 0000000..03e5c64
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom8_sqr.c
@@ -0,0 +1,225 @@
+/* Implementation of the squaring algorithm with Toom-Cook 8.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+#if GMP_NUMB_BITS < 29
+#error Not implemented.
+#endif
+
+#if GMP_NUMB_BITS < 43
+#define BIT_CORRECTION 1
+#define CORRECTION_BITS GMP_NUMB_BITS
+#else
+#define BIT_CORRECTION 0
+#define CORRECTION_BITS 0
+#endif
+
+#ifndef SQR_TOOM8_THRESHOLD
+#define SQR_TOOM8_THRESHOLD MUL_TOOM8H_THRESHOLD
+#endif
+
+#ifndef SQR_TOOM6_THRESHOLD
+#define SQR_TOOM6_THRESHOLD MUL_TOOM6H_THRESHOLD
+#endif
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_sqr_basecase 1
+#define MAYBE_sqr_above_basecase   1
+#define MAYBE_sqr_toom2   1
+#define MAYBE_sqr_above_toom2   1
+#define MAYBE_sqr_toom3   1
+#define MAYBE_sqr_above_toom3   1
+#define MAYBE_sqr_toom4   1
+#define MAYBE_sqr_above_toom4   1
+#define MAYBE_sqr_above_toom6   1
+#else
+#define SQR_TOOM8_MAX					\
+  ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (8*2-1+7)) ?	\
+   ((SQR_FFT_THRESHOLD+8*2-1+7)/8)			\
+   : MP_SIZE_T_MAX )
+#define MAYBE_sqr_basecase					\
+  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_above_basecase				\
+  (SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD)
+#define MAYBE_sqr_toom2						\
+  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD)
+#define MAYBE_sqr_above_toom2					\
+  (SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD)
+#define MAYBE_sqr_toom3						\
+  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD)
+#define MAYBE_sqr_above_toom3					\
+  (SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD)
+#define MAYBE_sqr_toom4						\
+  (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD)
+#define MAYBE_sqr_above_toom4					\
+  (SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD)
+#define MAYBE_sqr_above_toom6					\
+  (SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD)
+#endif
+
+#define TOOM8_SQR_REC(p, a, f, p2, a2, n, ws)				\
+  do {									\
+    if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase		\
+	|| BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) {			\
+      mpn_sqr_basecase (p, a, n);					\
+      if (f) mpn_sqr_basecase (p2, a2, n);				\
+    } else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2		\
+	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) {		\
+      mpn_toom2_sqr (p, a, n, ws);					\
+      if (f) mpn_toom2_sqr (p2, a2, n, ws);				\
+    } else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3		\
+	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) {		\
+      mpn_toom3_sqr (p, a, n, ws);					\
+      if (f) mpn_toom3_sqr (p2, a2, n, ws);				\
+    } else if (MAYBE_sqr_toom4 && ( !MAYBE_sqr_above_toom4		\
+	     || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))) {		\
+      mpn_toom4_sqr (p, a, n, ws);					\
+      if (f) mpn_toom4_sqr (p2, a2, n, ws);				\
+    } else if (! MAYBE_sqr_above_toom6					\
+	     || BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) {		\
+      mpn_toom6_sqr (p, a, n, ws);					\
+      if (f) mpn_toom6_sqr (p2, a2, n, ws);				\
+    } else {								\
+      mpn_toom8_sqr (p, a, n, ws);					\
+      if (f) mpn_toom8_sqr (p2, a2, n, ws);				\
+    }									\
+  } while (0)
+
+void
+mpn_toom8_sqr  (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch)
+{
+  mp_size_t n, s;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT ( an >= 40 );
+
+  n = 1 + ((an - 1)>>3);
+
+  s = an - 7 * n;
+
+  ASSERT (0 < s && s <= n);
+  ASSERT ( s + s > 3 );
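+  /* E.g. an = 100 gives n = 1 + 99/8 = 13 and s = 100 - 7 * 13 = 9: the
+     operand splits into seven 13-limb parts plus a 9-limb top part, and
+     the degree-14 square is recovered from the 15 evaluations below.  */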
+
+#define   r6    (pp + 3 * n)			/* 3n+1 */
+#define   r4    (pp + 7 * n)			/* 3n+1 */
+#define   r2    (pp +11 * n)			/* 3n+1 */
+#define   r0    (pp +15 * n)			/* s+t <= 2*n */
+#define   r7    (scratch)			/* 3n+1 */
+#define   r5    (scratch + 3 * n + 1)		/* 3n+1 */
+#define   r3    (scratch + 6 * n + 2)		/* 3n+1 */
+#define   r1    (scratch + 9 * n + 3)		/* 3n+1 */
+#define   v0    (pp +11 * n)			/* n+1 */
+#define   v2    (pp +13 * n+2)			/* n+1 */
+#define   wse   (scratch +12 * n + 4)		/* 3n+1 */
+
+  /* Alloc also 3n+1 limbs for ws... toom_interpolate_16pts may
+     need all of them, when DO_mpn_sublsh_n uses a scratch  */
+/*   if (scratch == NULL) */
+/*     scratch = TMP_SALLOC_LIMBS (30 * n + 6); */
+
+  /********************** evaluation and recursive calls *********************/
+  /* $\pm1/8$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 3, pp);
+  /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */
+  TOOM8_SQR_REC(pp, v0, 2, r7, v2, n + 1, wse);
+  mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 0);
+
+  /* $\pm1/4$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 2, pp);
+  /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
+  TOOM8_SQR_REC(pp, v0, 2, r5, v2, n + 1, wse);
+  mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 2, 0);
+
+  /* $\pm2$ */
+  mpn_toom_eval_pm2 (v2, v0, 7, ap, n, s, pp);
+  /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
+  TOOM8_SQR_REC(pp, v0, 2, r3, v2, n + 1, wse);
+  mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 1, 2);
+
+  /* $\pm8$ */
+  mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 3, pp);
+  /* A(-8)*B(-8) */ /* A(+8)*B(+8) */
+  TOOM8_SQR_REC(pp, v0, 2, r1, v2, n + 1, wse);
+  mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 6);
+
+  /* $\pm1/2$ */
+  mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 1, pp);
+  /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
+  TOOM8_SQR_REC(pp, v0, 2, r6, v2, n + 1, wse);
+  mpn_toom_couple_handling (r6, 2 * n + 1, pp, 0, n, 1, 0);
+
+  /* $\pm1$ */
+  mpn_toom_eval_pm1 (v2, v0, 7, ap, n, s,    pp);
+  /* A(-1)*B(-1) */ /* A(1)*B(1) */
+  TOOM8_SQR_REC(pp, v0, 2, r4, v2, n + 1, wse);
+  mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 0, 0);
+
+  /* $\pm4$ */
+  mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 2, pp);
+  /* A(-4)*B(-4) */ /* A(+4)*B(+4) */
+  TOOM8_SQR_REC(pp, v0, 2, r2, v2, n + 1, wse);
+  mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 2, 4);
+
+#undef v0
+#undef v2
+
+  /* A(0)*B(0) */
+  TOOM8_SQR_REC(pp, ap, 0, pp, ap, n, wse);
+
+  mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, 2 * s, 0, wse);
+
+#undef r0
+#undef r1
+#undef r2
+#undef r3
+#undef r4
+#undef r5
+#undef r6
+#undef wse
+
+}
+
+#undef TOOM8_SQR_REC
+#undef MAYBE_sqr_basecase
+#undef MAYBE_sqr_above_basecase
+#undef MAYBE_sqr_toom2
+#undef MAYBE_sqr_above_toom2
+#undef MAYBE_sqr_toom3
+#undef MAYBE_sqr_above_toom3
+#undef MAYBE_sqr_above_toom4
diff --git a/third_party/gmp/mpn/generic/toom8h_mul.c b/third_party/gmp/mpn/generic/toom8h_mul.c
new file mode 100644
index 0000000..5ba259a
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom8h_mul.c
@@ -0,0 +1,305 @@
+/* Implementation of the multiplication algorithm for Toom-Cook 8.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+#if GMP_NUMB_BITS < 29
+#error Not implemented.
+#endif
+
+#if GMP_NUMB_BITS < 43
+#define BIT_CORRECTION 1
+#define CORRECTION_BITS GMP_NUMB_BITS
+#else
+#define BIT_CORRECTION 0
+#define CORRECTION_BITS 0
+#endif
+
+
+#if TUNE_PROGRAM_BUILD
+#define MAYBE_mul_basecase 1
+#define MAYBE_mul_toom22   1
+#define MAYBE_mul_toom33   1
+#define MAYBE_mul_toom44   1
+#define MAYBE_mul_toom8h   1
+#else
+#define MAYBE_mul_basecase						\
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM22_THRESHOLD)
+#define MAYBE_mul_toom22						\
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM33_THRESHOLD)
+#define MAYBE_mul_toom33						\
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM44_THRESHOLD)
+#define MAYBE_mul_toom44						\
+  (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM6H_THRESHOLD)
+#define MAYBE_mul_toom8h						\
+  (MUL_FFT_THRESHOLD >= 8 * MUL_TOOM8H_THRESHOLD)
+#endif
+
+#define TOOM8H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws)			\
+  do {									\
+    if (MAYBE_mul_basecase						\
+	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) {			\
+      mpn_mul_basecase (p, a, n, b, n);					\
+      if (f) mpn_mul_basecase (p2, a2, n, b2, n);			\
+    } else if (MAYBE_mul_toom22						\
+	     && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) {		\
+      mpn_toom22_mul (p, a, n, b, n, ws);				\
+      if (f) mpn_toom22_mul (p2, a2, n, b2, n, ws);			\
+    } else if (MAYBE_mul_toom33						\
+	     && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) {		\
+      mpn_toom33_mul (p, a, n, b, n, ws);				\
+      if (f) mpn_toom33_mul (p2, a2, n, b2, n, ws);			\
+    } else if (MAYBE_mul_toom44						\
+	     && BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) {		\
+      mpn_toom44_mul (p, a, n, b, n, ws);				\
+      if (f) mpn_toom44_mul (p2, a2, n, b2, n, ws);			\
+    } else if (! MAYBE_mul_toom8h					\
+	     || BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) {		\
+      mpn_toom6h_mul (p, a, n, b, n, ws);				\
+      if (f) mpn_toom6h_mul (p2, a2, n, b2, n, ws);			\
+    } else {								\
+      mpn_toom8h_mul (p, a, n, b, n, ws);				\
+      if (f) mpn_toom8h_mul (p2, a2, n, b2, n, ws);			\
+    }									\
+  } while (0)
+
+#define TOOM8H_MUL_REC(p, a, na, b, nb, ws)		\
+  do { mpn_mul (p, a, na, b, nb); } while (0)
+
+/* Toom-8.5, compute the product {pp,an+bn} <- {ap,an} * {bp,bn}
+   With: an >= bn >= 86, an*5 <  bn * 11.
+   It _may_ work with bn<=?? and bn*?? < an*? < bn*??
+
+   Evaluate in: infinity, +8,-8,+4,-4,+2,-2,+1,-1,+1/2,-1/2,+1/4,-1/4,+1/8,-1/8,0.
+*/
+/* Estimate on needed scratch:
+   S(n) <= (n+7)\8*13+5+MAX(S((n+7)\8),1+2*(n+7)\8),
+   since n>80; S(n) <= ceil(log(n/10)/log(8))*(13+5)+n*15\8 < n*15\8 + lg2(n)*6
+ */
+
+void
+mpn_toom8h_mul   (mp_ptr pp,
+		  mp_srcptr ap, mp_size_t an,
+		  mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
+{
+  mp_size_t n, s, t;
+  int p, q, half;
+  int sign;
+
+  /***************************** decomposition *******************************/
+
+  ASSERT (an >= bn);
+  /* Cannot handle too small operands */
+  ASSERT (bn >= 86);
+  /* Cannot handle too much imbalance */
+  ASSERT (an <= bn*4);
+  ASSERT (GMP_NUMB_BITS > 11*3 || an*4 <= bn*11);
+  ASSERT (GMP_NUMB_BITS > 10*3 || an*1 <= bn* 2);
+  ASSERT (GMP_NUMB_BITS >  9*3 || an*2 <= bn* 3);
+
+  /* Limit num/den is a rational number between
+     (16/15)^(log(6)/log(2*6-1)) and (16/15)^(log(8)/log(2*8-1))             */
+#define LIMIT_numerator (21)
+#define LIMIT_denominat (20)
+
+  if (LIKELY (an == bn) || an * (LIMIT_denominat>>1) < LIMIT_numerator * (bn>>1) ) /* is 8*... < 8*... */
+    {
+      half = 0;
+      n = 1 + ((an - 1)>>3);
+      p = q = 7;
+      s = an - 7 * n;
+      t = bn - 7 * n;
+    }
+  else
+    {
+      if (an * 13 < 16 * bn) /* (an*7*LIMIT_numerator<LIMIT_denominat*9*bn) */
+	{ p = 9; q = 8; }
+      else if (GMP_NUMB_BITS <= 9*3 ||
+	       an *(LIMIT_denominat>>1) < (LIMIT_numerator/7*9) * (bn>>1))
+	{ p = 9; q = 7; }
+      else if (an * 10 < 33 * (bn>>1)) /* (an*3*LIMIT_numerator<LIMIT_denominat*5*bn) */
+	{ p =10; q = 7; }
+      else if (GMP_NUMB_BITS <= 10*3 ||
+	       an * (LIMIT_denominat/5) < (LIMIT_numerator/3) * bn)
+	{ p =10; q = 6; }
+      else if (an * 6 < 13 * bn) /*(an * 5 * LIMIT_numerator < LIMIT_denominat *11 * bn)*/
+	{ p =11; q = 6; }
+      else if (GMP_NUMB_BITS <= 11*3 ||
+	       an * 4 < 9 * bn)
+	{ p =11; q = 5; }
+      else if (an *(LIMIT_numerator/3) < LIMIT_denominat * bn)  /* is 4*... <12*... */
+	{ p =12; q = 5; }
+      else if (GMP_NUMB_BITS <= 12*3 ||
+	       an * 9 < 28 * bn )  /* is 4*... <12*... */
+	{ p =12; q = 4; }
+      else
+	{ p =13; q = 4; }
+
+      half = (p+q)&1;
+      n = 1 + (q * an >= p * bn ? (an - 1) / (size_t) p : (bn - 1) / (size_t) q);
+      p--; q--;
+
+      s = an - p * n;
+      t = bn - q * n;
+
+      if (half) { /* Recover from badly chosen splitting */
+	if (UNLIKELY (s<1)) {p--; s+=n; half=0;}
+	else if (UNLIKELY (t<1)) {q--; t+=n; half=0;}
+      }
+    }
+#undef LIMIT_numerator
+#undef LIMIT_denominat
+
+  ASSERT (0 < s && s <= n);
+  ASSERT (0 < t && t <= n);
+  ASSERT (half || s + t > 3);
+  ASSERT (n > 2);
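+  /* Worked example (illustrative values, assuming GMP_NUMB_BITS == 64):
+     an = 1000, bn = 400 falls through to the p = 12, q = 5 case, so
+     half = 1, n = 1 + (1000 - 1) / 12 = 84, and after p--, q--:
+     s = 1000 - 11 * 84 = 76, t = 400 - 4 * 84 = 64.  */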
+
+#define   r6    (pp + 3 * n)			/* 3n+1 */
+#define   r4    (pp + 7 * n)			/* 3n+1 */
+#define   r2    (pp +11 * n)			/* 3n+1 */
+#define   r0    (pp +15 * n)			/* s+t <= 2*n */
+#define   r7    (scratch)			/* 3n+1 */
+#define   r5    (scratch + 3 * n + 1)		/* 3n+1 */
+#define   r3    (scratch + 6 * n + 2)		/* 3n+1 */
+#define   r1    (scratch + 9 * n + 3)		/* 3n+1 */
+#define   v0    (pp +11 * n)			/* n+1 */
+#define   v1    (pp +12 * n+1)			/* n+1 */
+#define   v2    (pp +13 * n+2)			/* n+1 */
+#define   v3    (scratch +12 * n + 4)		/* n+1 */
+#define   wsi   (scratch +12 * n + 4)		/* 3n+1 */
+#define   wse   (scratch +13 * n + 5)		/* 2n+1 */
+
+  /* Alloc also 3n+1 limbs for wsi... toom_interpolate_16pts may
+     need all of them  */
+/*   if (scratch == NULL) */
+/*     scratch = TMP_SALLOC_LIMBS(mpn_toom8_sqr_itch(n * 8)); */
+  ASSERT (15 * n + 6 <= mpn_toom8h_mul_itch (an, bn));
+  ASSERT (15 * n + 6 <= mpn_toom8_sqr_itch (n * 8));
+
+  /********************** evaluation and recursive calls *********************/
+
+  /* $\pm1/8$ */
+  sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 3, pp) ^
+	 mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 3, pp);
+  /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r7, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3*(1+half), 3*(half));
+
+  /* $\pm1/4$ */
+  sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^
+	 mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp);
+  /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half));
+
+  /* $\pm2$ */
+  sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^
+	 mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp);
+  /* A(-2)*B(-2) */ /* A(+2)*B(+2) */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 1, 2);
+
+  /* $\pm8$ */
+  sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 3, pp) ^
+	 mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 3, pp);
+  /* A(-8)*B(-8) */ /* A(+8)*B(+8) */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3, 6);
+
+  /* $\pm1/2$ */
+  sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^
+	 mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp);
+  /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r6, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r6, 2 * n + 1, pp, sign, n, 1+half, half);
+
+  /* $\pm1$ */
+  sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s,    pp);
+  if (GMP_NUMB_BITS > 12*3 && UNLIKELY (q == 3))
+    sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t,    pp);
+  else
+    sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t,    pp);
+  /* A(-1)*B(-1) */ /* A(1)*B(1) */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 0, 0);
+
+  /* $\pm4$ */
+  sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^
+	 mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp);
+  /* A(-4)*B(-4) */ /* A(+4)*B(+4) */
+  TOOM8H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse);
+  mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 2, 4);
+
+#undef v0
+#undef v1
+#undef v2
+#undef v3
+#undef wse
+
+  /* A(0)*B(0) */
+  TOOM8H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi);
+
+  /* Infinity */
+  if (UNLIKELY (half != 0)) {
+    if (s > t) {
+      TOOM8H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi);
+    } else {
+      TOOM8H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi);
+    }
+  }
+
+  mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, s+t, half, wsi);
+
+#undef r0
+#undef r1
+#undef r2
+#undef r3
+#undef r4
+#undef r5
+#undef r6
+#undef wsi
+}
+
+#undef TOOM8H_MUL_N_REC
+#undef TOOM8H_MUL_REC
+#undef MAYBE_mul_basecase
+#undef MAYBE_mul_toom22
+#undef MAYBE_mul_toom33
+#undef MAYBE_mul_toom44
+#undef MAYBE_mul_toom8h
diff --git a/third_party/gmp/mpn/generic/toom_couple_handling.c b/third_party/gmp/mpn/generic/toom_couple_handling.c
new file mode 100644
index 0000000..cd253f7
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_couple_handling.c
@@ -0,0 +1,80 @@
+/* Helper function for high degree Toom-Cook algorithms.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Gets {pp,n} and (nsign?-1:1)*{np,n}. Computes at once:
+     {pp,n} <- ({pp,n}+{np,n})/2^{ps+1}
+     {np,n} <- ({pp,n}-{np,n})/2^{ns+1}
+   Finally recompose them, obtaining:
+     {pp,n+off} <- {pp,n}+{np,n}*2^{off*GMP_NUMB_BITS}
+*/
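+/* Single-limb illustration: {pp,n} = 13 and {np,n} = 7 with nsign set
+   (i.e. the pair represents the values 13 and -7), ps = 1, ns = 0:
+   np becomes (13-7)/2 = 3 and pp becomes (13+7)/2^2 = 5, as in the
+   formulas above.  */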
+void
+mpn_toom_couple_handling (mp_ptr pp, mp_size_t n, mp_ptr np,
+			  int nsign, mp_size_t off, int ps, int ns)
+{
+  if (nsign) {
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+    mpn_rsh1sub_n (np, pp, np, n);
+#else
+    mpn_sub_n (np, pp, np, n);
+    mpn_rshift (np, np, n, 1);
+#endif
+  } else {
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+    mpn_rsh1add_n (np, pp, np, n);
+#else
+    mpn_add_n (np, pp, np, n);
+    mpn_rshift (np, np, n, 1);
+#endif
+  }
+
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+  if (ps == 1)
+    mpn_rsh1sub_n (pp, pp, np, n);
+  else
+#endif
+  {
+    mpn_sub_n (pp, pp, np, n);
+    if (ps > 0)
+      mpn_rshift (pp, pp, n, ps);
+  }
+  if (ns > 0)
+    mpn_rshift (np, np, n, ns);
+  pp[n] = mpn_add_n (pp+off, pp+off, np, n-off);
+  ASSERT_NOCARRY (mpn_add_1(pp+n, np+n-off, off, pp[n]) );
+}
diff --git a/third_party/gmp/mpn/generic/toom_eval_dgr3_pm1.c b/third_party/gmp/mpn/generic/toom_eval_dgr3_pm1.c
new file mode 100644
index 0000000..5f491b6
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_eval_dgr3_pm1.c
@@ -0,0 +1,72 @@
+/* mpn_toom_eval_dgr3_pm1 -- Evaluate a degree 3 polynomial in +1 and -1
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+int
+mpn_toom_eval_dgr3_pm1 (mp_ptr xp1, mp_ptr xm1,
+			mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
+{
+  int neg;
+
+  ASSERT (x3n > 0);
+  ASSERT (x3n <= n);
+
+  xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n);
+  tp[n] = mpn_add (tp, xp + n, n, xp + 3*n, x3n);
+
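+  /* E.g. X(t) = 1 + 2t + 3t^2 + 4t^3 with single-limb parts: xp1 = 1+3 = 4
+     and tp = 2+4 = 6, so X(+1) = 10 and X(-1) = -2; since 4 < 6 the flag
+     below is set and xm1 receives |X(-1)| = 2.  */
+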
+  neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1);
+  else
+    mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1);
+#else
+  if (neg)
+    mpn_sub_n (xm1, tp, xp1, n + 1);
+  else
+    mpn_sub_n (xm1, xp1, tp, n + 1);
+
+  mpn_add_n (xp1, xp1, tp, n + 1);
+#endif
+
+  ASSERT (xp1[n] <= 3);
+  ASSERT (xm1[n] <= 1);
+
+  return neg;
+}
diff --git a/third_party/gmp/mpn/generic/toom_eval_dgr3_pm2.c b/third_party/gmp/mpn/generic/toom_eval_dgr3_pm2.c
new file mode 100644
index 0000000..55e6b89
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_eval_dgr3_pm2.c
@@ -0,0 +1,97 @@
+/* mpn_toom_eval_dgr3_pm2 -- Evaluate a degree 3 polynomial in +2 and -2
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Needs n+1 limbs of temporary storage. */
+int
+mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2,
+			mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp)
+{
+  mp_limb_t cy;
+  int neg;
+
+  ASSERT (x3n > 0);
+  ASSERT (x3n <= n);
+
+  /* (x0 + 4*x2) +/- (2*x1 + 8*x3) */
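+  /* E.g. X(t) = 1 + 2t + 3t^2 + 4t^3 with single-limb parts: xp2 = 1+4*3
+     = 13 and, after the final doubling, tp = 2*2+8*4 = 36, so X(+2) = 49
+     and X(-2) = 13 - 36 = -23 (the sign flag is set, xm2 = 23).  */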
+#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n
+#if HAVE_NATIVE_mpn_addlsh2_n
+  xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n);
+
+  cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n);
+#else /* HAVE_NATIVE_mpn_addlsh_n */
+  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2);
+
+  cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2);
+#endif
+  if (x3n < n)
+    cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy);
+  tp[n] = cy;
+#else
+  cy = mpn_lshift (tp, xp + 2*n, n, 2);
+  xp2[n] = cy + mpn_add_n (xp2, tp, xp, n);
+
+  tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2);
+  if (x3n < n)
+    tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1);
+  else
+    tp[n] += mpn_add_n (tp, xp + n, tp, n);
+#endif
+  mpn_lshift (tp, tp, n+1, 1);
+
+  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
+  else
+    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
+#else
+  if (neg)
+    mpn_sub_n (xm2, tp, xp2, n + 1);
+  else
+    mpn_sub_n (xm2, xp2, tp, n + 1);
+
+  mpn_add_n (xp2, xp2, tp, n + 1);
+#endif
+
+  ASSERT (xp2[n] < 15);
+  ASSERT (xm2[n] < 10);
+
+  return neg;
+}
diff --git a/third_party/gmp/mpn/generic/toom_eval_pm1.c b/third_party/gmp/mpn/generic/toom_eval_pm1.c
new file mode 100644
index 0000000..a8cfa93
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_eval_pm1.c
@@ -0,0 +1,89 @@
+/* mpn_toom_eval_pm1 -- Evaluate a polynomial in +1 and -1
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluates a polynomial of degree k > 3, in the points +1 and -1. */
+int
+mpn_toom_eval_pm1 (mp_ptr xp1, mp_ptr xm1, unsigned k,
+		   mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp)
+{
+  unsigned i;
+  int neg;
+
+  ASSERT (k >= 4);
+
+  ASSERT (hn > 0);
+  ASSERT (hn <= n);
+
+  /* The degree k is also the number of full-size coefficients, so
+   * that last coefficient, of size hn, starts at xp + k*n. */
+
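+  /* xp1 accumulates the even-indexed coefficients x0 + x2 + ... and tp the
+     odd-indexed ones x1 + x3 + ...; the top coefficient joins whichever
+     sum matches the parity of k.  X(+1) and X(-1) are then the sum and
+     the difference of the two accumulators.  */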
+  xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n);
+  for (i = 4; i < k; i += 2)
+    ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+i*n, n));
+
+  tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n);
+  for (i = 5; i < k; i += 2)
+    ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+i*n, n));
+
+  if (k & 1)
+    ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+k*n, hn));
+  else
+    ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+k*n, hn));
+
+  neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1);
+  else
+    mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1);
+#else
+  if (neg)
+    mpn_sub_n (xm1, tp, xp1, n + 1);
+  else
+    mpn_sub_n (xm1, xp1, tp, n + 1);
+
+  mpn_add_n (xp1, xp1, tp, n + 1);
+#endif
+
+  ASSERT (xp1[n] <= k);
+  ASSERT (xm1[n] <= k/2 + 1);
+
+  return neg;
+}
diff --git a/third_party/gmp/mpn/generic/toom_eval_pm2.c b/third_party/gmp/mpn/generic/toom_eval_pm2.c
new file mode 100644
index 0000000..be682c7
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_eval_pm2.c
@@ -0,0 +1,130 @@
+/* mpn_toom_eval_pm2 -- Evaluate a polynomial in +2 and -2
+
+   Contributed to the GNU project by Niels Möller and Marco Bodrato
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+/* DO_addlsh2(d,a,b,n,cy) computes cy,{d,n} <- {a,n} + 4*(cy,{b,n}), it
+   can be used as DO_addlsh2(d,a,d,n,d[n]), for accumulation on {d,n+1}. */
+#if HAVE_NATIVE_mpn_addlsh2_n
+#define DO_addlsh2(d, a, b, n, cy)	\
+do {					\
+  (cy) <<= 2;				\
+  (cy) += mpn_addlsh2_n(d, a, b, n);	\
+} while (0)
+#else
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_addlsh2(d, a, b, n, cy)	\
+do {					\
+  (cy) <<= 2;				\
+  (cy) += mpn_addlsh_n(d, a, b, n, 2);	\
+} while (0)
+#else
+/* The following is not a general substitute for addlsh2.
+   It is correct if d == b, but it is not if d == a.  */
+#define DO_addlsh2(d, a, b, n, cy)	\
+do {					\
+  (cy) <<= 2;				\
+  (cy) += mpn_lshift(d, b, n, 2);	\
+  (cy) += mpn_add_n(d, d, a, n);	\
+} while (0)
+#endif
+#endif
+
+/* Evaluates a polynomial of degree 2 < k < GMP_NUMB_BITS, in the
+   points +2 and -2. */
+int
+mpn_toom_eval_pm2 (mp_ptr xp2, mp_ptr xm2, unsigned k,
+		   mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp)
+{
+  int i;
+  int neg;
+  mp_limb_t cy;
+
+  ASSERT (k >= 3);
+  ASSERT (k < GMP_NUMB_BITS);
+
+  ASSERT (hn > 0);
+  ASSERT (hn <= n);
+
+  /* The degree k is also the number of full-size coefficients, so
+   * that last coefficient, of size hn, starts at xp + k*n. */
+
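+  /* Two Horner evaluations in x^2 = 4, one per coefficient parity; e.g.
+     for k = 4, xp2 = x0 + 4*(x2 + 4*x4) and tp = x1 + 4*x3, and the
+     lshift below doubles the odd part into 2*x1 + 8*x3.  */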
+  cy = 0;
+  DO_addlsh2 (xp2, xp + (k-2) * n, xp + k * n, hn, cy);
+  if (hn != n)
+    cy = mpn_add_1 (xp2 + hn, xp + (k-2) * n + hn, n - hn, cy);
+  for (i = k - 4; i >= 0; i -= 2)
+    DO_addlsh2 (xp2, xp + i * n, xp2, n, cy);
+  xp2[n] = cy;
+
+  k--;
+
+  cy = 0;
+  DO_addlsh2 (tp, xp + (k-2) * n, xp + k * n, n, cy);
+  for (i = k - 4; i >= 0; i -= 2)
+    DO_addlsh2 (tp, xp + i * n, tp, n, cy);
+  tp[n] = cy;
+
+  if (k & 1)
+    ASSERT_NOCARRY(mpn_lshift (tp , tp , n + 1, 1));
+  else
+    ASSERT_NOCARRY(mpn_lshift (xp2, xp2, n + 1, 1));
+
+  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
+  else
+    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
+#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
+  if (neg)
+    mpn_sub_n (xm2, tp, xp2, n + 1);
+  else
+    mpn_sub_n (xm2, xp2, tp, n + 1);
+
+  mpn_add_n (xp2, xp2, tp, n + 1);
+#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
+
+  ASSERT (xp2[n] < (1<<(k+2))-1);
+  ASSERT (xm2[n] < ((1<<(k+3))-1 - (1^k&1))/3);
+
+  neg ^= ((k & 1) - 1);
+
+  return neg;
+}
+
+#undef DO_addlsh2
diff --git a/third_party/gmp/mpn/generic/toom_eval_pm2exp.c b/third_party/gmp/mpn/generic/toom_eval_pm2exp.c
new file mode 100644
index 0000000..c3c4651
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_eval_pm2exp.c
@@ -0,0 +1,127 @@
+/* mpn_toom_eval_pm2exp -- Evaluate a polynomial in +2^k and -2^k
+
+   Contributed to the GNU project by Niels Möller
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */
+int
+mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k,
+		      mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift,
+		      mp_ptr tp)
+{
+  unsigned i;
+  int neg;
+#if HAVE_NATIVE_mpn_addlsh_n
+  mp_limb_t cy;
+#endif
+
+  ASSERT (k >= 3);
+  ASSERT (shift*k < GMP_NUMB_BITS);
+
+  ASSERT (hn > 0);
+  ASSERT (hn <= n);
+
+  /* The degree k is also the number of full-size coefficients, so
+   * that last coefficient, of size hn, starts at xp + k*n. */
+
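+  /* E.g. shift = 1, k = 4: xp2 collects the even powers x0 + 4*x2 + 16*x4
+     and tp the odd ones 2*x1 + 8*x3, so that X(+2) = xp2 + tp and
+     X(-2) = xp2 - tp (up to the returned sign).  */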
+#if HAVE_NATIVE_mpn_addlsh_n
+  xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift);
+  for (i = 4; i < k; i += 2)
+    xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift);
+
+  tp[n] = mpn_lshift (tp, xp+n, n, shift);
+  for (i = 3; i < k; i+= 2)
+    tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift);
+
+  if (k & 1)
+    {
+      cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift);
+      MPN_INCR_U (tp + hn, n+1 - hn, cy);
+    }
+  else
+    {
+      cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift);
+      MPN_INCR_U (xp2 + hn, n+1 - hn, cy);
+    }
+
+#else /* !HAVE_NATIVE_mpn_addlsh_n */
+  xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift);
+  xp2[n] += mpn_add_n (xp2, xp, tp, n);
+  for (i = 4; i < k; i += 2)
+    {
+      xp2[n] += mpn_lshift (tp, xp + i*n, n, i*shift);
+      xp2[n] += mpn_add_n (xp2, xp2, tp, n);
+    }
+
+  tp[n] = mpn_lshift (tp, xp+n, n, shift);
+  for (i = 3; i < k; i+= 2)
+    {
+      tp[n] += mpn_lshift (xm2, xp + i*n, n, i*shift);
+      tp[n] += mpn_add_n (tp, tp, xm2, n);
+    }
+
+  xm2[hn] = mpn_lshift (xm2, xp + k*n, hn, k*shift);
+  if (k & 1)
+    mpn_add (tp, tp, n+1, xm2, hn+1);
+  else
+    mpn_add (xp2, xp2, n+1, xm2, hn+1);
+#endif /* !HAVE_NATIVE_mpn_addlsh_n */
+
+  neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1);
+  else
+    mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1);
+#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
+  if (neg)
+    mpn_sub_n (xm2, tp, xp2, n + 1);
+  else
+    mpn_sub_n (xm2, xp2, tp, n + 1);
+
+  mpn_add_n (xp2, xp2, tp, n + 1);
+#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
+
+  /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */
+  ASSERT ((k+1)*shift >= GMP_LIMB_BITS ||
+	  xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<<shift)-1));
+  ASSERT ((k+2)*shift >= GMP_LIMB_BITS ||
+	  xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<<shift):1))/((CNST_LIMB(1)<<(2*shift))-1));
+
+  return neg;
+}
diff --git a/third_party/gmp/mpn/generic/toom_eval_pm2rexp.c b/third_party/gmp/mpn/generic/toom_eval_pm2rexp.c
new file mode 100644
index 0000000..6cd62fb
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_eval_pm2rexp.c
@@ -0,0 +1,101 @@
+/* mpn_toom_eval_pm2rexp -- Evaluate a polynomial in +2^-k and -2^-k
+
+   Contributed to the GNU project by Marco Bodrato
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return    __cy + mpn_add_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+/* Evaluates a polynomial of degree q >= 3, in the points +2^-s and -2^-s,
+   with both values scaled by 2^(s*q). */
+int
+mpn_toom_eval_pm2rexp (mp_ptr rp, mp_ptr rm,
+		      unsigned int q, mp_srcptr ap, mp_size_t n, mp_size_t t,
+		      unsigned int s, mp_ptr ws)
+{
+  unsigned int i;
+  int neg;
+  /* {ap,q*n+t} -> {rp,n+1} {rm,n+1} , with {ws, n+1}*/
+  ASSERT (n >= t);
+  ASSERT (s != 0); /* or _eval_pm1 should be used */
+  ASSERT (q > 1);
+  ASSERT (s*q < GMP_NUMB_BITS);
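+  /* E.g. q = 3, s = 1: rp = 8*x0 + 2*x2 and ws = 4*x1 + x3 below, so that
+     rp + ws = 2^3 * X(+1/2) and rp - ws = 2^3 * X(-1/2) (up to the
+     returned sign).  */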
+  rp[n] = mpn_lshift(rp, ap, n, s*q);
+  ws[n] = mpn_lshift(ws, ap+n, n, s*(q-1));
+  if ((q & 1) != 0) {
+    ASSERT_NOCARRY(mpn_add(ws,ws,n+1,ap+n*q,t));
+    rp[n] += DO_mpn_addlsh_n(rp, ap+n*(q-1), n, s, rm);
+  } else {
+    ASSERT_NOCARRY(mpn_add(rp,rp,n+1,ap+n*q,t));
+  }
+  for (i = 2; i < q - 1; i++)
+  {
+    rp[n] += DO_mpn_addlsh_n(rp, ap+n*i, n, s*(q-i), rm);
+    i++;
+    ws[n] += DO_mpn_addlsh_n(ws, ap+n*i, n, s*(q-i), rm);
+  }
+
+  neg = (mpn_cmp (rp, ws, n + 1) < 0) ? ~0 : 0;
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  if (neg)
+    mpn_add_n_sub_n (rp, rm, ws, rp, n + 1);
+  else
+    mpn_add_n_sub_n (rp, rm, rp, ws, n + 1);
+#else /* !HAVE_NATIVE_mpn_add_n_sub_n */
+  if (neg)
+    mpn_sub_n (rm, ws, rp, n + 1);
+  else
+    mpn_sub_n (rm, rp, ws, n + 1);
+
+  ASSERT_NOCARRY (mpn_add_n (rp, rp, ws, n + 1));
+#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */
+
+  return neg;
+}
diff --git a/third_party/gmp/mpn/generic/toom_interpolate_12pts.c b/third_party/gmp/mpn/generic/toom_interpolate_12pts.c
new file mode 100644
index 0000000..347e341
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_interpolate_12pts.c
@@ -0,0 +1,370 @@
+/* Interpolation for the algorithm Toom-Cook 6.5-way.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+
+#if HAVE_NATIVE_mpn_sublsh_n
+#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return    __cy + mpn_sub_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return    __cy + mpn_add_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+#if HAVE_NATIVE_mpn_subrsh
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
+#else
+/* FIXME: This is not a correct definition, it assumes no carry */
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
+do {									\
+  mp_limb_t __cy;							\
+  MPN_DECR_U (dst, nd, src[0] >> s);					\
+  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
+  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
+} while (0)
+#endif
+
+
+#if GMP_NUMB_BITS < 21
+#error Not implemented: Both sublsh_n(,,,20) should be corrected.
+#endif
+
+#if GMP_NUMB_BITS < 16
+#error Not implemented: divexact_by42525 needs splitting.
+#endif
+
+#if GMP_NUMB_BITS < 12
+#error Not implemented: Hard to adapt...
+#endif
+
+/* FIXME: tuneup should decide the best variant */
+#ifndef AORSMUL_FASTER_AORS_AORSLSH
+#define AORSMUL_FASTER_AORS_AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_AORS_2AORSLSH
+#define AORSMUL_FASTER_AORS_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_2AORSLSH
+#define AORSMUL_FASTER_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_3AORSLSH
+#define AORSMUL_FASTER_3AORSLSH 1
+#endif
+
+#define BINVERT_9 \
+  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
+
+#define BINVERT_255 \
+  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
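+/* E.g. with GMP_NUMB_BITS == 64 these expand to the inverses modulo 2^64:
+   BINVERT_9 == 0x8E38E38E38E38E39 and BINVERT_255 == 0xFEFEFEFEFEFEFEFF,
+   i.e. 9 * BINVERT_9 == 255 * BINVERT_255 == 1 (mod 2^64).  */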
+
+  /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
+#if GMP_LIMB_BITS == 32
+#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
+#else
+#if GMP_LIMB_BITS == 64
+#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
+#endif
+#endif
+
+#ifndef mpn_divexact_by255
+#if GMP_NUMB_BITS % 8 == 0
+#define mpn_divexact_by255(dst,src,size) \
+  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
+#else
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
+#else
+#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
+#endif
+#endif
+#endif
+
+#ifndef mpn_divexact_by9x4
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
+#else
+#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
+#endif
+#endif
+
+#ifndef mpn_divexact_by42525
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
+#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
+#else
+#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
+#endif
+#endif
+
+#ifndef mpn_divexact_by2835x4
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
+#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
+#else
+#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
+#endif
+#endif
+
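+/* All the divisions below are exact: the divisors 9 << 2, 255,
+   2835 << 2 (2835 = 3^4*5*7) and 42525 (= 3^5*5^2*7) come out of the
+   exact elimination of the 12-point system, so each division can be done
+   as a multiplication by a precomputed inverse (mpn_pi1_bdiv_q_1 /
+   mpn_bdiv_dbm1) instead of a true division.  */
+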
+/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
+   points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
+   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
+   degree 11 (or 10), given the 12 (resp. 11) values:
+
+     r0 = limit at infinity of f(x) / x^11,
+     r1 = f(4),f(-4),
+     r2 = f(2),f(-2),
+     r3 = f(1),f(-1),
+     r4 = f(1/4),f(-1/4),
+     r5 = f(1/2),f(-1/2),
+     r6 = f(0).
+
+   All couples of the form f(n),f(-n) must already have been mixed with
+   toom_couple_handling(f(n),...,f(-n),...)
+
+   The result is stored in {pp, spt + 7*n (or 6*n)}.
+   At entry, r6 is stored at {pp, 2n},
+   r4 is stored at {pp + 3n, 3n + 1}.
+   r2 is stored at {pp + 7n, 3n + 1}.
+   r0 is stored at {pp +11n, spt}.
+
+   The other values are 3n+1 limbs each (with most significant limbs small).
+
+   Negative intermediate results are stored in two's complement.
+   Inputs are destroyed.
+*/
+
+void
+mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
+			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
+{
+  mp_limb_t cy;
+  mp_size_t n3;
+  mp_size_t n3p1;
+  n3 = 3 * n;
+  n3p1 = n3 + 1;
+
+#define   r4    (pp + n3)			/* 3n+1 */
+#define   r2    (pp + 7 * n)			/* 3n+1 */
+#define   r0    (pp +11 * n)			/* s+t <= 2*n */
+
+  /******************************* interpolation *****************************/
+  if (half != 0) {
+    cy = mpn_sub_n (r3, r3, r0, spt);
+    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
+
+    cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
+    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
+    DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
+
+    cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
+    MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
+    DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
+  }
+
+  r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
+  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
+#else
+  ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
+  mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
+  MP_PTR_SWAP(r1, wsi);
+#endif
+
+  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
+  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
+#else
+  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
+  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
+  MP_PTR_SWAP(r5, wsi);
+#endif
+
+  r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
+
+#if AORSMUL_FASTER_AORS_AORSLSH
+  mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
+#else
+  mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
+  DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
+#endif
+  /* A division by 2835x4 follows. Warning: the operand can be negative! */
+  mpn_divexact_by2835x4(r4, r4, n3p1);
+  if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
+    r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
+
+#if AORSMUL_FASTER_2AORSLSH
+  mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
+#else
+  DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
+  DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
+#endif
+  mpn_divexact_by255(r5, r5, n3p1);
+
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
+
+#if AORSMUL_FASTER_3AORSLSH
+  ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
+#else
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
+#endif
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
+  mpn_divexact_by42525(r1, r1, n3p1);
+
+#if AORSMUL_FASTER_AORS_2AORSLSH
+  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
+#else
+  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
+  ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
+#endif
+  mpn_divexact_by9x4(r2, r2, n3p1);
+
+  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
+
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+  mpn_rsh1sub_n (r4, r2, r4, n3p1);
+  r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
+#else
+  mpn_sub_n (r4, r2, r4, n3p1);
+  ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
+#endif
+  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
+
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+  mpn_rsh1add_n (r5, r5, r1, n3p1);
+  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
+#else
+  mpn_add_n (r5, r5, r1, n3p1);
+  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
+#endif
+
+  /* last interpolation steps... */
+  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
+  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
+  /* ... could be mixed with recomposition
+	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
+  */
+
+  /***************************** recomposition *******************************/
+  /*
+    pp[] prior to operations:
+    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
+
+    summation scheme for remaining operations:
+    |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
+    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
+	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
+  */
+
+  cy = mpn_add_n (pp + n, pp + n, r5, n);
+  cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
+#if HAVE_NATIVE_mpn_add_nc
+  cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
+#else
+  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
+  cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
+#endif
+  MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
+
+  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
+  cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
+#if HAVE_NATIVE_mpn_add_nc
+  cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
+#else
+  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
+  cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
+#endif
+  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
+
+  pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
+  if (half) {
+    cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
+#if HAVE_NATIVE_mpn_add_nc
+    if (LIKELY (spt > n)) {
+      cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
+      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
+    } else {
+      ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
+    }
+#else
+    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
+    if (LIKELY (spt > n)) {
+      cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
+      MPN_INCR_U (pp + 4 * n3, spt - n, cy);
+    } else {
+      ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
+    }
+#endif
+  } else {
+    ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
+  }
+
+#undef   r0
+#undef   r2
+#undef   r4
+}
diff --git a/third_party/gmp/mpn/generic/toom_interpolate_16pts.c b/third_party/gmp/mpn/generic/toom_interpolate_16pts.c
new file mode 100644
index 0000000..5d76bba
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_interpolate_16pts.c
@@ -0,0 +1,541 @@
+/* Interpolation for the Toom-Cook 8.5-way algorithm.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "gmp-impl.h"
+
+#if GMP_NUMB_BITS < 29
+#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
+#endif
+
+#if GMP_NUMB_BITS < 28
+#error Not implemented: divexact_by188513325 and _by182712915 will not work.
+#endif
+
+
+#if HAVE_NATIVE_mpn_sublsh_n
+#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return    __cy + mpn_sub_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+#if HAVE_NATIVE_mpn_addlsh_n
+#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift(ws,src,n,s);
+  return    __cy + mpn_add_n(dst,dst,ws,n);
+#endif
+}
+#endif
+
+#if HAVE_NATIVE_mpn_subrsh
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
+#else
+/* FIXME: This is not a correct definition, it assumes no carry */
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
+do {									\
+  mp_limb_t __cy;							\
+  MPN_DECR_U (dst, nd, src[0] >> s);					\
+  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
+  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
+} while (0)
+#endif
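+
+/* In effect DO_mpn_subrsh computes {dst,nd} -= ({src,ns} >> s).  A small
+   check with (hypothetical) 8-bit limbs, src = {0x34, 0x12} (the value
+   0x1234) and s = 4: the first term subtracts src[0] >> 4 = 0x03, the
+   second subtracts {src+1, 1} shifted left by 8-4 bits, i.e. 0x120, and
+   0x03 + 0x120 = 0x123 = 0x1234 >> 4 as required.  */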
+
+
+/* FIXME: tuneup should decide the best variant */
+#ifndef AORSMUL_FASTER_AORS_AORSLSH
+#define AORSMUL_FASTER_AORS_AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_AORS_2AORSLSH
+#define AORSMUL_FASTER_AORS_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_2AORSLSH
+#define AORSMUL_FASTER_2AORSLSH 1
+#endif
+#ifndef AORSMUL_FASTER_3AORSLSH
+#define AORSMUL_FASTER_3AORSLSH 1
+#endif
+
+#if GMP_NUMB_BITS < 43
+#define BIT_CORRECTION 1
+#define CORRECTION_BITS GMP_NUMB_BITS
+#else
+#define BIT_CORRECTION 0
+#define CORRECTION_BITS 0
+#endif
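+
+/* Rationale: a left shift by 42 bits does not fit in one limb when
+   GMP_NUMB_BITS < 43.  In that case the code below shifts by
+   42 - GMP_NUMB_BITS bits and advances the operand pointer by one limb
+   (BIT_CORRECTION), a one-limb offset being the same as a shift by
+   GMP_NUMB_BITS bits.  */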
+
+#define BINVERT_9 \
+  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
+
+#define BINVERT_255 \
+  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
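+
+/* The BINVERT_x constants are the inverses of x modulo 2^GMP_NUMB_BITS:
+   (BINVERT_x * x) mod 2^GMP_NUMB_BITS == 1, so an exact division by x
+   becomes a single multiplication per limb.  E.g. with 8-bit limbs,
+   9^-1 mod 256 = 57 (9*57 = 513 = 2*256 + 1), and 153/9 is computed as
+   153*57 mod 256 = 17.  */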
+
+  /* FIXME: find some more general expressions for inverses */
+#if GMP_LIMB_BITS == 32
+#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
+#define BINVERT_182712915 (GMP_NUMB_MASK &	CNST_LIMB(0x550659DB))
+#define BINVERT_188513325 (GMP_NUMB_MASK &	CNST_LIMB(0xFBC333A5))
+#define BINVERT_255x182712915L (GMP_NUMB_MASK &	CNST_LIMB(0x6FC4CB25))
+#define BINVERT_255x188513325L (GMP_NUMB_MASK &	CNST_LIMB(0x6864275B))
+#if GMP_NAIL_BITS == 0
+#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
+#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
+#else /* GMP_NAIL_BITS != 0 */
+#define BINVERT_255x182712915H \
+  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
+#define BINVERT_255x188513325H \
+  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
+#endif
+#else
+#if GMP_LIMB_BITS == 64
+#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
+#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
+#define BINVERT_255x182712915  (GMP_NUMB_MASK &	CNST_LIMB(0x1B649A076FC4CB25))
+#define BINVERT_255x188513325  (GMP_NUMB_MASK &	CNST_LIMB(0x06DB993A6864275B))
+#endif
+#endif
+
+#ifndef mpn_divexact_by255
+#if GMP_NUMB_BITS % 8 == 0
+#define mpn_divexact_by255(dst,src,size) \
+  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
+#else
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
+#else
+#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
+#endif
+#endif
+#endif
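+
+/* When GMP_NUMB_BITS is a multiple of 8, 255 = 2^8 - 1 divides
+   2^GMP_NUMB_BITS - 1, so the mpn_bdiv_dbm1 path (division by a divisor
+   of B-1, with the single multiplier GMP_NUMB_MASK / 255) applies and is
+   typically cheaper than a general exact division.  */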
+
+#ifndef mpn_divexact_by255x4
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
+#else
+#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
+#endif
+#endif
+
+#ifndef mpn_divexact_by9x16
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
+#else
+#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
+#endif
+#endif
+
+#ifndef mpn_divexact_by42525x16
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
+#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
+#else
+#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
+#endif
+#endif
+
+#ifndef mpn_divexact_by2835x64
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
+#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
+#else
+#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
+#endif
+#endif
+
+#ifndef  mpn_divexact_by255x182712915
+#if GMP_NUMB_BITS < 36
+#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
+/* FIXME: use mpn_bdiv_q_2_pi2 */
+#endif
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
+#define mpn_divexact_by255x182712915(dst,src,size)				\
+  do {										\
+    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);	\
+    mpn_divexact_by255(dst,dst,size);						\
+  } while(0)
+#else
+#define mpn_divexact_by255x182712915(dst,src,size)	\
+  do {							\
+    mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));	\
+    mpn_divexact_by255(dst,dst,size);			\
+  } while(0)
+#endif
+#else /* GMP_NUMB_BITS > 35 */
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
+#define mpn_divexact_by255x182712915(dst,src,size) \
+  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
+#else
+#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
+#endif
+#endif /* GMP_NUMB_BITS >?< 36 */
+#endif
+
+#ifndef  mpn_divexact_by255x188513325
+#if GMP_NUMB_BITS < 36
+#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
+/* FIXME: use mpn_bdiv_q_1_pi2 */
+#endif
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
+#define mpn_divexact_by255x188513325(dst,src,size)			\
+  do {									\
+    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);	\
+    mpn_divexact_by255(dst,dst,size);					\
+  } while(0)
+#else
+#define mpn_divexact_by255x188513325(dst,src,size)	\
+  do {							\
+    mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));	\
+    mpn_divexact_by255(dst,dst,size);			\
+  } while(0)
+#endif
+#else /* GMP_NUMB_BITS > 35 */
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
+#define mpn_divexact_by255x188513325(dst,src,size) \
+  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
+#else
+#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
+#endif
+#endif /* GMP_NUMB_BITS >?< 36 */
+#endif
+
+/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
+   points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
+   +-1/8, 0. More precisely, we want to compute
+   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
+   14), given the 16 (resp. 15) values:
+
+     r0 = limit at infinity of f(x) / x^15,
+     r1 = f(8),f(-8),
+     r2 = f(4),f(-4),
+     r3 = f(2),f(-2),
+     r4 = f(1),f(-1),
+     r5 = f(1/4),f(-1/4),
+     r6 = f(1/2),f(-1/2),
+     r7 = f(1/8),f(-1/8),
+     r8 = f(0).
+
+   All pairs of the form f(n),f(-n) must already be mixed with
+   toom_couple_handling(f(n),...,f(-n),...).
+
+   The result is stored in {pp, spt + 7*n (or 8*n)}.
+   At entry, r8 is stored at {pp, 2n},
+   r6 is stored at {pp + 3n, 3n + 1},
+   r4 is stored at {pp + 7n, 3n + 1},
+   r2 is stored at {pp +11n, 3n + 1},
+   r0 is stored at {pp +15n, spt}.
+
+   The other values are 3n+1 limbs each (with most significant limbs small).
+
+   Negative intermediate results are stored in two's complement.
+   Inputs are destroyed.
+*/
+
+void
+mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
+			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
+{
+  mp_limb_t cy;
+  mp_size_t n3;
+  mp_size_t n3p1;
+  n3 = 3 * n;
+  n3p1 = n3 + 1;
+
+#define   r6    (pp + n3)			/* 3n+1 */
+#define   r4    (pp + 7 * n)			/* 3n+1 */
+#define   r2    (pp +11 * n)			/* 3n+1 */
+#define   r0    (pp +15 * n)			/* s+t <= 2*n */
+
+  ASSERT (spt <= 2 * n);
+  /******************************* interpolation *****************************/
+  if (half != 0) {
+    cy = mpn_sub_n (r4, r4, r0, spt);
+    MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
+
+    cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
+    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
+    DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
+
+    cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
+    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
+    DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
+
+    cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
+#if BIT_CORRECTION
+    cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
+		    n3p1 - spt - BIT_CORRECTION, cy);
+    ASSERT (BIT_CORRECTION > 0 || cy == 0);
+    /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
+    cy = r7[n3p1];
+    r7[n3p1] = 0x80;
+#else
+    MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
+#endif
+    DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
+#if BIT_CORRECTION
+    /* FIXME: assumes r7[n3p1] is writable. */
+    ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
+    r7[n3p1] = cy;
+#endif
+  }
+
+  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
+  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
+#else
+  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
+  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
+  MP_PTR_SWAP(r5, wsi);
+#endif
+
+  r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
+  DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
+#else
+  ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
+  mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
+  MP_PTR_SWAP(r3, wsi);
+#endif
+
+  cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
+#if BIT_CORRECTION
+  MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
+  cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
+  cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
+  ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
+#else
+  r7[n3] -= cy;
+  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
+#endif
+
+#if HAVE_NATIVE_mpn_add_n_sub_n
+  mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
+#else
+  mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
+  mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
+  MP_PTR_SWAP(r7, wsi);
+#endif
+
+  r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
+
+#if AORSMUL_FASTER_2AORSLSH
+  mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
+#else
+  DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
+  DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
+#endif
+
+  mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
+#if AORSMUL_FASTER_3AORSLSH
+  mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
+#else
+  DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
+  DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
+  DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
+#endif
+  mpn_divexact_by255x188513325(r7, r7, n3p1);
+
+  mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
+  /* A division by 2835x64 follows. Warning: the operand can be negative! */
+  mpn_divexact_by2835x64(r5, r5, n3p1);
+  if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
+    r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
+
+#if AORSMUL_FASTER_AORS_AORSLSH
+  mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
+#else
+  mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
+  DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
+#endif
+#if AORSMUL_FASTER_2AORSLSH
+  mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
+#else
+  DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
+  DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
+#endif
+  /* A division by 255x4 follows. Warning: the operand can be negative! */
+  mpn_divexact_by255x4(r6, r6, n3p1);
+  if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
+    r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
+
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
+
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
+  ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
+
+  /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/
+  DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
+  mpn_submul_1 (r1, r2, n3p1, 1428);
+  mpn_submul_1 (r1, r3, n3p1, 112896);
+  mpn_divexact_by255x182712915(r1, r1, n3p1);
+
+  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
+  mpn_divexact_by42525x16(r2, r2, n3p1);
+
+#if AORSMUL_FASTER_AORS_2AORSLSH
+  ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
+#else
+  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
+  ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
+  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
+#endif
+  ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
+  mpn_divexact_by9x16(r3, r3, n3p1);
+
+  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
+  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
+  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
+
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+  mpn_rsh1add_n (r6, r2, r6, n3p1);
+  r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
+#else
+  mpn_add_n (r6, r2, r6, n3p1);
+  ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
+#endif
+  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
+
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+  mpn_rsh1sub_n (r5, r3, r5, n3p1);
+  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
+#else
+  mpn_sub_n (r5, r3, r5, n3p1);
+  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
+#endif
+  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
+
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+  mpn_rsh1add_n (r7, r1, r7, n3p1);
+  r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
+#else
+  mpn_add_n (r7, r1, r7, n3p1);
+  ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
+#endif
+  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
+
+  /* last interpolation steps... */
+  /* ... could be mixed with recomposition
+	||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
+  */
+
+  /***************************** recomposition *******************************/
+  /*
+    pp[] prior to operations:
+    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
+
+    summation scheme for remaining operations:
+    |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
+    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
+	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
+  */
+
+  cy = mpn_add_n (pp + n, pp + n, r7, n);
+  cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
+#if HAVE_NATIVE_mpn_add_nc
+  cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
+#else
+  MPN_INCR_U (r7 + 2 * n, n + 1, cy);
+  cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
+#endif
+  MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
+
+  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
+  cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
+#if HAVE_NATIVE_mpn_add_nc
+  cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
+#else
+  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
+  cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
+#endif
+  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
+
+  pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
+  cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
+#if HAVE_NATIVE_mpn_add_nc
+  cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
+#else
+  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
+  cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
+#endif
+  MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
+
+  pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
+  if (half) {
+    cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
+#if HAVE_NATIVE_mpn_add_nc
+    if (LIKELY (spt > n)) {
+      cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
+      MPN_INCR_U (pp + 16 * n, spt - n, cy);
+    } else {
+      ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
+    }
+#else
+    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
+    if (LIKELY (spt > n)) {
+      cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
+      MPN_INCR_U (pp + 16 * n, spt - n, cy);
+    } else {
+      ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
+    }
+#endif
+  } else {
+    ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
+  }
+
+#undef   r0
+#undef   r2
+#undef   r4
+#undef   r6
+}
diff --git a/third_party/gmp/mpn/generic/toom_interpolate_5pts.c b/third_party/gmp/mpn/generic/toom_interpolate_5pts.c
new file mode 100644
index 0000000..466ab85
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_interpolate_5pts.c
@@ -0,0 +1,198 @@
+/* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42.
+
+   Contributed to the GNU project by Robert Harley.
+   Improvements by Paul Zimmermann and Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2000-2003, 2005-2007, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
+			   mp_size_t k, mp_size_t twor, int sa,
+			   mp_limb_t vinf0)
+{
+  mp_limb_t cy, saved;
+  mp_size_t twok;
+  mp_size_t kk1;
+  mp_ptr c1, v1, c3, vinf;
+
+  twok = k + k;
+  kk1 = twok + 1;
+
+  c1 = c  + k;
+  v1 = c1 + k;
+  c3 = v1 + k;
+  vinf = c3 + k;
+
+#define v0 (c)
+  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1  1) =
+     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3  3  0)
+  */
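+  /* Notation: rows like (16 8 4 2 1) list the weights of the product
+     coefficients (a4 a3 a2 a1 a0) in each value, e.g. v2 = f(2) =
+     16*a4 + 8*a3 + 4*a2 + 2*a1 + a0 and vm1 = f(-1) = a4 - a3 + a2 - a1
+     + a0; their difference (15 9 3 3 0) has only nonnegative weights,
+     which is why the combination above is nonnegative.  */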
+  if (sa)
+    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
+  else
+    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
+
+  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */
+
+  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
+						      /* (5 3 1 1 0)*/
+
+  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+       v0       v1      hi(vinf)       |vm1|     (v2-vm1)/3    EMPTY */
+
+  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
+     tm1 >= 0                                         (0  1 0  1 0)
+     No carry comes out from {v1, kk1} +/- {vm1, kk1},
+     and the division by two is exact.
+     If (sa!=0) the sign of vm1 is negative */
+  if (sa)
+    {
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+      mpn_rsh1add_n (vm1, v1, vm1, kk1);
+#else
+      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
+      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
+#endif
+    }
+  else
+    {
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
+#else
+      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
+      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
+#endif
+    }
+
+  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
+
+  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
+     t1 >= 0
+  */
+  vinf[0] -= mpn_sub_n (v1, v1, c, twok);
+
+  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
+
+  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
+     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
+  */
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+  mpn_rsh1sub_n (v2, v2, v1, kk1);
+#else
+  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
+  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
+#endif
+
+  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
+       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */
+
+  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
+     result is v1 >= 0
+  */
+  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
+
+  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
+  cy = mpn_add_n (c1, c1, vm1, kk1);
+  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
+  /* Memory allocated for vm1 is now free, it can be recycled ...*/
+
+  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
+     result is v2 >= 0 */
+  saved = vinf[0];       /* Remember v1's highest limb (will be overwritten). */
+  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
+#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1
+  cy = mpn_sublsh1_n_ip1 (v2, vinf, twor);
+#else
+  /* Overwrite unused vm1 */
+  cy = mpn_lshift (vm1, vinf, twor, 1);
+  cy += mpn_sub_n (v2, v2, vm1, twor);
+#endif
+  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
+
+  /* Current matrix is
+     [1 0 0 0 0; vinf
+      0 1 0 0 0; v2
+      1 0 1 0 0; v1
+      0 1 0 1 0; vm1
+      0 0 0 0 1] v0
+     Some values already are in-place (we added vm1 in the correct position)
+     | vinf|  v1 |  v0 |
+	      | vm1 |
+     One still is in a separated area
+	| +v2 |
+     We have to compute v1-=vinf; vm1 -= v2,
+	   |-vinf|
+	      | -v2 |
+     By carefully reordering the operations we can avoid computing the sum
+     of the high half of v2 plus the low half of vinf twice.
+  */
+
+  /* Add the high half of t2 in {vinf} */
+  if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
+    cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
+    MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
+  } else { /* triggered only by very unbalanced cases like
+	      (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
+    ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
+  }
+  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
+     result is >= 0 */
+  /* Side effect: we also subtracted (high half) vm1 -= v2 */
+  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
+  vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
+  vinf[0] = saved;
+  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last limbs.       */
+
+  /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
+     Operate only on the low half.
+  */
+  cy = mpn_sub_n (c1, c1, v2, k);
+  MPN_DECR_U (v1, kk1, cy);
+
+  /********************* Beginning the final phase **********************/
+
+  /* Most of the recomposition was done */
+
+  /* add t2 in {c+3k, ...}, but only the low half */
+  cy = mpn_add_n (c3, c3, v2, k);
+  vinf[0] += cy;
+  ASSERT(vinf[0] >= cy); /* No carry */
+  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
+
+#undef v0
+}
diff --git a/third_party/gmp/mpn/generic/toom_interpolate_6pts.c b/third_party/gmp/mpn/generic/toom_interpolate_6pts.c
new file mode 100644
index 0000000..eb23661
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_interpolate_6pts.c
@@ -0,0 +1,241 @@
+/* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#define BINVERT_3 MODLIMB_INVERSE_3
+
+/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
+#ifndef mpn_divexact_by3
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
+#else
+#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
+#endif
+#endif
+
+/* Interpolation for Toom-3.5, using the evaluation points: infinity,
+   1, -1, 2, -2. More precisely, we want to compute
+   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
+   six values
+
+     w5 = f(0),
+     w4 = f(-1),
+     w3 = f(1),
+     w2 = f(-2),
+     w1 = f(2),
+     w0 = limit at infinity of f(x) / x^5.
+
+   The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
+   {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
+   {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
+   significant limbs small). f(-1) and f(-2) may be negative, signs
+   determined by the flag bits. All intermediate results are positive.
+   Inputs are destroyed.
+
+   Interpolation sequence was taken from the paper: "Integer and
+   Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
+   Some slight variations were introduced: adaptation to "gmp
+   instruction set", and a final saving of an operation by interlacing
+   interpolation and recomposition phases.
+*/
+
+void
+mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
+			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
+			   mp_size_t w0n)
+{
+  mp_limb_t cy;
+  /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
+  mp_limb_t cy4, cy6, embankment;
+
+  ASSERT( n > 0 );
+  ASSERT( 2*n >= w0n && w0n > 0 );
+
+#define w5  pp					/* 2n   */
+#define w3  (pp + 2 * n)			/* 2n+1 */
+#define w0  (pp + 5 * n)			/* w0n  */
+
+  /* Interpolate with sequence:
+     W2 =(W1 - W2)>>2
+     W1 =(W1 - W5)>>1
+     W1 =(W1 - W2)>>1
+     W4 =(W3 - W4)>>1
+     W2 =(W2 - W4)/3
+     W3 = W3 - W4 - W5
+     W1 =(W1 - W3)/3
+     // Last steps are mixed with recomposition...
+     W2 = W2 - W0<<2
+     W4 = W4 - W2
+     W3 = W3 - W1
+     W2 = W2 - W0
+  */
+
+  /* W2 =(W1 - W2)>>2 */
+  if (flags & toom6_vm2_neg)
+    mpn_add_n (w2, w1, w2, 2 * n + 1);
+  else
+    mpn_sub_n (w2, w1, w2, 2 * n + 1);
+  mpn_rshift (w2, w2, 2 * n + 1, 2);
+
+  /* W1 =(W1 - W5)>>1 */
+  w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
+  mpn_rshift (w1, w1, 2 * n + 1, 1);
+
+  /* W1 =(W1 - W2)>>1 */
+#if HAVE_NATIVE_mpn_rsh1sub_n
+  mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
+#else
+  mpn_sub_n (w1, w1, w2, 2 * n + 1);
+  mpn_rshift (w1, w1, 2 * n + 1, 1);
+#endif
+
+  /* W4 =(W3 - W4)>>1 */
+  if (flags & toom6_vm1_neg)
+    {
+#if HAVE_NATIVE_mpn_rsh1add_n
+      mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
+#else
+      mpn_add_n (w4, w3, w4, 2 * n + 1);
+      mpn_rshift (w4, w4, 2 * n + 1, 1);
+#endif
+    }
+  else
+    {
+#if HAVE_NATIVE_mpn_rsh1sub_n
+      mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
+#else
+      mpn_sub_n (w4, w3, w4, 2 * n + 1);
+      mpn_rshift (w4, w4, 2 * n + 1, 1);
+#endif
+    }
+
+  /* W2 =(W2 - W4)/3 */
+  mpn_sub_n (w2, w2, w4, 2 * n + 1);
+  mpn_divexact_by3 (w2, w2, 2 * n + 1);
+
+  /* W3 = W3 - W4 - W5 */
+  mpn_sub_n (w3, w3, w4, 2 * n + 1);
+  w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
+
+  /* W1 =(W1 - W3)/3 */
+  mpn_sub_n (w1, w1, w3, 2 * n + 1);
+  mpn_divexact_by3 (w1, w1, 2 * n + 1);
+
+  /*
+    [1 0 0 0 0 0;
+     0 1 0 0 0 0;
+     1 0 1 0 0 0;
+     0 1 0 1 0 0;
+     1 0 1 0 1 0;
+     0 0 0 0 0 1]
+
+    pp[] prior to operations:
+     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
+
+    summation scheme for remaining operations:
+     |______________5|n_____4|n_____3|n_____2|n______|n______|pp
+     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
+				    || H w4  | L w4  |
+		    || H w2  | L w2  |
+	    || H w1  | L w1  |
+			    ||-H w1  |-L w1  |
+		     |-H w0  |-L w0 ||-H w2  |-L w2  |
+  */
+  cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
+  MPN_INCR_U (pp + 3 * n + 1, n, cy);
+
+  /* W2 -= W0<<2 */
+#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
+#if HAVE_NATIVE_mpn_sublsh2_n_ip1
+  cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
+#else
+  cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
+#endif
+#else
+  /* {W4,2*n+1} is now free and can be overwritten. */
+  cy = mpn_lshift(w4, w0, w0n, 2);
+  cy+= mpn_sub_n(w2, w2, w4, w0n);
+#endif
+  MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
+
+  /* W4L = W4L - W2L */
+  cy = mpn_sub_n (pp + n, pp + n, w2, n);
+  MPN_DECR_U (w3, 2 * n + 1, cy);
+
+  /* W3H = W3H + W2L */
+  cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
+  /* W1L + W2H */
+  cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
+  MPN_INCR_U (w1 + n, n + 1, cy);
+
+  /* W0 = W0 + W1H */
+  if (LIKELY (w0n > n))
+    cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
+  else
+    cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
+
+  /*
+    summation scheme for the next operation:
+     |...____5|n_____4|n_____3|n_____2|n______|n______|pp
+     |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
+		     ...-w0___|-w1_w2 |
+  */
+  /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
+  cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
+
+  /* embankment is a "dirty trick" to avoid carry/borrow propagation
+     beyond allocated memory */
+  embankment = w0[w0n - 1] - 1;
+  w0[w0n - 1] = 1;
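+  /* With this limb forced to 1, a borrow reaching it (MPN_DECR_U) stops
+     at 1 -> 0 and a carry (MPN_INCR_U) stops at 1 -> 2, so nothing
+     propagates past the allocated w0n limbs; adding back embankment =
+     original - 1 at the end restores the true value.  */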
+  if (LIKELY (w0n > n)) {
+    if (cy4 > cy6)
+      MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
+    else
+      MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
+    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
+    MPN_INCR_U (w0 + n, w0n - n, cy6);
+  } else {
+    MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
+    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
+  }
+  w0[w0n - 1] += embankment;
+
+#undef w5
+#undef w3
+#undef w0
+
+}
diff --git a/third_party/gmp/mpn/generic/toom_interpolate_7pts.c b/third_party/gmp/mpn/generic/toom_interpolate_7pts.c
new file mode 100644
index 0000000..167c45b
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_interpolate_7pts.c
@@ -0,0 +1,274 @@
+/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
+
+   Contributed to the GNU project by Niels Möller.
+   Improvements by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2006, 2007, 2009, 2014, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#define BINVERT_3 MODLIMB_INVERSE_3
+
+#define BINVERT_9 \
+  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
+
+#define BINVERT_15 \
+  ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
+
+/* For the various mpn_divexact_byN here, fall back to using either
+   mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is
+   much faster if it is native.  For now, since mpn_divexact_1 is native on
+   several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
+   mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */
+
+/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
+#ifndef mpn_divexact_by3
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
+#else
+#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
+#endif
+#endif
+
+#ifndef mpn_divexact_by9
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
+#else
+#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
+#endif
+#endif
+
+#ifndef mpn_divexact_by15
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
+#else
+#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
+#endif
+#endif
+
+/* Interpolation for toom4, using the evaluation points 0, infinity,
+   1, -1, 2, -2, 1/2. More precisely, we want to compute
+   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
+   seven values
+
+     w0 = f(0),
+     w1 = f(-2),
+     w2 = f(1),
+     w3 = f(-1),
+     w4 = f(2),
+     w5 = 64 * f(1/2),
+     w6 = limit at infinity of f(x) / x^6.
+
+   The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
+   w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
+   w6n }. The other values are 2n + 1 limbs each (with most
+   significant limbs small). f(-1) and f(-2) may be negative, signs
+   determined by the flag bits. Inputs are destroyed.
+
+   Needs (2*n + 1) limbs of temporary storage.
+*/
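+
+/* The scratch {tp, 2n+1} is used below first for 16*W6 (w6n+1 limbs) and
+   later for 8*W3 (2n+1 limbs); the latter is what sets the requirement
+   above.  */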
+
+void
+mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
+			   mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
+			   mp_size_t w6n, mp_ptr tp)
+{
+  mp_size_t m;
+  mp_limb_t cy;
+
+  m = 2*n + 1;
+#define w0 rp
+#define w2 (rp + 2*n)
+#define w6 (rp + 6*n)
+
+  ASSERT (w6n > 0);
+  ASSERT (w6n <= 2*n);
+
+  /* Using formulas similar to Marco Bodrato's
+
+     W5 = W5 + W4
+     W1 =(W4 - W1)/2
+     W4 = W4 - W0
+     W4 =(W4 - W1)/4 - W6*16
+     W3 =(W2 - W3)/2
+     W2 = W2 - W3
+
+     W5 = W5 - W2*65      May be negative.
+     W2 = W2 - W6 - W0
+     W5 =(W5 + W2*45)/2   Now >= 0 again.
+     W4 =(W4 - W2)/3
+     W2 = W2 - W4
+
+     W1 = W5 - W1         May be negative.
+     W5 =(W5 - W3*8)/9
+     W3 = W3 - W5
+     W1 =(W1/15 + W5)/2   Now >= 0 again.
+     W5 = W5 - W1
+
+     where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
+	   W4 = f(2), W5 = f(1/2), W6 = f(oo),
+
+     Note that most intermediate results are positive; the ones that
+     may be negative are represented in two's complement. We must
+     never shift right a value that may be negative, since that would
+     invalidate the sign bit. On the other hand, divexact by odd
+     numbers work fine with two's complement.
+  */
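+
+  /* Shift hazard, concretely with 8-bit limbs: -6 is stored as 0xFA, and
+     a logical right shift by 1 gives 0x7D = 125, not -3 (0xFD).  Exact
+     division by the odd 3 remains safe: 3^-1 mod 256 = 171, and
+     0xFA * 171 mod 256 = 0xFE = -2, the correct quotient.  */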
+
+  mpn_add_n (w5, w5, w4, m);
+  if (flags & toom7_w1_neg)
+    {
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+      mpn_rsh1add_n (w1, w1, w4, m);
+#else
+      mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
+      mpn_rshift (w1, w1, m, 1);
+#endif
+    }
+  else
+    {
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+      mpn_rsh1sub_n (w1, w4, w1, m);
+#else
+      mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
+      mpn_rshift (w1, w1, m, 1);
+#endif
+    }
+  mpn_sub (w4, w4, m, w0, 2*n);
+  mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
+  mpn_rshift (w4, w4, m, 2); /* w4>=0 */
+
+  tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
+  mpn_sub (w4, w4, m, tp, w6n+1);
+
+  if (flags & toom7_w3_neg)
+    {
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+      mpn_rsh1add_n (w3, w3, w2, m);
+#else
+      mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
+      mpn_rshift (w3, w3, m, 1);
+#endif
+    }
+  else
+    {
+#ifdef HAVE_NATIVE_mpn_rsh1sub_n
+      mpn_rsh1sub_n (w3, w2, w3, m);
+#else
+      mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
+      mpn_rshift (w3, w3, m, 1);
+#endif
+    }
+
+  mpn_sub_n (w2, w2, w3, m);
+
+  mpn_submul_1 (w5, w2, m, 65);
+  mpn_sub (w2, w2, m, w6, w6n);
+  mpn_sub (w2, w2, m, w0, 2*n);
+
+  mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
+  mpn_rshift (w5, w5, m, 1);
+  mpn_sub_n (w4, w4, w2, m);
+
+  mpn_divexact_by3 (w4, w4, m);
+  mpn_sub_n (w2, w2, w4, m);
+
+  mpn_sub_n (w1, w5, w1, m);
+  mpn_lshift (tp, w3, m, 3);
+  mpn_sub_n (w5, w5, tp, m);
+  mpn_divexact_by9 (w5, w5, m);
+  mpn_sub_n (w3, w3, w5, m);
+
+  mpn_divexact_by15 (w1, w1, m);
+#ifdef HAVE_NATIVE_mpn_rsh1add_n
+  mpn_rsh1add_n (w1, w1, w5, m);
+  w1[m - 1] &= GMP_NUMB_MASK >> 1;
+#else
+  mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
+  mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
+#endif
+
+  mpn_sub_n (w5, w5, w1, m);
+
+  /* These bounds are valid for the 4x4 polynomial product of toom44,
+   * and they are conservative for toom53 and toom62. */
+  ASSERT (w1[2*n] < 2);
+  ASSERT (w2[2*n] < 3);
+  ASSERT (w3[2*n] < 4);
+  ASSERT (w4[2*n] < 3);
+  ASSERT (w5[2*n] < 2);
+
+  /* Addition chain. Note carries and the 2n'th limbs that need to be
+   * added in.
+   *
+   * Special care is needed for w2[2n] and the corresponding carry,
+   * since the "simple" way of adding it all together would overwrite
+   * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
+   * the high half of w3 and the low half of w4.
+   *
+   *         7    6    5    4    3    2    1    0
+   *    |    |    |    |    |    |    |    |    |
+   *                  ||w3 (2n+1)|
+   *             ||w4 (2n+1)|
+   *        ||w5 (2n+1)|        ||w1 (2n+1)|
+   *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
+   *  -----------------------------------------------
+   *  r |    |    |    |    |    |    |    |    |
+   *        c7   c6   c5   c4   c3                 Carries to propagate
+   */
+
+  cy = mpn_add_n (rp + n, rp + n, w1, m);
+  MPN_INCR_U (w2 + n + 1, n , cy);
+  cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
+  MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
+  cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
+  MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
+  cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
+  MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
+  if (w6n > n + 1)
+    {
+      cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
+      MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);
+    }
+  else
+    {
+      ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
+#if WANT_ASSERT
+      {
+	mp_size_t i;
+	for (i = w6n; i <= n; i++)
+	  ASSERT (w5[n + i] == 0);
+      }
+#endif
+    }
+}
diff --git a/third_party/gmp/mpn/generic/toom_interpolate_8pts.c b/third_party/gmp/mpn/generic/toom_interpolate_8pts.c
new file mode 100644
index 0000000..5e65fab
--- /dev/null
+++ b/third_party/gmp/mpn/generic/toom_interpolate_8pts.c
@@ -0,0 +1,211 @@
+/* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72.
+
+   Contributed to the GNU project by Marco Bodrato.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#define BINVERT_3 MODLIMB_INVERSE_3
+
+#define BINVERT_15 \
+  ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
+
+#define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK)
+
+#ifndef mpn_divexact_by3
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
+#else
+#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
+#endif
+#endif
+
+#ifndef mpn_divexact_by45
+#if GMP_NUMB_BITS % 12 == 0
+#define mpn_divexact_by45(dst,src,size) \
+  (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45)))
+#else
+#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
+#define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0)
+#else
+#define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45)
+#endif
+#endif
+#endif
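+
+/* 45 divides 2^12 - 1 = 4095 = 45 * 91, so 45 divides B-1 whenever
+   GMP_NUMB_BITS is a multiple of 12, enabling the mpn_bdiv_dbm1 path
+   with the single multiplier GMP_NUMB_MASK / 45.  */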
+
+#if HAVE_NATIVE_mpn_sublsh2_n_ip1
+#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n)
+#else
+#define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws)
+#endif
+
+#if HAVE_NATIVE_mpn_sublsh_n
+#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s)
+#else
+static mp_limb_t
+DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
+{
+#if USE_MUL_1 && 0
+  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
+#else
+  mp_limb_t __cy;
+  __cy = mpn_lshift (ws,src,n,s);
+  return __cy + mpn_sub_n (dst,dst,ws,n);
+#endif
+}
+#endif
+
+
+#if HAVE_NATIVE_mpn_subrsh
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s)
+#else
+/* This is not a correct definition, it assumes no carry */
+#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
+do {									\
+  mp_limb_t __cy;							\
+  MPN_DECR_U (dst, nd, src[0] >> s);					\
+  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
+  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
+} while (0)
+#endif
+
+/* Interpolation for Toom-4.5 (or Toom-4), using the evaluation
+   points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely,
+   we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
+   degree 7 (or 6), given the 8 (rsp. 7) values:
+
+     r1 = limit at infinity of f(x) / x^7,
+     r2 = f(4),
+     r3 = f(-4),
+     r4 = f(2),
+     r5 = f(-2),
+     r6 = f(1),
+     r7 = f(-1),
+     r8 = f(0).
+
+   All pairs of the form f(n),f(-n) must already be mixed with
+   toom_couple_handling(f(n),...,f(-n),...).
+
+   The result is stored in {pp, spt + 7*n (or 6*n)}.
+   At entry, r8 is stored at {pp, 2n},
+   r5 is stored at {pp + 3n, 3n + 1}.
+
+   The other values are 2n+... limbs each (with most significant limbs small).
+
+   All intermediate results are positive.
+   Inputs are destroyed.
+*/
+
+void
+mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n,
+			   mp_ptr r3, mp_ptr r7,
+			   mp_size_t spt, mp_ptr ws)
+{
+  mp_limb_signed_t cy;
+  mp_ptr r5, r1;
+  r5 = (pp + 3 * n);			/* 3n+1 */
+  r1 = (pp + 7 * n);			/* spt */
+
+  /******************************* interpolation *****************************/
+
+  DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws);
+  cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws);
+  MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy);
+
+  DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws);
+  cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws);
+  MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy);
+
+  r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n);
+  cy = mpn_sub_n (r7, r7, r1, spt);
+  MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy);
+
+  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
+  ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2));
+
+  ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1));
+
+  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1));
+
+  mpn_divexact_by45 (r3, r3, 3 * n + 1);
+
+  ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1));
+
+  ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws));
+
+  /* last interpolation steps... */
+  /* ... are mixed with recomposition */
+
+  /***************************** recomposition *******************************/
+  /*
+    pp[] prior to operations:
+     |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp
+
+    summation scheme for remaining operations:
+     |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp
+     |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp
+	  ||_H r3|_M r3|_L*r3|
+				  ||_H_r7|_M_r7|_L_r7|
+		      ||-H r3|-M r3|-L*r3|
+				  ||-H*r5|-M_r5|-L_r5|
+  */
+
+  cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */
+  cy-= mpn_sub_n (pp + n, pp + n, r5, n);
+  if (cy > 0) {
+    MPN_INCR_U (r7 + n, 2*n + 1, 1);
+    cy = 0;
+  }
+
+  cy = mpn_sub_nc (pp + 2*n, r7 + n, r5 + n, n, -cy); /* Mr7-Mr5 */
+  MPN_DECR_U (r7 + 2*n, n + 1, cy);
+
+  cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */
+  r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */
+  cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */
+  if (UNLIKELY(0 > cy))
+    MPN_DECR_U (r5 + n + 1, 2*n, 1);
+  else
+    MPN_INCR_U (r5 + n + 1, 2*n, cy);
+
+  ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */
+
+  cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]);
+  MPN_INCR_U (r3 + 2*n, n + 1, cy);
+  cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n);
+  if (LIKELY(spt != n))
+    MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]);
+  else
+    ASSERT (r3[3*n] + cy == 0);
+}
diff --git a/third_party/gmp/mpn/generic/trialdiv.c b/third_party/gmp/mpn/generic/trialdiv.c
new file mode 100644
index 0000000..65e089f
--- /dev/null
+++ b/third_party/gmp/mpn/generic/trialdiv.c
@@ -0,0 +1,131 @@
+/* mpn_trialdiv -- find small factors of an mpn number using trial division.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+   This function finds the first (smallest) factor represented in
+   trialdivtab.h.  It does not stop the factoring effort just because it has
+   reached some sensible limit, such as the square root of the input number.
+
+   The caller can limit the factoring effort by passing NPRIMES.  The function
+   will then divide until that limit, or perhaps a few primes more.  A position
+   which only mpn_trialdiv can make sense of is returned in the WHERE
+   parameter.  It can be used for restarting the factoring effort; the first
+   call should pass 0 here.
+
+   Input:        1. A non-negative number T = {tp,tn}
+                 2. NPRIMES as described above,
+                 3. *WHERE as described above.
+   Output:       1. *WHERE updated as described above.
+                 2. Return value is non-zero if we found a factor, else zero.
+                    To get the actual prime factor, compute the mod B inverse
+                    of the return value.
+*/
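+
+/* A hedged usage sketch (assuming the caller has already divided out
+   factors of two, the tables holding odd primes): the return value is
+   the found factor's inverse mod B, so the factor itself is recovered
+   with the binvert_limb macro from gmp-impl.h:
+
+     int where = 0;
+     mp_limb_t ret, p;
+     ret = mpn_trialdiv (tp, tn, nprimes, &where);
+     if (ret != 0)
+       binvert_limb (p, ret);    (now p is the smallest factor found)
+*/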
+
+#include "gmp-impl.h"
+
+struct gmp_primes_dtab {
+  mp_limb_t binv;
+  mp_limb_t lim;
+};
+
+struct gmp_primes_ptab {
+  mp_limb_t ppp;	/* primes, multiplied together */
+  mp_limb_t cps[7];	/* ppp values pre-computed for mpn_mod_1s_4p */
+  gmp_uint_least32_t idx:24;	/* index of the first prime in dtab */
+  gmp_uint_least32_t np :8;	/* number of primes related to this entry */
+};
+
+
+static const struct gmp_primes_dtab gmp_primes_dtab[] =
+{
+#define WANT_dtab
+#define P(p,inv,lim) {inv,lim}
+#include "trialdivtab.h"
+#undef WANT_dtab
+#undef P
+  {0,0}
+};
+
+static const struct gmp_primes_ptab gmp_primes_ptab[] =
+{
+#define WANT_ptab
+#include "trialdivtab.h"
+#undef WANT_ptab
+};
+
+#define PTAB_LINES (sizeof (gmp_primes_ptab) / sizeof (gmp_primes_ptab[0]))
+
+/* FIXME: We could optimize out one of the outer loop conditions if we
+   had a final ptab entry with a huge np field.  */
+mp_limb_t
+mpn_trialdiv (mp_srcptr tp, mp_size_t tn, mp_size_t nprimes, int *where)
+{
+  mp_limb_t ppp;
+  const mp_limb_t *cps;
+  const struct gmp_primes_dtab *dp;
+  long i, j, idx, np;
+  mp_limb_t r, q;
+
+  ASSERT (tn >= 1);
+
+  for (i = *where; i < PTAB_LINES; i++)
+    {
+      ppp = gmp_primes_ptab[i].ppp;
+      cps = gmp_primes_ptab[i].cps;
+
+      r = mpn_mod_1s_4p (tp, tn, ppp << cps[1], cps);
+
+      idx = gmp_primes_ptab[i].idx;
+      np = gmp_primes_ptab[i].np;
+
+      /* Check divisibility by individual primes.  */
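+      /* For odd p, p divides r exactly when (r * binv) mod B falls in
+         [0, lim]: multiplication by binv = p^-1 mod B permutes [0,B)
+         and maps the multiples of p onto that initial segment, where
+         lim = floor ((B-1)/p).  */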
+      dp = &gmp_primes_dtab[idx] + np;
+      for (j = -np; j < 0; j++)
+	{
+	  q = r * dp[j].binv;
+	  if (q <= dp[j].lim)
+	    {
+	      *where = i;
+	      return dp[j].binv;
+	    }
+	}
+
+      nprimes -= np;
+      if (nprimes <= 0)
+	return 0;
+    }
+  return 0;
+}
diff --git a/third_party/gmp/mpn/generic/udiv_w_sdiv.c b/third_party/gmp/mpn/generic/udiv_w_sdiv.c
new file mode 100644
index 0000000..7907135
--- /dev/null
+++ b/third_party/gmp/mpn/generic/udiv_w_sdiv.c
@@ -0,0 +1,141 @@
+/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed
+   division.
+
+   Contributed by Peter L. Montgomery.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY SAFE
+   TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
+   ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE
+   GNU MP RELEASE.
+
+
+Copyright 1992, 1994, 1996, 2000, 2011, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+mp_limb_t
+mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d)
+{
+  mp_limb_t q, r;
+  mp_limb_t c0, c1, b1;
+
+  ASSERT (d != 0);
+  ASSERT (a1 < d);
+
+  if ((mp_limb_signed_t) d >= 0)
+    {
+      if (a1 < d - a1 - (a0 >> (GMP_LIMB_BITS - 1)))
+	{
+	  /* dividend, divisor, and quotient are nonnegative */
+	  sdiv_qrnnd (q, r, a1, a0, d);
+	}
+      else
+	{
+	  /* Compute c1*B + c0 = a1*B + a0 - (B/2)*d, with B = 2^GMP_LIMB_BITS */
+	  sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (GMP_LIMB_BITS - 1));
+	  /* Divide (c1*B + c0) by d */
+	  sdiv_qrnnd (q, r, c1, c0, d);
+	  /* Add B/2 to quotient */
+	  q += (mp_limb_t) 1 << (GMP_LIMB_BITS - 1);
+	}
+    }
+  else
+    {
+      b1 = d >> 1;			/* d/2, between B/4 and B/2 - 1 */
+      c1 = a1 >> 1;			/* A/2 */
+      c0 = (a1 << (GMP_LIMB_BITS - 1)) + (a0 >> 1);
+
+      if (a1 < b1)			/* A < B*b1, so A/2 < (B/2)*b1 */
+	{
+	  sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+	  r = 2*r + (a0 & 1);		/* Remainder from A/(2*b1) */
+	  if ((d & 1) != 0)
+	    {
+	      if (r >= q)
+		r = r - q;
+	      else if (q - r <= d)
+		{
+		  r = r - q + d;
+		  q--;
+		}
+	      else
+		{
+		  r = r - q + 2*d;
+		  q -= 2;
+		}
+	    }
+	}
+      else if (c1 < b1)			/* So B/2 <= (A/2)/b1 < B */
+	{
+	  c1 = (b1 - 1) - c1;
+	  c0 = ~c0;			/* logical NOT */
+
+	  sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */
+
+	  q = ~q;			/* (A/2)/b1 */
+	  r = (b1 - 1) - r;
+
+	  r = 2*r + (a0 & 1);		/* A/(2*b1) */
+
+	  if ((d & 1) != 0)
+	    {
+	      if (r >= q)
+		r = r - q;
+	      else if (q - r <= d)
+		{
+		  r = r - q + d;
+		  q--;
+		}
+	      else
+		{
+		  r = r - q + 2*d;
+		  q -= 2;
+		}
+	    }
+	}
+      else				/* Implies c1 = b1 */
+	{				/* Hence a1 = d - 1 = 2*b1 - 1 */
+	  if (a0 >= -d)
+	    {
+	      q = -CNST_LIMB(1);
+	      r = a0 + d;
+	    }
+	  else
+	    {
+	      q = -CNST_LIMB(2);
+	      r = a0 + 2*d;
+	    }
+	}
+    }
+
+  *rp = r;
+  return q;
+}
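+
+/* Editorial sketch of the contract (not GMP text):  On return,
+   q = floor ((a1*B + a0) / d) and *rp = a1*B + a0 - q*d, where
+   B = 2^GMP_LIMB_BITS.  A caller-side sanity check could read:
+
+	mp_limb_t q, r;
+	q = mpn_udiv_w_sdiv (&r, a1, a0, d);
+	ASSERT (r < d);
+	ASSERT (q * d + r == a0);	low limb wraps back to a0 mod B
+*/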
diff --git a/third_party/gmp/mpn/generic/zero.c b/third_party/gmp/mpn/generic/zero.c
new file mode 100644
index 0000000..1a05453
--- /dev/null
+++ b/third_party/gmp/mpn/generic/zero.c
@@ -0,0 +1,41 @@
+/* mpn_zero
+
+Copyright 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+void
+mpn_zero (mp_ptr rp, mp_size_t n)
+{
+  mp_size_t i;
+
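+  /* Point rp just past the area and run i from -n up to 0, so the loop
+     condition is a comparison against zero (editorial comment).  */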
+  rp += n;
+  for (i = -n; i != 0; i++)
+    rp[i] = 0;
+}
diff --git a/third_party/gmp/mpn/generic/zero_p.c b/third_party/gmp/mpn/generic/zero_p.c
new file mode 100644
index 0000000..c92f9b8
--- /dev/null
+++ b/third_party/gmp/mpn/generic/zero_p.c
@@ -0,0 +1,33 @@
+/* mpn_zero_p (x,xsize) -- Return 1 if X is zero, 0 if it is non-zero.
+
+Copyright 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
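+/* gmp.h provides mpn_zero_p as an inline function; defining
+   __GMP_FORCE_mpn_zero_p before including gmp-impl.h is believed to make
+   it emit a real, non-inline copy of the function in this file
+   (editorial note).  */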
+#define __GMP_FORCE_mpn_zero_p 1
+
+#include "gmp-impl.h"
diff --git a/third_party/gmp/mpn/ia64/README b/third_party/gmp/mpn/ia64/README
new file mode 100644
index 0000000..45c2d63
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/README
@@ -0,0 +1,281 @@
+Copyright 2000-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+                      IA-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for the IA-64 architecture.
+
+
+CODE ORGANIZATION
+
+	mpn/ia64          Itanium 2 and generic IA-64
+
+The code here has been optimized primarily for Itanium 2.  Very few Itanium 1
+chips were ever sold, and Itanium 2 is more powerful, so the latter is what
+we concentrate on.
+
+
+
+CHIP NOTES
+
+The IA-64 ISA groups instructions three at a time into 128-bit bundles.
+Programmers/compilers need to insert explicit breaks `;;' (stops) when
+there are WAW or RAW dependencies, with some notable exceptions.  Such
+breaks are typically at the end of a bundle, but can be put between
+operations within some bundle types too.
+
+The Itanium 1 and Itanium 2 implementations can under ideal conditions
+execute two bundles per cycle.  The Itanium 1 allows 4 of these instructions
+to do integer operations, while the Itanium 2 allows all 6 to be integer
+operations.
+
+Taken cloop branches seem to insert a bubble into the pipeline most of the
+time on Itanium 1.
+
+Loads to the fp registers bypass the L1 cache and thus get extremely long
+latencies, 9 cycles on the Itanium 1 and 6 cycles on the Itanium 2.
+
+The software pipelining support using the br.ctop instruction causes
+delays, since many issue slots are taken up by instructions with zero
+predicates, and since many extra instructions are needed to set things up.
+These features are clearly designed for code density, not speed.
+
+Misc pipeline limitations (Itanium 1):
+* The getf.sig instruction can only execute in M0.
+* At most four integer instructions/cycle.
+* Nops take up resources like any plain instructions.
+
+Misc pipeline limitations (Itanium 2):
+* The getf.sig instruction can only execute in M0.
+* Nops take up resources like any plain instructions.
+
+
+ASSEMBLY SYNTAX
+
+.align pads with nops in a text segment, but gas 2.14 and earlier
+incorrectly byte-swaps its nop bundle in big endian mode (eg. hpux), making
+it come out as break instructions.  We use the ALIGN() macro in
+mpn/ia64/ia64-defs.m4 at places where execution might flow across the
+alignment point.  That macro suppresses any .align if the problem is
+detected by configure.  Lack of alignment might hurt performance but will
+at least be correct.
+
+foo:: to create a global symbol is not accepted by gas.  Use separate
+".global foo" and "foo:" instead.
+
+.global is the standard global directive.  gas accepts .globl, but hpux "as"
+doesn't.
+
+.proc / .endp generates the appropriate .type and .size information for ELF,
+so the latter directives don't need to be given explicitly.
+
+.pred.rel "mutex"... is standard for annotating predicate register
+relationships.  gas also accepts .pred.rel.mutex, but hpux "as" doesn't.
+
+.pred directives can't be put on a line with a label, like
+".Lfoo: .pred ..."; the HP assembler on HP-UX 11.23 rejects that.
+gas is happy with it, and past HP assembler versions seemed OK.
+
+// is the standard comment sequence, but we prefer "C" since it inhibits m4
+macro expansion.  See comments in ia64-defs.m4.
+
+
+REGISTER USAGE
+
+Special:
+   r0: constant 0
+   r1: global pointer (gp)
+   r8: return value
+   r12: stack pointer (sp)
+   r13: thread pointer (tp)
+Caller-saves: r8-r11 r14-r31 f6-f15 f32-f127
+Caller-saves but rotating: r32-
+
+
+================================================================
+mpn_add_n, mpn_sub_n:
+
+The current code runs at 1.25 c/l on Itanium 2.
+
+================================================================
+mpn_mul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+Using a blocked approach, working off of 4 separate places in the operands,
+one could make use of the xma accumulation, and approach 1 c/l.
+
+	ldf8 [up]
+	xma.l
+	xma.hu
+	stf8  [wrp]
+
+================================================================
+mpn_addmul_1:
+
+The current code runs at 2 c/l on Itanium 2.
+
+It seems possible to use a blocked approach, as with mpn_mul_1.  We should
+read rp[] into integer registers, allowing for just one getf.sig per cycle.
+
+	ld8  [rp]
+	ldf8 [up]
+	xma.l
+	xma.hu
+	getf.sig
+	add+add+cmp+cmp
+	st8  [wrp]
+
+These 10 instructions can be scheduled to approach 1.667 cycles/limb, and
+with the 4-cycle latency of xma, this means we need at least 3 blocks.
+Using ldfp8 we could approach 1.583 c/l.
+
+================================================================
+mpn_submul_1:
+
+The current code runs at 2.25 c/l on Itanium 2.  Getting to 2 c/l requires
+ldfp8 with all the alignment headache that implies.
+
+================================================================
+mpn_addmul_N
+
+For best speed, we need to give up using mpn_addmul_2 as the main multiply
+building block, and instead take multiple v limbs per loop.  For the Itanium
+1, we need to take about 8 limbs at a time for full speed.  For the Itanium
+2, something like mpn_addmul_4 should be enough.
+
+The add+cmp+cmp+add we use in the other routines is optimal for shortening
+recurrences (1 cycle), but the sequence takes up 4 execution slots.  When
+recurrence depth is not critical, a more standard 3-cycle add+cmp+add is
+better; see the C sketch after the cycle-count table below.
+
+/* First load the 8 values from v */
+	ldfp8		v0, v1 = [r35], 16;;
+	ldfp8		v2, v3 = [r35], 16;;
+	ldfp8		v4, v5 = [r35], 16;;
+	ldfp8		v6, v7 = [r35], 16;;
+
+/* In the inner loop, get a new U limb and store a result limb. */
+	mov		lc = un
+Loop:	ldf8		u0 = [r33], 8
+	ld8		r0 = [r32]
+	xma.l		lp0 = v0, u0, hp0
+	xma.hu		hp0 = v0, u0, hp0
+	xma.l		lp1 = v1, u0, hp1
+	xma.hu		hp1 = v1, u0, hp1
+	xma.l		lp2 = v2, u0, hp2
+	xma.hu		hp2 = v2, u0, hp2
+	xma.l		lp3 = v3, u0, hp3
+	xma.hu		hp3 = v3, u0, hp3
+	xma.l		lp4 = v4, u0, hp4
+	xma.hu		hp4 = v4, u0, hp4
+	xma.l		lp5 = v5, u0, hp5
+	xma.hu		hp5 = v5, u0, hp5
+	xma.l		lp6 = v6, u0, hp6
+	xma.hu		hp6 = v6, u0, hp6
+	xma.l		lp7 = v7, u0, hp7
+	xma.hu		hp7 = v7, u0, hp7
+	getf.sig	l0 = lp0
+	getf.sig	l1 = lp1
+	getf.sig	l2 = lp2
+	getf.sig	l3 = lp3
+	getf.sig	l4 = lp4
+	getf.sig	l5 = lp5
+	getf.sig	l6 = lp6
+	add+cmp+add	xx, l0, r0
+	add+cmp+add	acc0, acc1, l1
+	add+cmp+add	acc1, acc2, l2
+	add+cmp+add	acc2, acc3, l3
+	add+cmp+add	acc3, acc4, l4
+	add+cmp+add	acc4, acc5, l5
+	add+cmp+add	acc5, acc6, l6
+	getf.sig	acc6 = lp7
+	st8		[r32] = xx, 8
+	br.cloop Loop
+
+	49 insn at max 6 insn/cycle:		8.167 cycles/8 limbs
+	11 memops at max 2 memops/cycle:	5.5 cycles/8 limbs
+	16 fpops at max 2 fpops/cycle:		8 cycles/8 limbs
+	21 intops at max 4 intops/cycle:	5.25 cycles/8 limbs
+	11+21 memops+intops at max 4/cycle:	8 cycles/8 limbs
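+
+As a plain C sketch (editorial, with names invented here), the operation
+the add+cmp sequences implement is a limb addition with carry in and
+carry out:
+
+	mp_limb_t
+	add_with_carry (mp_limb_t *sp, mp_limb_t a, mp_limb_t b, mp_limb_t ci)
+	{
+	  mp_limb_t s = a + b;	/* may wrap around */
+	  mp_limb_t co = s < a;	/* carry from a + b */
+	  s += ci;		/* fold in carry-in (0 or 1) */
+	  co += s < ci;		/* at most one of the two carries occurs */
+	  *sp = s;
+	  return co;		/* 0 or 1 */
+	}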
+
+================================================================
+mpn_lshift, mpn_rshift
+
+The current code runs at 1 cycle/limb on Itanium 2.
+
+Using 63 separate loops, we could use the double-word shrp instruction.
+That instruction has a plain single-cycle latency.  We need 63 loops since
+the instruction only accepts an immediate shift count.  That would lead to
+a somewhat silly code size, but the speed would be 0.75 c/l on Itanium 2
+(by using shrp each cycle plus shl/shr going down I1 for a further limb
+every second cycle).
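+
+For reference (an editorial C sketch, not GMP code), the per-limb operation
+shrp would perform here is a double-word funnel shift; mpn_rshift by cnt,
+for 1 <= cnt <= GMP_LIMB_BITS - 1, computes
+
+	rp[i] = (up[i] >> cnt) | (up[i + 1] << (GMP_LIMB_BITS - cnt));
+
+which is exactly one shrp per limb once the count is fixed into the
+instruction.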
+
+================================================================
+mpn_copyi, mpn_copyd
+
+The current code runs at 0.5 c/l on Itanium 2, but only for L1 cache hits.
+The 4-way unrolled loop takes just 2 cycles, and thus load-use scheduling
+isn't great.  It might be best to use modulo-scheduled loops, since those
+would allow better load-use scheduling without too much unrolling.
+
+Depending on size or operand alignment, we get 1 c/l or 0.5 c/l on Itanium
+2, according to tune/speed.  Cache bank conflicts?
+
+
+
+REFERENCES
+
+Intel Itanium Architecture Software Developer's Manual, volumes 1 to 3,
+Intel document 245317-004, 245318-004, 245319-004 October 2002.  Volume 1
+includes an Itanium optimization guide.
+
+Intel Itanium Processor-specific Application Binary Interface (ABI), Intel
+document 245370-003, May 2001.  Describes C type sizes, dynamic linking,
+etc.
+
+Intel Itanium Architecture Assembly Language Reference Guide, Intel document
+248801-004, 2000-2002.  Describes assembly instruction syntax and other
+directives.
+
+Itanium Software Conventions and Runtime Architecture Guide, Intel document
+245358-003, May 2001.  Describes calling conventions, including stack
+unwinding requirements.
+
+Intel Itanium Processor Reference Manual for Software Optimization, Intel
+document 245473-003, November 2001.
+
+Intel Itanium-2 Processor Reference Manual for Software Development and
+Optimization, Intel document 251110-003, May 2004.
+
+All the above documents can be found online at
+
+    http://developer.intel.com/design/itanium/manuals.htm
diff --git a/third_party/gmp/mpn/ia64/add_n_sub_n.asm b/third_party/gmp/mpn/ia64/add_n_sub_n.asm
new file mode 100644
index 0000000..c15afaa
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/add_n_sub_n.asm
@@ -0,0 +1,307 @@
+dnl  IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    2.25
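+
+C The computed operation, as a plain C sketch (editorial note; the return
+C value is believed to pack the add carry and subtract borrow together):
+C
+C	mp_limb_t cy, bw;
+C	cy = mpn_add_n (sp, up, vp, n);		C sp[] = up[] + vp[]
+C	bw = mpn_sub_n (dp, up, vp, n);		C dp[] = up[] - vp[]
+C	return 2 * cy + bw;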
+
+C INPUT PARAMETERS
+define(`sp', `r32')
+define(`dp', `r33')
+define(`up', `r34')
+define(`vp', `r35')
+define(`n',  `r36')
+
+C Some useful aliases for registers we use
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
+define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27')
+define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31')
+define(`up0',`up')
+define(`up1',`r14')
+define(`vp0',`vp')
+define(`vp1',`r15')
+
+
+ASM_START()
+PROLOGUE(mpn_add_n_sub_n)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	sp = 0, sp		C				M I
+	addp4	dp = 0, dp		C				M I
+	nop.i	0
+	addp4	up = 0, up		C				M I
+	addp4	vp = 0, vp		C				M I
+	zxt4	n = n			C				I
+	;;
+')
+
+	and	r9 = 3, n		C				M I
+	mov.i	r2 = ar.lc		C				I0
+	add	up1 = 8, up0		C				M I
+	add	vp1 = 8, vp0		C				M I
+	add	r8 = -2, n		C				M I
+	add	r10 = 256, up		C				M I
+	;;
+	shr.u	r8 = r8, 2		C				I0
+	cmp.eq	p10, p0 = 0, r9		C				M I
+	cmp.eq	p11, p0 = 2, r9		C				M I
+	cmp.eq	p12, p0 = 3, r9		C				M I
+	add	r11 = 256, vp		C				M I
+	;;
+	mov.i	ar.lc = r8		C				I0
+  (p10)	br	L(b0)			C				B
+  (p11)	br	L(b2)			C				B
+  (p12)	br	L(b3)			C				B
+
+L(b1):	ld8	u3 = [up0], 8		C				M01
+	add	up1 = 8, up1		C				M I
+	cmpltu	p14, p15 = 4, n		C				M I
+	ld8	v3 = [vp0], 8		C				M01
+	add	vp1 = 8, vp1		C				M I
+	;;
+	add	s3 = u3, v3		C				M I
+	sub	d3 = u3, v3		C				M I
+	mov	r8 = 0			C				M I
+	;;
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+  (p15)	br	L(cj1)			C				B
+	st8	[sp] = s3, 8		C				M23
+	st8	[dp] = d3, 8		C				M23
+	br	L(c0)			C				B
+
+L(b0):	cmp.ne	p9, p0 = r0, r0		C				M I
+	cmp.ne	p13, p0 = r0, r0	C				M I
+L(c0):	ld8	u0 = [up0], 16		C				M01
+	ld8	u1 = [up1], 16		C				M01
+	;;
+	ld8	v0 = [vp0], 16		C				M01
+	ld8	v1 = [vp1], 16		C				M01
+	;;
+	ld8	u2 = [up0], 16		C				M01
+	ld8	u3 = [up1], 16		C				M01
+	;;
+	ld8	v2 = [vp0], 16		C				M01
+	ld8	v3 = [vp1], 16		C				M01
+	;;
+	add	s0 = u0, v0		C				M I
+	add	s1 = u1, v1		C				M I
+	sub	d0 = u0, v0		C				M I
+	sub	d1 = u1, v1		C				M I
+	;;
+	cmpltu	p6, p0 = s0, v0		C  carry from add0		M I
+	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
+	cmpltu	p10, p0 = u0, v0	C borrow from sub0		M I
+	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
+	;;
+	nop	0			C
+	br.cloop.dptk	L(top)		C				B
+	br	L(end)			C				B
+
+L(b3):	ld8	u1 = [up0], 8		C				M01
+	add	up1 = 8, up1		C				M I
+	ld8	v1 = [vp0], 8		C				M01
+	;;
+	add	vp1 = 8, vp1		C				M I
+	add	s1 = u1, v1		C				M I
+	sub	d1 = u1, v1		C				M I
+	;;
+	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
+	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
+	;;
+	st8	[sp] = s1, 8		C				M23
+	st8	[dp] = d1, 8		C				M23
+	br	L(c2)			C				B
+
+	ALIGN(32)
+L(b2):	cmp.ne	p7, p0 = r0, r0		C				M I
+	cmp.ne	p11, p0 = r0, r0	C				M I
+	nop	0
+L(c2):	ld8	u2 = [up0], 16		C				M01
+	ld8	u3 = [up1], 16		C				M01
+	cmpltu	p14, p0 = 4, n		C				M I
+	;;
+	ld8	v2 = [vp0], 16		C				M01
+	ld8	v3 = [vp1], 16		C				M01
+  (p14)	br	L(gt4)			C				B
+	;;
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	br	L(cj2)			C				B
+	;;
+L(gt4):	ld8	u0 = [up0], 16		C				M01
+	ld8	u1 = [up1], 16		C				M01
+	;;
+	ld8	v0 = [vp0], 16		C				M01
+	ld8	v1 = [vp1], 16		C				M01
+	;;
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	br.cloop.dptk	L(mid)		C				B
+
+	ALIGN(32)
+L(top):
+	ld8	u0 = [up0], 16		C				M01
+	ld8	u1 = [up1], 16		C				M01
+   (p9)	cmpeqor	p6, p0 = -1, s0		C				M I
+   (p9)	add	s0 = 1, s0		C				M I
+  (p13)	cmpeqor	p10, p0 = 0, d0		C				M I
+  (p13)	add	d0 = -1, d0		C				M I
+	;;
+	ld8	v0 = [vp0], 16		C				M01
+	ld8	v1 = [vp1], 16		C				M01
+   (p6)	cmpeqor	p7, p0 = -1, s1		C				M I
+   (p6)	add	s1 = 1, s1		C				M I
+  (p10)	cmpeqor	p11, p0 = 0, d1		C				M I
+  (p10)	add	d1 = -1, d1		C				M I
+	;;
+	st8	[sp] = s0, 8		C				M23
+	st8	[dp] = d0, 8		C				M23
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	st8	[sp] = s1, 8		C				M23
+	st8	[dp] = d1, 8		C				M23
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	;;
+L(mid):
+	ld8	u2 = [up0], 16		C				M01
+	ld8	u3 = [up1], 16		C				M01
+   (p7)	cmpeqor	p8, p0 = -1, s2		C				M I
+   (p7)	add	s2 = 1, s2		C				M I
+  (p11)	cmpeqor	p12, p0 = 0, d2		C				M I
+  (p11)	add	d2 = -1, d2		C				M I
+	;;
+	ld8	v2 = [vp0], 16		C				M01
+	ld8	v3 = [vp1], 16		C				M01
+   (p8)	cmpeqor	p9, p0 = -1, s3		C				M I
+   (p8)	add	s3 = 1, s3		C				M I
+  (p12)	cmpeqor	p13, p0 = 0, d3		C				M I
+  (p12)	add	d3 = -1, d3		C				M I
+	;;
+	st8	[sp] = s2, 8		C				M23
+	st8	[dp] = d2, 8		C				M23
+	add	s0 = u0, v0		C				M I
+	add	s1 = u1, v1		C				M I
+	sub	d0 = u0, v0		C				M I
+	sub	d1 = u1, v1		C				M I
+	;;
+	st8	[sp] = s3, 8		C				M23
+	st8	[dp] = d3, 8		C				M23
+	cmpltu	p6, p0 = s0, v0		C  carry from add0		M I
+	cmpltu	p7, p0 = s1, v1		C  carry from add1		M I
+	cmpltu	p10, p0 = u0, v0	C borrow from sub0		M I
+	cmpltu	p11, p0 = u1, v1	C borrow from sub1		M I
+	;;
+	lfetch	[r10], 32		C				M?
+	lfetch	[r11], 32		C				M?
+	br.cloop.dptk	L(top)		C				B
+	;;
+
+L(end):
+	nop	0
+	nop	0
+   (p9)	cmpeqor	p6, p0 = -1, s0		C				M I
+   (p9)	add	s0 = 1, s0		C				M I
+  (p13)	cmpeqor	p10, p0 = 0, d0		C				M I
+  (p13)	add	d0 = -1, d0		C				M I
+	;;
+	nop	0
+	nop	0
+   (p6)	cmpeqor	p7, p0 = -1, s1		C				M I
+   (p6)	add	s1 = 1, s1		C				M I
+  (p10)	cmpeqor	p11, p0 = 0, d1		C				M I
+  (p10)	add	d1 = -1, d1		C				M I
+	;;
+	st8	[sp] = s0, 8		C				M23
+	st8	[dp] = d0, 8		C				M23
+	add	s2 = u2, v2		C				M I
+	add	s3 = u3, v3		C				M I
+	sub	d2 = u2, v2		C				M I
+	sub	d3 = u3, v3		C				M I
+	;;
+	st8	[sp] = s1, 8		C				M23
+	st8	[dp] = d1, 8		C				M23
+	cmpltu	p8, p0 = s2, v2		C  carry from add2		M I
+	cmpltu	p9, p0 = s3, v3		C  carry from add3		M I
+	cmpltu	p12, p0 = u2, v2	C borrow from sub2		M I
+	cmpltu	p13, p0 = u3, v3	C borrow from sub3		M I
+	;;
+L(cj2):
+   (p7)	cmpeqor	p8, p0 = -1, s2		C				M I
+   (p7)	add	s2 = 1, s2		C				M I
+  (p11)	cmpeqor	p12, p0 = 0, d2		C				M I
+  (p11)	add	d2 = -1, d2		C				M I
+	mov	r8 = 0			C				M I
+	nop	0
+	;;
+	st8	[sp] = s2, 8		C				M23
+	st8	[dp] = d2, 8		C				M23
+   (p8)	cmpeqor	p9, p0 = -1, s3		C				M I
+   (p8)	add	s3 = 1, s3		C				M I
+  (p12)	cmpeqor	p13, p0 = 0, d3		C				M I
+  (p12)	add	d3 = -1, d3		C				M I
+	;;
+L(cj1):
+   (p9)	mov	r8 = 2			C				M I
+	;;
+	mov.i	ar.lc = r2		C				I0
+  (p13)	add	r8 = 1, r8		C				M I
+	st8	[sp] = s3		C				M23
+	st8	[dp] = d3		C				M23
+	br.ret.sptk.many b0		C				B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/addmul_1.asm b/third_party/gmp/mpn/ia64/addmul_1.asm
new file mode 100644
index 0000000..ffa3297
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/addmul_1.asm
@@ -0,0 +1,602 @@
+dnl  IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    3.0
+C Itanium 2:  2.0
+
+C TODO
+C  * Further optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Use a 1 c/l carry propagation scheme in wind-down code.
+C  * Use extra pointer registers for `up' and `rp' to speed up feed-in loads.
+C  * Work out final differences with mul_1.asm.  That function is 300 bytes
+C    smaller than this due to better loop scheduling and thus simpler feed-in
+C    code.
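+C
+C For reference, the operation implemented here is, as a plain C sketch
+C (editorial note, not GMP's generic code):
+C
+C	mp_limb_t cy = 0, hi, lo;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    umul_ppmm (hi, lo, up[i], vl);	C hi:lo = up[i] * vl
+C	    lo += cy;    hi += lo < cy;		C propagate carry-in
+C	    lo += rp[i]; hi += lo < rp[i];	C add in rp[i]
+C	    rp[i] = lo;  cy = hi;
+C	  }
+C	return cy;				C high limb of the update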
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmi
+	adds		r15 = -1, n		C M I
+	mov		r20 = rp		C M I
+	mov.i		r2 = ar.lc		C I0
+}
+{.mmi
+	ldf8		f7 = [up], 8		C M
+	ldf8		f8 = [rp], 8		C M
+	and		r14 = 3, n		C M I
+	;;
+}
+{.mmi
+	setf.sig	f6 = vl			C M2 M3
+	cmp.eq		p10, p0 = 0, r14	C M I
+	shr.u		r31 = r15, 2		C I0
+}
+{.mmi
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	nop.i		0			C I
+	;;
+}
+{.mii
+	cmp.ne		p6, p7 = r0, r0		C M I
+	mov.i		ar.lc = r31		C I0
+	cmp.ne		p8, p9 = r0, r0		C M I
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	br.cloop.dptk	.grt1			C B
+
+	xma.l		f39 = f7, f6, f8	C F
+	xma.hu		f43 = f7, f6, f8	C F
+	;;
+	getf.sig	r8 = f43		C M2
+	stf8		[r20] = f39		C M2 M3
+	mov.i		ar.lc = r2		C I0
+	br.ret.sptk.many b0			C B
+
+.grt1:
+	ldf8		f32 = [up], 8
+	ldf8		f44 = [rp], 8
+	;;
+	ldf8		f33 = [up], 8
+	ldf8		f45 = [rp], 8
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f7, f6, f8
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f7, f6, f8
+	;;
+	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt5
+
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	stf8		[r20] = f39, 8
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43
+	getf.sig	r24 = f36
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40
+	getf.sig	r25 = f37
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41
+	getf.sig	r26 = f38
+	br		.Lcj5
+
+.grt5:
+	mov		r30 = 0
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r31 = f43
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	getf.sig	r24 = f36
+	;;
+	getf.sig	r28 = f40
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	getf.sig	r25 = f37
+	br.cloop.dptk	.Loop
+	br		.Le0
+
+
+.Lb10:	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt2
+
+	xma.l		f38 = f7, f6, f8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r30 = f42
+	stf8		[r20] = f38, 8
+	getf.sig	r27 = f39
+	getf.sig	r8 = f43
+	br		.Lcj2
+
+.grt2:
+	ldf8		f32 = [up], 8
+	ldf8		f44 = [rp], 8
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f7, f6, f8
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt6
+
+	stf8		[r20] = f38, 8
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	getf.sig	r30 = f42
+	getf.sig	r27 = f39
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43
+	getf.sig	r24 = f36
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40
+	getf.sig	r25 = f37
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	br		.Lcj6
+
+.grt6:
+	mov		r29 = 0
+	xma.l		f36 = f32, f6, f44
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	getf.sig	r26 = f38
+	;;
+	getf.sig	r30 = f42
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r31 = f43
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	getf.sig	r24 = f36
+	br		.LL10
+
+
+.Lb11:	ldf8		f34 = [up], 8
+	ldf8		f46 = [rp], 8
+	;;
+	ldf8		f35 = [up], 8
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt3
+	;;
+
+	xma.l		f37 = f7, f6, f8
+	xma.hu		f41 = f7, f6, f8
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41
+	stf8		[r20] = f37, 8
+	getf.sig	r26 = f38
+	getf.sig	r30 = f42
+	getf.sig	r27 = f39
+	getf.sig	r8 = f43
+	br		.Lcj3
+
+.grt3:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f7, f6, f8
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f7, f6, f8
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	getf.sig	r25 = f37		C FIXME
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt7
+
+	getf.sig	r29 = f41
+	stf8		[r20] = f37, 8		C FIXME
+	xma.l		f36 = f32, f6, f44
+	getf.sig	r26 = f38
+	xma.hu		f40 = f32, f6, f44
+	;;
+	getf.sig	r30 = f42
+	xma.l		f37 = f33, f6, f45
+	getf.sig	r27 = f39
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43
+	xma.l		f38 = f34, f6, f46
+	getf.sig	r24 = f36
+	xma.hu		f42 = f34, f6, f46
+	br		.Lcj7
+
+.grt7:
+	getf.sig	r29 = f41
+	xma.l		f36 = f32, f6, f44
+	mov		r28 = 0
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	getf.sig	r26 = f38
+	;;
+	getf.sig	r30 = f42
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	getf.sig	r27 = f39
+	br		.LL11
+
+
+.Lb00:	ldf8		f33 = [up], 8
+	ldf8		f45 = [rp], 8
+	;;
+	ldf8		f34 = [up], 8
+	ldf8		f46 = [rp], 8
+	;;
+	ldf8		f35 = [up], 8
+	xma.l		f36 = f7, f6, f8
+	ldf8		f47 = [rp], 8
+	xma.hu		f40 = f7, f6, f8
+	br.cloop.dptk	.grt4
+
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40
+	stf8		[r20] = f36, 8
+	xma.l		f39 = f35, f6, f47
+	getf.sig	r25 = f37
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41
+	getf.sig	r26 = f38
+	getf.sig	r30 = f42
+	getf.sig	r27 = f39
+	br		.Lcj4
+
+.grt4:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f45
+	ldf8		f44 = [rp], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f46
+	ldf8		f45 = [rp], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f34 = [up], 8
+	getf.sig	r24 = f36		C FIXME
+	xma.l		f39 = f35, f6, f47
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f35 = [up], 8
+	getf.sig	r25 = f37
+	ldf8		f47 = [rp], 8
+	br.cloop.dptk	.grt8
+
+	getf.sig	r29 = f41
+	stf8		[r20] = f36, 8		C FIXME
+	xma.l		f36 = f32, f6, f44
+	getf.sig	r26 = f38
+	getf.sig	r30 = f42
+	xma.hu		f40 = f32, f6, f44
+	;;
+	xma.l		f37 = f33, f6, f45
+	getf.sig	r27 = f39
+	xma.hu		f41 = f33, f6, f45
+	br		.Lcj8
+
+.grt8:
+	getf.sig	r29 = f41
+	xma.l		f36 = f32, f6, f44
+	mov		r31 = 0
+	xma.hu		f40 = f32, f6, f44
+	;;
+	ldf8		f32 = [up], 8
+	getf.sig	r26 = f38
+	br		.LL00
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)				C insn	fed	cycle #
+.Loop:
+	.pred.rel "mutex", p6, p7		C num	by	i1 i2
+	getf.sig	r29 = f41		C 00	16	0   0
+	xma.l		f36 = f32, f6, f44	C 01	06,15	0   0
+   (p6)	add		r14 = r30, r27, 1	C 02		0   0
+	ldf8		f47 = [rp], 8		C 03		0   0
+	xma.hu		f40 = f32, f6, f44	C 04	06,15	0   0
+   (p7)	add		r14 = r30, r27		C 05		0   0
+	;;
+	.pred.rel "mutex", p6, p7
+	ldf8		f32 = [up], 8		C 06		1   1
+   (p6)	cmp.leu		p8, p9 = r14, r27	C 07		1   1
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C 08		1   1
+	getf.sig	r26 = f38		C 09	25	2   1
+	st8		[r20] = r14, 8		C 10		2   1
+	nop.b		0			C 11		2   1
+	;;
+.LL00:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r30 = f42		C 12	28	3   2
+	xma.l		f37 = f33, f6, f45	C 13	18,27	3   2
+   (p8)	add		r16 = r31, r24, 1	C 14		3   2
+	ldf8		f44 = [rp], 8		C 15		3   2
+	xma.hu		f41 = f33, f6, f45	C 16	18,27	3   2
+   (p9)	add		r16 = r31, r24		C 17		3   2
+	;;
+	.pred.rel "mutex", p8, p9
+	ldf8		f33 = [up], 8		C 18		4   3
+   (p8)	cmp.leu		p6, p7 = r16, r24	C 19		4   3
+   (p9)	cmp.ltu		p6, p7 = r16, r24	C 20		4   3
+	getf.sig	r27 = f39		C 21	37	5   3
+	st8		[r20] = r16, 8		C 22		5   3
+	nop.b		0			C 23		5   3
+	;;
+.LL11:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r31 = f43		C 24	40	6   4
+	xma.l		f38 = f34, f6, f46	C 25	30,39	6   4
+   (p6)	add		r14 = r28, r25, 1	C 26		6   4
+	ldf8		f45 = [rp], 8		C 27		6   4
+	xma.hu		f42 = f34, f6, f46	C 28	30,39	6   4
+   (p7)	add		r14 = r28, r25		C 29		6   4
+	;;
+	.pred.rel "mutex", p6, p7
+	ldf8		f34 = [up], 8		C 30		7   5
+   (p6)	cmp.leu		p8, p9 = r14, r25	C 31		7   5
+   (p7)	cmp.ltu		p8, p9 = r14, r25	C 32		7   5
+	getf.sig	r24 = f36		C 33	01	8   5
+	st8		[r20] = r14, 8		C 34		8   5
+	nop.b		0			C 35		8   5
+	;;
+.LL10:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r28 = f40		C 36	04	9   6
+	xma.l		f39 = f35, f6, f47	C 37	42,03	9   6
+   (p8)	add		r16 = r29, r26, 1	C 38		9   6
+	ldf8		f46 = [rp], 8		C 39		9   6
+	xma.hu		f43 = f35, f6, f47	C 40	42,03	9   6
+   (p9)	add		r16 = r29, r26		C 41		9   6
+	;;
+	.pred.rel "mutex", p8, p9
+	ldf8		f35 = [up], 8		C 42	       10   7
+   (p8)	cmp.leu		p6, p7 = r16, r26	C 43	       10   7
+   (p9)	cmp.ltu		p6, p7 = r16, r26	C 44	       10   7
+	getf.sig	r25 = f37		C 45	13     11   7
+	st8		[r20] = r16, 8		C 46	       11   7
+	br.cloop.dptk	.Loop			C 47	       11   7
+C *** MAIN LOOP END ***
+	;;
+.Le0:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r29 = f41		C
+	xma.l		f36 = f32, f6, f44	C
+   (p6)	add		r14 = r30, r27, 1	C
+	ldf8		f47 = [rp], 8		C
+	xma.hu		f40 = f32, f6, f44	C
+   (p7)	add		r14 = r30, r27		C
+	;;
+	.pred.rel "mutex", p6, p7
+   (p6)	cmp.leu		p8, p9 = r14, r27	C
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C
+	getf.sig	r26 = f38		C
+	st8		[r20] = r14, 8		C
+	;;
+	.pred.rel "mutex", p8, p9
+	getf.sig	r30 = f42		C
+	xma.l		f37 = f33, f6, f45	C
+   (p8)	add		r16 = r31, r24, 1	C
+	xma.hu		f41 = f33, f6, f45	C
+   (p9)	add		r16 = r31, r24		C
+	;;
+	.pred.rel "mutex", p8, p9
+   (p8)	cmp.leu		p6, p7 = r16, r24	C
+   (p9)	cmp.ltu		p6, p7 = r16, r24	C
+	getf.sig	r27 = f39		C
+	st8		[r20] = r16, 8		C
+	;;
+.Lcj8:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r31 = f43		C
+	xma.l		f38 = f34, f6, f46	C
+   (p6)	add		r14 = r28, r25, 1	C
+	xma.hu		f42 = f34, f6, f46	C
+   (p7)	add		r14 = r28, r25		C
+	;;
+	.pred.rel "mutex", p6, p7
+   (p6)	cmp.leu		p8, p9 = r14, r25	C
+   (p7)	cmp.ltu		p8, p9 = r14, r25	C
+	getf.sig	r24 = f36		C
+	st8		[r20] = r14, 8		C
+	;;
+.Lcj7:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r28 = f40		C
+	xma.l		f39 = f35, f6, f47	C
+   (p8)	add		r16 = r29, r26, 1	C
+	xma.hu		f43 = f35, f6, f47	C
+   (p9)	add		r16 = r29, r26		C
+	;;
+	.pred.rel "mutex", p8, p9
+   (p8)	cmp.leu		p6, p7 = r16, r26	C
+   (p9)	cmp.ltu		p6, p7 = r16, r26	C
+	getf.sig	r25 = f37		C
+	st8		[r20] = r16, 8		C
+	;;
+.Lcj6:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r29 = f41		C
+   (p6)	add		r14 = r30, r27, 1	C
+   (p7)	add		r14 = r30, r27		C
+	;;
+	.pred.rel "mutex", p6, p7
+   (p6)	cmp.leu		p8, p9 = r14, r27	C
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C
+	getf.sig	r26 = f38		C
+	st8		[r20] = r14, 8		C
+	;;
+.Lcj5:
+	.pred.rel "mutex", p8, p9
+	getf.sig	r30 = f42		C
+   (p8)	add		r16 = r31, r24, 1	C
+   (p9)	add		r16 = r31, r24		C
+	;;
+	.pred.rel "mutex", p8, p9
+   (p8)	cmp.leu		p6, p7 = r16, r24	C
+   (p9)	cmp.ltu		p6, p7 = r16, r24	C
+	getf.sig	r27 = f39		C
+	st8		[r20] = r16, 8		C
+	;;
+.Lcj4:
+	.pred.rel "mutex", p6, p7
+	getf.sig	r8 = f43		C
+   (p6)	add		r14 = r28, r25, 1	C
+   (p7)	add		r14 = r28, r25		C
+	;;
+	.pred.rel "mutex", p6, p7
+	st8		[r20] = r14, 8		C
+   (p6)	cmp.leu		p8, p9 = r14, r25	C
+   (p7)	cmp.ltu		p8, p9 = r14, r25	C
+	;;
+.Lcj3:
+	.pred.rel "mutex", p8, p9
+   (p8)	add		r16 = r29, r26, 1	C
+   (p9)	add		r16 = r29, r26		C
+	;;
+	.pred.rel "mutex", p8, p9
+	st8		[r20] = r16, 8		C
+   (p8)	cmp.leu		p6, p7 = r16, r26	C
+   (p9)	cmp.ltu		p6, p7 = r16, r26	C
+	;;
+.Lcj2:
+	.pred.rel "mutex", p6, p7
+   (p6)	add		r14 = r30, r27, 1	C
+   (p7)	add		r14 = r30, r27		C
+	;;
+	.pred.rel "mutex", p6, p7
+	st8		[r20] = r14		C
+   (p6)	cmp.leu		p8, p9 = r14, r27	C
+   (p7)	cmp.ltu		p8, p9 = r14, r27	C
+	;;
+   (p8)	add		r8 = 1, r8		C M I
+	mov.i		ar.lc = r2		C I0
+	br.ret.sptk.many b0			C B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/addmul_2.asm b/third_party/gmp/mpn/ia64/addmul_2.asm
new file mode 100644
index 0000000..86e8de4
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/addmul_2.asm
@@ -0,0 +1,715 @@
+dnl  IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
+dnl  add the result to an (n+1)-limb number.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2004, 2005, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    3.65
+C Itanium 2:  1.625
+
+C TODO
+C  * Clean up variable names, and try to decrease the number of distinct
+C    registers used.
+C  * Clean up feed-in code to not require zeroing several registers.
+C  * Make sure we don't depend on uninitialised predicate registers.
+C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
+C    wind-down code.
+C  * Ultimately rewrite.  The problem with this code is that it first uses a
+C    loaded u value in one xma pair, then leaves it live over several unrelated
+C    xma pairs, before it uses it again.  It should actually be quite possible
+C    to just swap some aligned xma pairs around.  But we should then schedule
+C    u loads further from the first use.
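+C
+C For reference (editorial note), the operation is believed to be equivalent
+C to this C sketch built on mpn_addmul_1, with rp[] holding n+1 limbs on
+C entry and the most significant limb of the result returned:
+C
+C	mp_limb_t c0, c1;
+C	c0 = mpn_addmul_1 (rp, up, n, vp[0]);	C rp[0..n-1] += up[] * v0
+C	c1 = mpn_addmul_1 (rp + 1, up, n, vp[1]); C rp[1..n] += up[] * v1
+C	rp[n] += c0;				C fold in the low carry
+C	return c1 + (rp[n] < c0);		C most significant limb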
+
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`vp',`r35')
+
+define(`srp',`r3')
+
+define(`v0',`f6')
+define(`v1',`f7')
+
+define(`s0',`r14')
+define(`acc0',`r15')
+
+define(`pr0_0',`r16') define(`pr0_1',`r17')
+define(`pr0_2',`r18') define(`pr0_3',`r19')
+
+define(`pr1_0',`r20') define(`pr1_1',`r21')
+define(`pr1_2',`r22') define(`pr1_3',`r23')
+
+define(`acc1_0',`r24') define(`acc1_1',`r25')
+define(`acc1_2',`r26') define(`acc1_3',`r27')
+
+dnl define(`',`r28')
+dnl define(`',`r29')
+dnl define(`',`r30')
+dnl define(`',`r31')
+
+define(`fp0b_0',`f8') define(`fp0b_1',`f9')
+define(`fp0b_2',`f10') define(`fp0b_3',`f11')
+
+define(`fp1a_0',`f12') define(`fp1a_1',`f13')
+define(`fp1a_2',`f14') define(`fp1a_3',`f15')
+
+define(`fp1b_0',`f32') define(`fp1b_1',`f33')
+define(`fp1b_2',`f34') define(`fp1b_3',`f35')
+
+define(`fp2a_0',`f36') define(`fp2a_1',`f37')
+define(`fp2a_2',`f38') define(`fp2a_3',`f39')
+
+define(`r_0',`f40') define(`r_1',`f41')
+define(`r_2',`f42') define(`r_3',`f43')
+
+define(`u_0',`f44') define(`u_1',`f45')
+define(`u_2',`f46') define(`u_3',`f47')
+
+define(`rx',`f48')
+define(`ux',`f49')
+define(`ry',`f50')
+define(`uy',`f51')
+
+ASM_START()
+PROLOGUE(mpn_addmul_2s)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi;		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		addp4	vp = 0, vp		C			M I
+}{.mmi;		nop	1
+		nop	1
+		zxt4	n = n			C			I
+	;;
+}')
+
+ {.mmi;		ldf8	ux = [up], 8		C			M
+		ldf8	v0 = [vp], 8		C			M
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		ldf8	rx = [rp], 8		C			M
+		and	r14 = 3, n		C			M I
+		add	n = -2, n		C			M I
+	;;
+}{.mmi;		ldf8	uy = [up], 8		C			M
+		ldf8	v1 = [vp]		C			M
+		shr.u	n = n, 2		C			I0
+}{.mmi;		ldf8	ry = [rp], -8		C			M
+		cmp.eq	p14, p0 = 1, r14	C			M I
+		cmp.eq	p11, p0 = 2, r14	C			M I
+	;;
+}{.mmi;		add	srp = 16, rp		C			M I
+		cmp.eq	p15, p0 = 3, r14	C			M I
+		mov	ar.lc = n		C			I0
+}{.bbb;	(p14)	br.dptk	L(x01)			C			B
+	(p11)	br.dptk	L(x10)			C			B
+	(p15)	br.dptk	L(x11)			C			B
+	;;
+}
+L(x00):		cmp.ne	p6, p0 = r0, r0		C suppress initial xma pair
+		mov	fp2a_3 = f0
+		br	L(b00)
+L(x01):		cmp.ne	p14, p0 = r0, r0	C suppress initial xma pair
+		mov	fp2a_2 = f0
+		br	L(b01)
+L(x10):		cmp.ne	p11, p0 = r0, r0	C suppress initial xma pair
+		mov	fp2a_1 = f0
+		br	L(b10)
+L(x11):		cmp.ne	p15, p0 = r0, r0	C suppress initial xma pair
+		mov	fp2a_0 = f0
+		br	L(b11)
+
+EPILOGUE()
+
+PROLOGUE(mpn_addmul_2)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi;		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		addp4	vp = 0, vp		C			M I
+}{.mmi;		nop	1
+		nop	1
+		zxt4	n = n			C			I
+	;;
+}')
+
+ {.mmi;		ldf8	ux = [up], 8		C			M
+		ldf8	v0 = [vp], 8		C			M
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		ldf8	rx = [rp], 8		C			M
+		and	r14 = 3, n		C			M I
+		add	n = -2, n		C			M I
+	;;
+}{.mmi;		ldf8	uy = [up], 8		C			M
+		ldf8	v1 = [vp]		C			M
+		shr.u	n = n, 2		C			I0
+}{.mmi;		ldf8	ry = [rp], -8		C			M
+		cmp.eq	p14, p0 = 1, r14	C			M I
+		cmp.eq	p11, p0 = 2, r14	C			M I
+	;;
+}{.mmi;		add	srp = 16, rp		C			M I
+		cmp.eq	p15, p6 = 3, r14	C			M I
+		mov	ar.lc = n		C			I0
+}{.bbb;	(p14)	br.dptk	L(b01)			C			B
+	(p11)	br.dptk	L(b10)			C			B
+	(p15)	br.dptk	L(b11)			C			B
+	;;
+}
+	ALIGN(32)
+L(b00):
+ {.mmi;		ldf8	r_1 = [srp], 8
+		ldf8	u_1 = [up], 8
+		mov	acc1_2 = 0
+}{.mmi;		mov	pr1_2 = 0
+		mov	pr0_3 = 0
+		cmp.ne	p8, p9 = r0, r0
+	;;
+}{.mfi;		ldf8	r_2 = [srp], 8
+		xma.l	fp0b_3 = ux, v0, rx
+		cmp.ne	p12, p13 = r0, r0
+}{.mfb;		ldf8	u_2 = [up], 8
+		xma.hu	fp1b_3 = ux, v0, rx
+		br.cloop.dptk	L(gt4)
+}
+		xma.l	fp0b_0 = uy, v0, ry
+		xma.hu	fp1a_0 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_3
+	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_1 = u_1, v0, r_1
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		getfsig	pr1_3 = fp1b_3
+		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, r_2
+		xma.hu	fp1a_2 = u_2, v0, r_2
+		br	L(cj4)
+
+L(gt4):		xma.l	fp0b_0 = uy, v0, ry
+		xma.hu	fp1a_0 = uy, v0, ry
+	;;
+		ldf8	r_3 = [srp], 8
+		getfsig	acc0 = fp0b_3
+	(p6)	xma.hu	fp2a_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+		ldf8	u_3 = [up], 8
+	(p6)	xma.l	fp1b_3 = ux, v1, fp1b_3		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_1 = u_1, v0, r_1
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;
+		ldf8	r_0 = [srp], 8
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+		xma.l	fp0b_2 = u_2, v0, r_2
+	;;
+		getfsig	acc1_3 = fp2a_3
+		xma.hu	fp1a_2 = u_2, v0, r_2
+		br	L(00)
+
+
+	ALIGN(32)
+L(b01):
+ {.mmi;		ldf8	r_0 = [srp], 8		C M
+		ldf8	u_0 = [up], 8		C M
+		mov	acc1_1 = 0		C M I
+}{.mmi;		mov	pr1_1 = 0		C M I
+		mov	pr0_2 = 0		C M I
+		cmp.ne	p6, p7 = r0, r0		C M I
+	;;
+}{.mfi;		ldf8	r_1 = [srp], 8		C M
+		xma.l	fp0b_2 = ux, v0, rx	C F
+		cmp.ne	p10, p11 = r0, r0	C M I
+}{.mfi;		ldf8	u_1 = [up], 8		C M
+		xma.hu	fp1b_2 = ux, v0, rx	C F
+		nop	1
+	;;
+}		xma.l	fp0b_3 = uy, v0, ry	C F
+		xma.hu	fp1a_3 = uy, v0, ry	C F
+	;;
+ {.mmf;		getfsig	acc0 = fp0b_2		C M
+		ldf8	r_2 = [srp], 8		C M
+	(p14)	xma.hu	fp2a_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
+}{.mfb;		ldf8	u_2 = [up], 8		C M
+	(p14)	xma.l	fp1b_2 = ux, v1,fp1b_2	C F	suppressed for addmul_2s
+		br.cloop.dptk	L(gt5)
+}
+		xma.l	fp0b_0 = u_0, v0, r_0	C F
+		xma.hu	fp1a_0 = u_0, v0, r_0	C F
+	;;
+		getfsig	pr0_3 = fp0b_3		C M
+		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
+		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
+	;;
+		getfsig	pr1_2 = fp1b_2		C M
+		getfsig	acc1_2 = fp2a_2		C M
+		xma.l	fp0b_1 = u_1, v0, r_1	C F
+		xma.hu	fp1a_1 = u_1, v0, r_1	C F
+		br	L(cj5)
+
+L(gt5):		xma.l	fp0b_0 = u_0, v0, r_0
+		xma.hu	fp1a_0 = u_0, v0, r_0
+	;;
+		getfsig	pr0_3 = fp0b_3
+		ldf8	r_3 = [srp], 8
+		xma.l	fp1b_3 = uy, v1, fp1a_3
+		xma.hu	fp2a_3 = uy, v1, fp1a_3
+	;;
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+		xma.l	fp0b_1 = u_1, v0, r_1
+	;;
+		getfsig	acc1_2 = fp2a_2
+		xma.hu	fp1a_1 = u_1, v0, r_1
+		br	L(01)
+
+
+	ALIGN(32)
+L(b10):		br.cloop.dptk	L(gt2)
+		xma.l	fp0b_1 = ux, v0, rx
+		xma.hu	fp1b_1 = ux, v0, rx
+	;;
+		xma.l	fp0b_2 = uy, v0, ry
+		xma.hu	fp1a_2 = uy, v0, ry
+	;;
+		stf8	[rp] = fp0b_1, 8
+	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	;;
+		getfsig	acc0 = fp0b_2
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		getfsig	pr1_1 = fp1b_1
+		getfsig	acc1_1 = fp2a_1
+		mov	ar.lc = r2
+		getfsig	pr1_2 = fp1b_2
+		getfsig	r8 = fp2a_2
+	;;
+		add	s0 = pr1_1, acc0
+	;;
+		st8	[rp] = s0, 8
+		cmp.ltu	p8, p9 = s0, pr1_1
+		sub	r31 = -1, acc1_1
+	;;
+	.pred.rel "mutex", p8, p9
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	(p8)	cmp.leu	p10, p0 = r31, pr1_2
+	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
+	;;
+		st8	[rp] = acc0, 8
+	(p10)	add	r8 = 1, r8
+		br.ret.sptk.many b0
+
+
+L(gt2):
+ {.mmi;		ldf8	r_3 = [srp], 8
+		ldf8	u_3 = [up], 8
+		mov	acc1_0 = 0
+	;;
+}{.mfi;		ldf8	r_0 = [srp], 8
+		xma.l	fp0b_1 = ux, v0, rx
+		mov	pr1_0 = 0
+}{.mfi;		ldf8	u_0 = [up], 8
+		xma.hu	fp1b_1 = ux, v0, rx
+		mov	pr0_1 = 0
+	;;
+}		xma.l	fp0b_2 = uy, v0, ry
+		xma.hu	fp1a_2 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_1
+		ldf8	r_1 = [srp], 8
+	(p11)	xma.hu	fp2a_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	(p11)	xma.l	fp1b_1 = ux, v1, fp1b_1		C suppressed for addmul_2s
+	;;
+		ldf8	u_1 = [up], 8
+		xma.l	fp0b_3 = u_3, v0, r_3
+		xma.hu	fp1a_3 = u_3, v0, r_3
+	;;
+		getfsig	pr0_2 = fp0b_2
+		ldf8	r_2 = [srp], 8
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	;;
+ {.mfi;		getfsig	acc1_1 = fp2a_1
+		xma.l	fp0b_0 = u_0, v0, r_0
+		cmp.ne	p8, p9 = r0, r0
+}{.mfb;		cmp.ne	p12, p13 = r0, r0
+		xma.hu	fp1a_0 = u_0, v0, r_0
+		br.cloop.sptk.clr	L(top)
+}
+		br.many	L(end)
+
+
+	ALIGN(32)
+L(b11):		ldf8	r_2 = [srp], 8
+		mov	pr1_3 = 0
+		mov	pr0_0 = 0
+	;;
+		ldf8	u_2 = [up], 8
+		mov	acc1_3 = 0
+		br.cloop.dptk	L(gt3)
+	;;
+		cmp.ne	p6, p7 = r0, r0
+		xma.l	fp0b_0 = ux, v0, rx
+		xma.hu	fp1b_0 = ux, v0, rx
+	;;
+		cmp.ne	p10, p11 = r0, r0
+		xma.l	fp0b_1 = uy, v0, ry
+		xma.hu	fp1a_1 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_0
+	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_2 = uy, v1, r_2
+		xma.hu	fp1a_2 = uy, v1, r_2
+	;;
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_2, v0, fp1a_1
+		xma.hu	fp2a_1 = u_2, v0, fp1a_1
+	;;
+		getfsig	pr1_0 = fp1b_0
+		getfsig	acc1_0 = fp2a_0
+		br	L(cj3)
+
+L(gt3):		ldf8	r_3 = [srp], 8
+		xma.l	fp0b_0 = ux, v0, rx
+		cmp.ne	p10, p11 = r0, r0
+		ldf8	u_3 = [up], 8
+		xma.hu	fp1b_0 = ux, v0, rx
+		cmp.ne	p6, p7 = r0, r0
+	;;
+		xma.l	fp0b_1 = uy, v0, ry
+		xma.hu	fp1a_1 = uy, v0, ry
+	;;
+		getfsig	acc0 = fp0b_0
+		ldf8	r_0 = [srp], 8
+	(p15)	xma.hu	fp2a_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+		ldf8	u_0 = [up], 8
+	(p15)	xma.l	fp1b_0 = ux, v1, fp1b_0		C suppressed for addmul_2s
+	;;
+		xma.l	fp0b_2 = u_2, v0, r_2
+		xma.hu	fp1a_2 = u_2, v0, r_2
+	;;
+		getfsig	pr0_1 = fp0b_1
+		ldf8	r_1 = [srp], 8
+		xma.l	fp1b_1 = uy, v1, fp1a_1
+		xma.hu	fp2a_1 = uy, v1, fp1a_1
+	;;
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	;;
+		getfsig	acc1_0 = fp2a_0
+		xma.l	fp0b_3 = u_3, v0, r_3
+		xma.hu	fp1a_3 = u_3, v0, r_3
+		br	L(11)
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):						C 00
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_3 = fp0b_3
+		ldf8	r_3 = [srp], 8
+		xma.l	fp1b_3 = u_3, v1, fp1a_3
+	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		xma.hu	fp2a_3 = u_3, v1, fp1a_3
+	;;					C 01
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;					C 02
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_2 = fp2a_2
+		st8	[rp] = s0, 8
+		xma.l	fp0b_1 = u_1, v0, r_1
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;					C 03
+L(01):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_0 = fp0b_0
+		ldf8	r_0 = [srp], 8
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+	;;					C 04
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;					C 05
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_3 = fp2a_3
+		st8	[rp] = s0, 8
+		xma.l	fp0b_2 = u_2, v0, r_2
+	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, r_2
+	;;					C 06
+L(00):
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_1 = fp0b_1
+		ldf8	r_1 = [srp], 8
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+	;;					C 07
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;					C 08
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_0 = fp2a_0
+		st8	[rp] = s0, 8
+		xma.l	fp0b_3 = u_3, v0, r_3
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+		xma.hu	fp1a_3 = u_3, v0, r_3
+	;;					C 09
+L(11):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_2 = fp0b_2
+		ldf8	r_2 = [srp], 8
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+	;;					C 10
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;					C 11
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_1 = fp2a_1
+		st8	[rp] = s0, 8
+		xma.l	fp0b_0 = u_0, v0, r_0
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+		xma.hu	fp1a_0 = u_0, v0, r_0
+L(10):		br.cloop.sptk.clr	L(top)	C 12
+	;;
+C *** MAIN LOOP END ***
+L(end):
+	.pred.rel "mutex", p12, p13
+ {.mfi;		getfsig	pr0_3 = fp0b_3
+		xma.l	fp1b_3 = u_3, v1, fp1a_3
+	(p12)	add	s0 = pr1_0, acc0, 1
+}{.mfi;	(p13)	add	s0 = pr1_0, acc0
+		xma.hu	fp2a_3 = u_3, v1, fp1a_3
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mfi;		getfsig	acc1_2 = fp2a_2
+		xma.l	fp0b_1 = u_1, v0, r_1
+		nop	1
+}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, r_1
+	;;
+}
+L(cj5):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_3 = fp1b_3
+	st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mfi;		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, r_2
+		nop	1
+}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, r_2
+	;;
+}
+L(cj4):
+	.pred.rel "mutex", p12, p13
+ {.mfi;		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+}{.mfi;	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_0 = fp1b_0
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	acc1_0 = fp2a_0
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+	;;
+}
+L(cj3):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_1 = fp1b_1
+		st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		getfsig	acc1_1 = fp2a_1
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+	;;
+}	.pred.rel "mutex", p12, p13
+ {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	r8 = fp2a_2
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		st8	[rp] = s0, 8
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
+	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
+	(p12)	add	acc0 = 1, acc0
+	;;
+}{.mmi;		st8	[rp] = acc0, 8
+	(p12)	cmpeqor	p10, p0 = 0, acc0
+		nop	1
+	;;
+}{.mib;	(p10)	add	r8 = 1, r8
+		mov	ar.lc = r2
+		br.ret.sptk.many b0
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/aors_n.asm b/third_party/gmp/mpn/ia64/aors_n.asm
new file mode 100644
index 0000000..7705ce6
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aors_n.asm
@@ -0,0 +1,852 @@
+dnl  IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2.67
+C Itanium 2:    1.25
+
+C TODO
+C  * Consider using special code for small n, using something like
+C    "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
+C  * The non-nc code was trimmed cycle for cycle to its current state.  It is
+C    probably hard to save more than an odd cycle there.  The nc code is much
+C    cruder (since tune/speed doesn't have any applicable direct measurements).
+C  * Without the nc entry points, this becomes around 1800 bytes of object
+C    code; the nc code adds over 1000 bytes.  We should perhaps sacrifice a
+C    few cycles for the non-nc code and let it fall into the nc code.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n',  `r35')
+define(`cy', `r36')
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUB,	add)
+  define(CND,		ltu)
+  define(INCR,		1)
+  define(LIM,		-1)
+  define(LIM2,		0)
+  define(func,    mpn_add_n)
+  define(func_nc, mpn_add_nc)
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUB,	sub)
+  define(CND,		gtu)
+  define(INCR,		-1)
+  define(LIM,		0)
+  define(LIM2,		-1)
+  define(func,    mpn_sub_n)
+  define(func_nc, mpn_sub_nc)
+')
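+
+C  Rough C-level sketch of the carry scheme used below (illustrative only,
+C  not part of the build): each limb pair is combined without carry first,
+C  then the incoming carry folds in via the cmpeqor/INCR trick:
+C
+C    w = u + v;                /* cmp.CND gives  cy = w CND u */
+C    if (cy_in) { cy |= (w == LIM); w += INCR; }
+C
+C  For add: CND = ltu, LIM = -1, INCR = 1; for sub the same shape holds
+C  with w = u - v, CND = gtu, LIM = 0, INCR = -1.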
+
+define(PFDIST, 500)
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
+define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`rpx',`r3')
+define(`upadv',`r20') define(`vpadv',`r21')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		nop.i	0
+		addp4	vp = 0, vp		C			M I
+		nop.m	0
+		zxt4	n = n			C			I
+	;;
+')
+
+ {.mmi;		ld8	r11 = [vp], 8		C			M01
+		ld8	r10 = [up], 8		C			M01
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		and	r14 = 7, n		C			M I
+		cmp.lt	p15, p14 = 8, n		C			M I
+		add	n = -6, n		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C Merging these lines into the feed-in
+		add	vpadv = PFDIST, vp	C code could save a cycle per call at
+		mov	r23 = cy		C the expense of code size.
+	;;
+}{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
+		cmp.eq	p7, p0 = 2, r14		C			M I
+		cmp.eq	p8, p0 = 3, r14		C			M I
+}{.bbb;	(p6)	br.dptk	.Lc001			C			B
+	(p7)	br.dptk	.Lc010			C			B
+	(p8)	br.dptk	.Lc011			C			B
+	;;
+}{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
+		cmp.eq	p10, p0 = 5, r14	C			M I
+		cmp.eq	p11, p0 = 6, r14	C			M I
+}{.bbb;	(p9)	br.dptk	.Lc100			C			B
+	(p10)	br.dptk	.Lc101			C			B
+	(p11)	br.dptk	.Lc110			C			B
+	;;
+}{.mmi;		ld8	r19 = [vp], 8		C			M01
+		ld8	r18 = [up], 8		C			M01
+		cmp.ne	p13, p0 = 0, cy		C copy cy to p13	M I
+}{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
+		nop	0
+	(p12)	br.dptk	.Lc111			C			B
+	;;
+}
+
+.Lc000:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	vpadv = PFDIST, vp	C			M I
+		ld8	v0 = [vp], 8		C			M01
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = r10, r11		C			M I
+		nop	0
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C			M I
+		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, r10	C			M I
+}{.mmi;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r18, r19		C			M I
+		add	rpx = 8, rp		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, r18	C			M I
+	(p13)	cmpeqor	p7, p0 = LIM, w1	C			M I
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+	(p13)	add	w1 = INCR, w1		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m0)
+}
+
+.Lc001:
+ {.mmi;	(p15)	ld8	v1 = [vp], 8		C			M01
+	(p15)	ld8	u1 = [up], 8		C			M01
+		ADDSUB	w0 = r10, r11		C			M I
+}{.mmb;		nop	0
+		nop	0
+	(p15)	br	L(0)
+	;;
+}{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I
+		mov	r8 = 0
+		cmp.CND	p6, p0 = w0, r10	C			M I
+	;;
+}{.mmb;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+	(p9)	add	w0 = INCR, w0		C			M I
+		br	L(cj1)			C			B
+}
+L(0):
+ {.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+}{.mmi;		nop	0
+		cmp.ne	p9, p0 = 0, r23		C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+		add	rpx = 16, rp		C			M I
+}{.mmb;		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+		br	L(c1)			C			B
+}
+
+.Lc010:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		mov	r8 = 0			C			M I
+}{.mmb;		ADDSUB	w3 = r10, r11		C			M I
+		cmp.ne	p8, p0 = 0, r23		C			M I
+	(p15)	br	L(1)			C			B
+	;;
+}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+	;;
+}{.mmb;		cmp.CND	p6, p0 = w0, u0		C			M I
+	(p8)	cmpeqor	p9, p0 = LIM2, w3	C			M I
+		br	L(cj2)			C			B
+}
+L(1):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		cmp.CND	p9, p0 = w3, r10	C			M I
+	;;
+}{.mmi;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmb;		add	rpx = 24, rp		C			M I
+		nop	0
+		br	L(m23)			C			B
+}
+
+.Lc011:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+}{.mmi;		ADDSUB	w2 = r10, r11		C			M I
+		cmp.ne	p7, p0 = 0, r23		C			M I
+		nop	0
+	;;
+}{.mmb;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+	(p15)	br	L(2)			C			B
+}{.mmi;		cmp.CND	p8, p0 = w2, r10	C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+		nop	0
+	;;
+}{.mmb;	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(cj3)			C			B
+}
+L(2):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u3 = [up], 8		C			M01
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+	(p7)	add	w2 = INCR, w2		C			M I
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m23)
+}
+
+.Lc100:
+ {.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+}{.mmi;		ADDSUB	w1 = r10, r11		C			M I
+		nop	0
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		add	rpx = 8, rp		C			M I
+}{.mmi;		cmp.ne	p6, p0 = 0, r23		C			M I
+		cmp.CND	p7, p0 = w1, r10	C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w2 = u2, v2		C			M I
+}{.mmb;	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+	(p6)	add	w1 = INCR, w1		C			M I
+	(p14)	br	L(cj4)
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+		nop	0
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		nop	0
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m4)
+}
+
+.Lc101:
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		ADDSUB	w0 = r10, r11		C			M I
+}{.mmi;		cmp.ne	p9, p0 = 0, r23		C			M I
+		add	rpx = 16, rp		C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+}{.mbb;		ADDSUB	w1 = u1, v1		C			M I
+	(p15)	br	L(c5)			C			B
+		br	L(end)			C			B
+}
+
+.Lc110:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C			M I
+		add	vpadv = PFDIST, vp	C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w3 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
+		cmp.ne	p8, p0 = 0, r23		C			M I
+		add	rpx = 24, rp		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		nop	0
+}{.mmb;	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+		br	L(m67)			C			B
+}
+
+.Lc111:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	upadv = PFDIST, up	C			M I
+		ld8	v1 = [vp], 8		C			M01
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r10, r11		C			M I
+		nop	0
+	;;
+}{.mmi;		add	vpadv = PFDIST, vp	C			M I
+		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = r18, r19		C			M I
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, r18	C			M I
+	(p13)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmi;		ld8	u3 = [up], 8		C			M01
+	(p13)	add	w2 = INCR, w2		C			M I
+		nop	0
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m67)
+}
+EPILOGUE()
+
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		nop.i	0
+		addp4	vp = 0, vp		C			M I
+		nop.m	0
+		zxt4	n = n			C			I
+	;;
+')
+
+ {.mmi;		ld8	r11 = [vp], 8		C			M01
+		ld8	r10 = [up], 8		C			M01
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		and	r14 = 7, n		C			M I
+		cmp.lt	p15, p14 = 8, n		C			M I
+		add	n = -6, n		C			M I
+	;;
+}{.mmi;		cmp.eq	p6, p0 = 1, r14		C			M I
+		cmp.eq	p7, p0 = 2, r14		C			M I
+		cmp.eq	p8, p0 = 3, r14		C			M I
+}{.bbb;	(p6)	br.dptk	.Lb001			C			B
+	(p7)	br.dptk	.Lb010			C			B
+	(p8)	br.dptk	.Lb011			C			B
+	;;
+}{.mmi;		cmp.eq	p9, p0 = 4, r14		C			M I
+		cmp.eq	p10, p0 = 5, r14	C			M I
+		cmp.eq	p11, p0 = 6, r14	C			M I
+}{.bbb;	(p9)	br.dptk	.Lb100			C			B
+	(p10)	br.dptk	.Lb101			C			B
+	(p11)	br.dptk	.Lb110			C			B
+	;;
+}{.mmi;		ld8	r19 = [vp], 8		C			M01
+		ld8	r18 = [up], 8		C			M01
+		cmp.ne	p13, p0 = r0, r0	C clear "CF"		M I
+}{.mmb;		cmp.eq	p12, p0 = 7, r14	C			M I
+		mov	r23 = 0			C			M I
+	(p12)	br.dptk	.Lb111			C			B
+	;;
+}
+
+.Lb000:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, r10	C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r18, r19		C			M I
+		add	rpx = 8, rp		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		cmp.CND	p8, p0 = w2, r18	C			M I
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m0)			C			B
+}
+
+	ALIGN(32)
+.Lb001:
+ {.mmi;		ADDSUB	w0 = r10, r11		C			M I
+	(p15)	ld8	v1 = [vp], 8		C			M01
+		mov	r8 = 0			C			M I
+	;;
+}{.mmb;		cmp.CND	p6, p0 = w0, r10	C			M I
+	(p15)	ld8	u1 = [up], 8		C			M01
+	(p14)	br	L(cj1)			C			B
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		shr.u	n = n, 3		C			I0
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		mov	ar.lc = n		C			I0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+}{.mmb;		ld8	u1 = [up], 8		C			M01
+		add	rpx = 16, rp		C			M I
+		br	L(m1)			C			B
+}
+
+	ALIGN(32)
+.Lb010:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+}{.mmb;		ADDSUB	w3 = r10, r11		C			M I
+		nop	0
+	(p15)	br	L(gt2)			C			B
+	;;
+}{.mmi;		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		mov	r8 = 0			C			M I
+	;;
+}{.mmb;		nop	0
+		cmp.CND	p6, p0 = w0, u0		C			M I
+		br	L(cj2)			C			B
+}
+L(gt2):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+		add	rpx = 24, rp		C			M I
+		br	L(m23)			C			B
+}
+
+	ALIGN(32)
+.Lb011:
+ {.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		ADDSUB	w2 = r10, r11		C			M I
+	;;
+}{.mmb;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+	(p15)	br	L(3)			C			B
+}{.mmb;		cmp.CND	p8, p0 = w2, r10	C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+		br	L(cj3)			C			B
+}
+L(3):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		ADDSUB	w3 = u3, v3		C			M I
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u3 = [up], 8		C			M01
+		nop	0
+		nop	0
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m23)			C			B
+}
+
+	ALIGN(32)
+.Lb100:
+ {.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		ADDSUB	w1 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		cmp.CND	p7, p0 = w1, r10	C			M I
+}{.mmb;		nop	0
+		ADDSUB	w2 = u2, v2		C			M I
+	(p14)	br	L(cj4)			C			B
+	;;
+}
+L(gt4):
+ {.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+		nop	0
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = u3, v3		C			M I
+		add	rpx = 8, rp		C			M I
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+		br	L(m4)			C			B
+}
+
+	ALIGN(32)
+.Lb101:
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w0 = r10, r11		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		add	rpx = 16, rp		C			M I
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, r10	C			M I
+		nop	0
+}{.mmb;		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+	(p14)	br	L(cj5)			C			B
+	;;
+}
+L(gt5):
+ {.mmi;		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+		mov	ar.lc = n		C			I0
+}{.mmb;		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = u2, v2		C			M I
+		br	L(m5)			C			B
+}
+
+	ALIGN(32)
+.Lb110:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w3 = r10, r11		C			M I
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		ld8	u2 = [up], 8		C			M01
+		nop	0
+	;;
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, r10	C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+}{.mmb;		ld8	u3 = [up], 8		C			M01
+		add	rpx = 24, rp		C			M I
+		br	L(m67)			C			B
+}
+
+	ALIGN(32)
+.Lb111:
+ {.mmi;		ld8	v0 = [vp], 8		C			M01
+		ld8	u0 = [up], 8		C			M01
+		shr.u	n = n, 3		C			I0
+	;;
+}{.mmi;		ld8	v1 = [vp], 8		C			M01
+		ld8	u1 = [up], 8		C			M01
+		ADDSUB	w2 = r10, r11		C			M I
+	;;
+}{.mmi;		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, r10	C			M I
+		mov	ar.lc = n		C			I0
+}{.mmi;		ld8	u2 = [up], 8		C			M01
+		ADDSUB	w3 = r18, r19		C			M I
+		nop	0
+	;;
+}{.mmi;		add	upadv = PFDIST, up
+		add	vpadv = PFDIST, vp
+		nop	0
+}{.mmi;		ld8	v3 = [vp], 8		C			M01
+		ld8	u3 = [up], 8		C			M01
+		cmp.CND	p9, p0 = w3, r18	C			M I
+	;;
+}{.mmi;		add	rpx = 32, rp		C			M I
+		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+}{.mmb;	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		br	L(m67)			C			B
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):
+L(c5):		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+		ld8	u1 = [up], 8		C			M01
+	(p9)	add	w0 = INCR, w0		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+	;;
+L(m5):		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+		ld8	u2 = [up], 8		C			M01
+	(p6)	add	w1 = INCR, w1		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+		st8	[rp] = w0, 8		C			M23
+		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+	;;
+L(m4):		st8	[rp] = w1, 16		C			M23
+		st8	[rpx] = w2, 32		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+		lfetch	[upadv], 64
+	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+	;;
+L(m23):		st8	[rp] = w3, 8		C			M23
+		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, u0		C			M I
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+		nop.b	0
+	;;
+L(c1):		ld8	v1 = [vp], 8		C			M01
+		cmp.CND	p7, p0 = w1, u1		C			M I
+	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+		ld8	u1 = [up], 8		C			M01
+	(p9)	add	w0 = INCR, w0		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+	;;
+L(m1):		ld8	v2 = [vp], 8		C			M01
+		cmp.CND	p8, p0 = w2, u2		C			M I
+	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+		ld8	u2 = [up], 8		C			M01
+	(p6)	add	w1 = INCR, w1		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+	;;
+		st8	[rp] = w0, 8		C			M23
+		ld8	v3 = [vp], 8		C			M01
+		cmp.CND	p9, p0 = w3, u3		C			M I
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+		ld8	u3 = [up], 8		C			M01
+	(p7)	add	w2 = INCR, w2		C			M I
+	;;
+L(m0):		st8	[rp] = w1, 16		C			M23
+		st8	[rpx] = w2, 32		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+		lfetch	[vpadv], 64
+	(p8)	add	w3 = INCR, w3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+	;;
+L(m67):		st8	[rp] = w3, 8		C			M23
+		ld8	v0 = [vp], 8		C			M01
+		cmp.CND	p6, p0 = w0, u0		C			M I
+		ld8	u0 = [up], 8		C			M01
+		ADDSUB	w1 = u1, v1		C			M I
+		br.cloop.dptk	L(top)		C			B
+	;;
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi;	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+	(p9)	add	w0 = INCR, w0		C			M I
+		mov	ar.lc = r2		C			I0
+}
+L(cj5):
+ {.mmi;		cmp.CND	p7, p0 = w1, u1		C			M I
+		ADDSUB	w2 = u2, v2		C			M I
+		nop	0
+	;;
+}{.mmi;		st8	[rp] = w0, 8		C			M23
+	(p6)	cmpeqor	p7, p0 = LIM, w1	C			M I
+	(p6)	add	w1 = INCR, w1		C			M I
+}
+L(cj4):
+ {.mmi;		cmp.CND	p8, p0 = w2, u2		C			M I
+		ADDSUB	w3 = u3, v3		C			M I
+		nop	0
+	;;
+}{.mmi;		st8	[rp] = w1, 8		C			M23
+	(p7)	cmpeqor	p8, p0 = LIM, w2	C			M I
+	(p7)	add	w2 = INCR, w2		C			M I
+}
+L(cj3):
+ {.mmi;		cmp.CND	p9, p0 = w3, u3		C			M I
+		ADDSUB	w0 = u0, v0		C			M I
+		nop	0
+	;;
+}{.mmi;		st8	[rp] = w2, 8		C			M23
+	(p8)	cmpeqor	p9, p0 = LIM, w3	C			M I
+	(p8)	add	w3 = INCR, w3		C			M I
+}{.mmi;		cmp.CND	p6, p0 = w0, u0		C			M I
+		nop	0
+		mov	r8 = 0			C			M I
+	;;
+}
+L(cj2):
+ {.mmi;		st8	[rp] = w3, 8		C			M23
+	(p9)	cmpeqor	p6, p0 = LIM, w0	C			M I
+	(p9)	add	w0 = INCR, w0		C			M I
+	;;
+}
+L(cj1):
+ {.mmb;		st8	[rp] = w0, 8		C			M23
+	(p6)	mov	r8 = 1			C			M I
+		br.ret.sptk.many b0		C			B
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/aorsorrlsh1_n.asm b/third_party/gmp/mpn/ia64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..9b58b9e
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aorsorrlsh1_n.asm
@@ -0,0 +1,48 @@
+dnl  IA-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      3.0
+C Itanium 2:    1.5
+
+
+define(LSH,		1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
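+
+C  The resulting entry points compute (with LSH = 1):
+C    mpn_addlsh1_n:  {rp,n} = {up,n} + ({vp,n} << 1)
+C    mpn_sublsh1_n:  {rp,n} = {up,n} - ({vp,n} << 1)
+C    mpn_rsblsh1_n:  {rp,n} = ({vp,n} << 1) - {up,n}
+C  each returning the top carry/borrow limb.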
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/ia64/aorsorrlsh2_n.asm b/third_party/gmp/mpn/ia64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..39b384a
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aorsorrlsh2_n.asm
@@ -0,0 +1,48 @@
+dnl  IA-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      3.0
+C Itanium 2:    1.5
+
+
+define(LSH,		2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`ia64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/ia64/aorsorrlshC_n.asm b/third_party/gmp/mpn/ia64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..2703ce2
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/aorsorrlshC_n.asm
@@ -0,0 +1,412 @@
+dnl  IA-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1.5
+
+C TODO
+C  * Use shladd in feed-in code (for mpn_addlshC_n).
+C  * Rewrite loop to schedule loads closer to use, since we do prefetch.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n',  `r35')
+
+ifdef(`DO_add', `
+  define(`ADDSUB',     `add	$1 = $2, $3')
+  define(`CMP',        `cmp.ltu	$1,p0 = $2, $3')
+  define(`INCR',       1)
+  define(`LIM',        -1)
+  define(`func',        mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUB',     `sub	$1 = $2, $3')
+  define(`CMP',        `cmp.gtu	$1,p0 = $2, $3')
+  define(`INCR',       -1)
+  define(`LIM',        0)
+  define(`func',        mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUB',     `sub	$1 = $3, $2')
+  define(`CMP',        `cmp.gtu	$1,p0 = $2, $4')
+  define(`INCR',       -1)
+  define(`LIM',        0)
+  define(`func',        mpn_rsblsh`'LSH`'_n)')
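+
+C  Sketch of the limb step (illustrative only): the shifted v operand is
+C  assembled with shrp as
+C    x[i] = (v[i] << LSH) | (v[i-1] >> (64-LSH))
+C  then w[i] = ADDSUB(u[i], x[i]) with cmpeqor/INCR carry propagation as
+C  in aors_n.  The bits shifted out of the top v limb (shr.u by 64-LSH)
+C  form the return value, plus the final carry/borrow (added for the
+C  add/sub variants, subtracted for the rsb variant).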
+
+define(PFDIST, 500)
+
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
+define(`x0',`r30') define(`x1',`r31') define(`x2',`r3')  define(`x3',`r9')
+
+C r3 r8 r9 r10 r11
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	rp = 0, rp		C			M I
+	addp4	up = 0, up		C			M I
+	nop.i	0
+	addp4	vp = 0, vp		C			M I
+	nop.m	0
+	zxt4	n = n			C			I
+	;;
+')
+ {.mmi;	ld8	r11 = [vp], 8		C			M01
+	ld8	r10 = [up], 8		C			M01
+	mov.i	r2 = ar.lc		C			I0
+}{.mmi;	and	r14 = 3, n		C			M I
+	cmp.lt	p15, p0 = 4, n		C			M I
+	add	n = -5, n		C			M I
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C			M I
+	cmp.eq	p7, p0 = 2, r14		C			M I
+	cmp.eq	p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk	.Lb01			C			B
+  (p7)	br.dptk	.Lb10			C			B
+  (p8)	br.dptk	.Lb11			C			B
+}
+
+.Lb00:
+ {.mmi;	ld8	v0 = [vp], 8		C			M01
+	ld8	u0 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+	;;
+}{.mmi;	ld8	v1 = [vp], 8		C			M01
+	ld8	u1 = [up], 8		C			M01
+	shl	x3 = r11, LSH		C			I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shrp	x0 = v0, r11, 64-LSH	C			I0
+}{.mmb;	ADDSUB(	w3, r10, x3)		C			M I
+	nop	0
+  (p15)	br.dpnt	.grt4			C			B
+	;;
+}{.mii;	CMP(	p7, w3, r10, x3)	C			M II0
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+	ADDSUB(	w0, u0, x0)		C			M I
+	;;
+}{.mii;	CMP(	p8, w0, u0, x0)		C			M I
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+	ADDSUB(	w1, u1, x1)		C			M I
+}{.mmb;	nop	0
+	nop	0
+	br	.Lcj4			C			B
+}
+ALIGN(32)
+.grt4:
+ {.mii;	ld8	v3 = [vp], 8		C			M01
+	shrp	x0 = v0, r11, 64-LSH	C			I0
+	CMP(	p8, w3, r10, x3)	C			M I
+	;;
+}{.mmi;	ld8	u3 = [up], 8		C			M01
+	add	r11 = PFDIST, vp
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+}{.mmi;	ld8	v0 = [vp], 8		C			M01
+	ADDSUB(	w0, u0, x0)		C			M I
+	nop	0
+	;;
+}{.mmi;	CMP(	p6, w0, u0, x0)		C			M I
+	add	r10 = PFDIST, up
+	mov.i	ar.lc = n		C			I0
+}{.mmb;	ADDSUB(	w1, u1, x1)		C			M I
+	ld8	u0 = [up], 8		C			M01
+	br	.LL00			C			B
+}
+
+	ALIGN(32)
+.Lb01:
+ifdef(`DO_add',
+`	shladd	w2 = r11, LSH, r10	C			M I
+	shr.u	r8 = r11, 64-LSH	C retval		I0
+  (p15)	br.dpnt	.grt1			C			B
+	;;
+',`
+	shl	x2 = r11, LSH		C			I0
+  (p15)	br.dpnt	.grt1			C			B
+	;;
+	ADDSUB(	w2, r10, x2)		C			M I
+	shr.u	r8 = r11, 64-LSH	C retval		I0
+	;;
+')
+	CMP(	p6, w2, r10, x2)	C			M I
+	br		.Lcj1
+
+.grt1:	ld8	v3 = [vp], 8		C			M01
+	ld8	u3 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+	;;
+	ld8	v0 = [vp], 8		C			M01
+	ld8	u0 = [up], 8		C			M01
+	mov.i	ar.lc = n		C FIXME swap with next	I0
+ifdef(`DO_add',
+`',`
+	ADDSUB(	w2, r10, x2)
+')
+	;;
+ {.mmi;	ld8	v1 = [vp], 8		C			M01
+	ld8	u1 = [up], 8		C			M01
+	shrp	x3 = v3, r11, 64-LSH	C			I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mmb;	CMP(	p6, w2, r10, x2)	C			M I
+	ADDSUB(	w3, u3, x3)		C			M I
+	br.cloop.dptk	.grt5		C			B
+	;;
+}{.mmi;	CMP(	p7, w3, u3, x3)		C			M I
+	ADDSUB(	w0, u0, x0)		C			M I
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+}{.mmb;	nop	0
+	nop	0
+	br	.Lcj5			C			B
+}
+.grt5:
+ {.mmi;	add	r10 = PFDIST, up
+	add	r11 = PFDIST, vp
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mmb;	ld8	v3 = [vp], 8		C			M01
+	CMP(	p8, w3, u3, x3)		C			M I
+	br	.LL01			C			B
+}
+	ALIGN(32)
+.Lb10:
+ {.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shl	x1 = r11, LSH		C			I0
+}{.mmb;	nop	0
+	nop	0
+  (p15)	br.dpnt	.grt2			C			B
+	;;
+}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
+	nop	0
+	shrp	x2 = v2, r11, 64-LSH	C			I0
+	;;
+}{.mmi;	CMP(	p9, w1, r10, x1)	C			M I
+	ADDSUB(	w2, u2, x2)		C			M I
+	shr.u	r8 = v2, 64-LSH		C retval		I0
+	;;
+}{.mmb;	CMP(	p6, w2, u2, x2)		C			M I
+	nop	0
+	br	.Lcj2			C			B
+}
+.grt2:
+ {.mmi;	ld8	v3 = [vp], 8		C			M01
+	ld8	u3 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+	;;
+}{.mmi;	ld8	v0 = [vp], 8		C			M01
+	ld8	u0 = [up], 8		C			M01
+	mov.i	ar.lc = n		C			I0
+}{.mmi;	ADDSUB(	w1, r10, x1)		C			M I
+	nop	0
+	nop	0
+	;;
+}{.mii;	ld8	v1 = [vp], 8		C			M01
+	shrp	x2 = v2, r11, 64-LSH	C			I0
+	CMP(	p8, w1, r10, x1)	C			M I
+	;;
+}{.mmi;	add	r10 = PFDIST, up
+	ld8	u1 = [up], 8		C			M01
+	shrp	x3 = v3, v2, 64-LSH	C			I0
+}{.mmi;	add	r11 = PFDIST, vp
+	ld8	v2 = [vp], 8		C			M01
+	ADDSUB(	w2, u2, x2)		C			M I
+	;;
+}{.mmi;	CMP(	p6, w2, u2, x2)		C			M I
+	ld8	u2 = [up], 8		C			M01
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mib;	ADDSUB(	w3, u3, x3)		C			M I
+	nop	0
+	br.cloop.dpnt	L(top)		C			B
+}
+	br	L(end)			C			B
+.Lb11:
+ {.mmi;	ld8	v1 = [vp], 8		C			M01
+	ld8	u1 = [up], 8		C			M01
+	shl	x0 = r11, LSH		C			I0
+	;;
+}{.mmi;	ld8	v2 = [vp], 8		C			M01
+	ld8	u2 = [up], 8		C			M01
+	shr.u	n = n, 2		C			I0
+}{.mmb;	nop	0
+	nop	0
+  (p15)	br.dpnt	.grt3			C			B
+	;;
+}{.mii;	nop	0
+	shrp	x1 = v1, r11, 64-LSH	C			I0
+	ADDSUB(	w0, r10, x0)		C			M I
+	;;
+}{.mii;	CMP(	p8, w0, r10, x0)	C			M I
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+	ADDSUB(	w1, u1, x1)		C			M I
+	;;
+}{.mmb;	CMP(	p9, w1, u1, x1)		C			M I
+	ADDSUB(	w2, u2, x2)		C			M I
+	br	.Lcj3			C			B
+}
+.grt3:
+ {.mmi;	ld8	v3 = [vp], 8		C			M01
+	ld8	u3 = [up], 8		C			M01
+	shrp	x1 = v1, r11, 64-LSH	C			I0
+}{.mmi;	ADDSUB(	w0, r10, x0)		C			M I
+	nop	0
+	nop	0
+	;;
+}{.mmi;	ld8	v0 = [vp], 8		C			M01
+	CMP(	p6, w0, r10, x0)	C			M I
+	mov.i	ar.lc = n		C			I0
+}{.mmi;	ld8	u0 = [up], 8		C			M01
+	ADDSUB(	w1, u1, x1)		C			M I
+	nop	0
+	;;
+}{.mmi;	add	r10 = PFDIST, up
+	add	r11 = PFDIST, vp
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+}{.mmb;	ld8	v1 = [vp], 8		C			M01
+	CMP(	p8, w1, u1, x1)		C			M I
+	br	.LL11			C			B
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):	st8	[rp] = w1, 8		C			M23
+	lfetch	[r10], 32
+   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
+   (p8)	add	w2 = INCR, w2		C			M I
+	ld8	v3 = [vp], 8		C			M01
+	CMP(	p8, w3, u3, x3)		C			M I
+	;;
+.LL01:	ld8	u3 = [up], 8		C			M01
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+   (p6)	cmpeqor	p8, p0 = LIM, w3	C			M I
+   (p6)	add	w3 = INCR, w3		C			M I
+	ld8	v0 = [vp], 8		C			M01
+	ADDSUB(	w0, u0, x0)		C			M I
+	;;
+	st8	[rp] = w2, 8		C			M23
+	CMP(	p6, w0, u0, x0)		C			M I
+	nop.b	0
+	ld8	u0 = [up], 8		C			M01
+	lfetch	[r11], 32
+	ADDSUB(	w1, u1, x1)		C			M I
+	;;
+.LL00:	st8	[rp] = w3, 8		C			M23
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+   (p8)	cmpeqor	p6, p0 = LIM, w0	C			M I
+   (p8)	add	w0 = INCR, w0		C			M I
+	ld8	v1 = [vp], 8		C			M01
+	CMP(	p8, w1, u1, x1)		C			M I
+	;;
+.LL11:	ld8	u1 = [up], 8		C			M01
+	shrp	x3 = v3, v2, 64-LSH	C			I0
+   (p6)	cmpeqor	p8, p0 = LIM, w1	C			M I
+   (p6)	add	w1 = INCR, w1		C			M I
+	ld8	v2 = [vp], 8		C			M01
+	ADDSUB(	w2, u2, x2)		C			M I
+	;;
+ {.mmi;	st8	[rp] = w0, 8		C			M23
+	CMP(	p6, w2, u2, x2)		C			M I
+	shrp	x0 = v0, v3, 64-LSH	C			I0
+}{.mib;
+	ld8	u2 = [up], 8		C			M01
+	ADDSUB(	w3, u3, x3)		C			M I
+	br.cloop.dptk	L(top)		C			B
+	;;
+}
+C *** MAIN LOOP END ***
+
+L(end):
+ {.mmi;	st8	[rp] = w1, 8		C			M23
+   (p8)	cmpeqor	p6, p0 = LIM, w2	C			M I
+	shrp	x1 = v1, v0, 64-LSH	C			I0
+}{.mmi;
+   (p8)	add	w2 = INCR, w2		C			M I
+	CMP(	p7, w3, u3, x3)		C			M I
+	ADDSUB(	w0, u0, x0)		C			M I
+	;;
+}
+.Lcj5:
+ {.mmi;	st8	[rp] = w2, 8		C			M23
+   (p6)	cmpeqor	p7, p0 = LIM, w3	C			M I
+	shrp	x2 = v2, v1, 64-LSH	C			I0
+}{.mmi;
+   (p6)	add	w3 = INCR, w3		C			M I
+	CMP(	p8, w0, u0, x0)		C			M I
+	ADDSUB(	w1, u1, x1)		C			M I
+	;;
+}
+.Lcj4:
+ {.mmi;	st8	[rp] = w3, 8		C			M23
+   (p7)	cmpeqor	p8, p0 = LIM, w0	C			M I
+	mov.i	ar.lc = r2		C			I0
+}{.mmi;
+   (p7)	add	w0 = INCR, w0		C			M I
+	CMP(	p9, w1, u1, x1)		C			M I
+	ADDSUB(	w2, u2, x2)		C			M I
+	;;
+}
+.Lcj3:
+ {.mmi;	st8	[rp] = w0, 8		C			M23
+   (p8)	cmpeqor	p9, p0 = LIM, w1	C			M I
+	shr.u	r8 = v2, 64-LSH		C			I0
+}{.mmi;
+   (p8)	add	w1 = INCR, w1		C			M I
+	CMP(	p6, w2, u2, x2)		C			M I
+	nop	0
+	;;
+}
+.Lcj2:
+ {.mmi;	st8	[rp] = w1, 8		C			M23
+   (p9)	cmpeqor	p6, p0 = LIM, w2	C			M I
+   (p9)	add	w2 = INCR, w2		C			M I
+	;;
+}
+.Lcj1:
+ {.mmb;	st8	[rp] = w2		C			M23
+ifdef(`DO_rsb',`
+   (p6)	add	r8 = -1, r8		C			M I
+',`
+   (p6)	add	r8 = 1, r8		C			M I
+')	br.ret.sptk.many b0		C			B
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/bdiv_dbm1c.asm b/third_party/gmp/mpn/ia64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..47e4553
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/bdiv_dbm1c.asm
@@ -0,0 +1,516 @@
+dnl  IA-64 mpn_bdiv_dbm1c.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2009 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4
+C Itanium 2:  2
+
+C TODO
+C  * Optimize feed-in and wind-down code, both for speed and code size.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`bd', `r35')
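+
+C  Limb recurrence implemented below (sketch only, mirroring the generic
+C  C mpn_bdiv_dbm1c; the accumulator h arrives in r36):
+C
+C    for (i = 0; i < n; i++) {
+C      umul_ppmm (p1, p0, up[i], bd);
+C      cy = h < p0;  h = h - p0;  rp[i] = h;
+C      h = h - p1 - cy;
+C    }
+C    return h;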
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+	.prologue
+	.save		ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmb
+	mov		r15 = r36		C M I
+	ldf8		f9 = [up], 8		C M
+	nop.b		0			C B
+}
+.Lcommon:
+{.mii
+	adds		r16 = -1, n		C M I
+	mov		r2 = ar.lc		C I0
+	and		r14 = 3, n		C M I
+	;;
+}
+{.mii
+	setf.sig	f6 = bd			C M2 M3
+	shr.u		r31 = r16, 2		C I0
+	cmp.eq		p10, p0 = 0, r14	C M I
+}
+{.mii
+	nop.m		0			C M
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	;;
+}
+{.mii
+	cmp.ne		p6, p7 = r0, r0		C M I
+	mov.i		ar.lc = r31		C I0
+	cmp.ne		p8, p9 = r0, r0		C M I
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	br.cloop.dptk	.grt1
+	;;
+	xma.l		f38 = f9, f6, f0
+	xma.hu		f39 = f9, f6, f0
+	;;
+	getf.sig	r26 = f38
+	getf.sig	r27 = f39
+	br		.Lcj1
+
+.grt1:	ldf8		f10 = [r33], 8
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	xma.l		f38 = f9, f6, f0
+	xma.hu		f39 = f9, f6, f0
+	;;
+	ldf8		f13 = [r33], 8
+	;;
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.grt5
+
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	br		.Lcj5
+
+.grt5:	ldf8		f10 = [r33], 8
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	ldf8		f11 = [r33], 8
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	ldf8		f12 = [r33], 8
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	br		.LL01
+
+.Lb10:	ldf8		f13 = [r33], 8
+	br.cloop.dptk	.grt2
+	;;
+
+	xma.l		f36 = f9, f6, f0
+	xma.hu		f37 = f9, f6, f0
+	;;
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r24 = f36
+	;;
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	;;
+	getf.sig	r27 = f39
+	br		.Lcj2
+
+.grt2:	ldf8		f10 = [r33], 8
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	xma.l		f36 = f9, f6, f0
+	xma.hu		f37 = f9, f6, f0
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	ldf8		f13 = [r33], 8
+	;;
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.grt6
+
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	br		.Lcj6
+
+.grt6:	getf.sig	r25 = f37
+	ldf8		f10 = [r33], 8
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r27 = f39
+	ldf8		f11 = [r33], 8
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	br		.LL10
+
+
+.Lb11:	ldf8		f12 = [r33], 8
+	;;
+	ldf8		f13 = [r33], 8
+	br.cloop.dptk	.grt3
+	;;
+
+	xma.l		f34 = f9, f6, f0
+	xma.hu		f35 = f9, f6, f0
+	;;
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	;;
+	getf.sig	r24 = f36
+	;;
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	br		.Lcj3
+
+.grt3:	ldf8		f10 = [r33], 8
+	;;
+	xma.l		f34 = f9, f6, f0
+	xma.hu		f35 = f9, f6, f0
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	ldf8		f13 = [r33], 8
+	;;
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.grt7
+
+	getf.sig	r25 = f37
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	br		.Lcj7
+
+.grt7:	getf.sig	r25 = f37
+	ldf8		f10 = [r33], 8
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	br		.LL11
+
+
+.Lb00:	ldf8		f11 = [r33], 8
+	;;
+	ldf8		f12 = [r33], 8
+	;;
+	ldf8		f13 = [r33], 8
+	br.cloop.dptk	.grt4
+	;;
+
+	xma.l		f32 = f9, f6, f0
+	xma.hu		f33 = f9, f6, f0
+	;;
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	;;
+	getf.sig	r24 = f36
+	br		.Lcj4
+
+.grt4:	xma.l		f32 = f9, f6, f0
+	xma.hu		f33 = f9, f6, f0
+	;;
+	ldf8		f10 = [r33], 8
+	;;
+	xma.l		f34 = f11, f6, f0
+	xma.hu		f35 = f11, f6, f0
+	;;
+	ldf8		f11 = [r33], 8
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+	xma.hu		f37 = f12, f6, f0
+	;;
+	getf.sig	r21 = f33
+	ldf8		f12 = [r33], 8
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+	xma.hu		f39 = f13, f6, f0
+	;;
+	getf.sig	r23 = f35
+	ldf8		f13 = [r33], 8
+	;;
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+	xma.hu		f33 = f10, f6, f0
+	br.cloop.dptk	.LL00
+	br		.Lcj8
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Ltop:
+	.pred.rel "mutex",p6,p7
+C	.mfi
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+  (p6)	sub		r15 = r19, r27, 1
+C	.mfi
+	st8		[r32] = r19, 8
+	xma.hu		f33 = f10, f6, f0
+  (p7)	sub		r15 = r19, r27
+	;;
+.LL00:
+C	.mfi
+	getf.sig	r25 = f37
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r20
+C	.mib
+	ldf8		f10 = [r33], 8
+	sub		r16 = r15, r20
+	nop.b 0
+	;;
+
+C	.mfi
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+  (p6)	sub		r15 = r16, r21, 1
+C	.mfi
+	st8		[r32] = r16, 8
+	xma.hu		f35 = f11, f6, f0
+  (p7)	sub		r15 = r16, r21
+	;;
+.LL11:
+C	.mfi
+	getf.sig	r27 = f39
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r22
+C	.mib
+	ldf8		f11 = [r33], 8
+	sub		r17 = r15, r22
+	nop.b 0
+	;;
+
+C	.mfi
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+  (p6)	sub		r15 = r17, r23, 1
+C	.mfi
+	st8		[r32] = r17, 8
+	xma.hu		f37 = f12, f6, f0
+  (p7)	sub		r15 = r17, r23
+	;;
+.LL10:
+C	.mfi
+	getf.sig	r21 = f33
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r24
+C	.mib
+	ldf8		f12 = [r33], 8
+	sub		r18 = r15, r24
+	nop.b 0
+	;;
+
+C	.mfi
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+  (p6)	sub		r15 = r18, r25, 1
+C	.mfi
+	st8		[r32] = r18, 8
+	xma.hu		f39 = f13, f6, f0
+  (p7)	sub		r15 = r18, r25
+	;;
+.LL01:
+C	.mfi
+	getf.sig	r23 = f35
+	nop.f 0
+	cmp.ltu		p6, p7 = r15, r26
+C	.mib
+	ldf8		f13 = [r33], 8
+	sub		r19 = r15, r26
+	br.cloop.sptk.few .Ltop
+C *** MAIN LOOP END ***
+	;;
+
+	getf.sig	r24 = f36
+	xma.l		f32 = f10, f6, f0
+  (p6)	sub		r15 = r19, r27, 1
+	st8		[r32] = r19, 8
+	xma.hu		f33 = f10, f6, f0
+  (p7)	sub		r15 = r19, r27
+	;;
+.Lcj8:	getf.sig	r25 = f37
+	cmp.ltu		p6, p7 = r15, r20
+	sub		r16 = r15, r20
+	;;
+	getf.sig	r26 = f38
+	xma.l		f34 = f11, f6, f0
+  (p6)	sub		r15 = r16, r21, 1
+	st8		[r32] = r16, 8
+	xma.hu		f35 = f11, f6, f0
+  (p7)	sub		r15 = r16, r21
+	;;
+.Lcj7:	getf.sig	r27 = f39
+	cmp.ltu		p6, p7 = r15, r22
+	sub		r17 = r15, r22
+	;;
+	getf.sig	r20 = f32
+	xma.l		f36 = f12, f6, f0
+  (p6)	sub		r15 = r17, r23, 1
+	st8		[r32] = r17, 8
+	xma.hu		f37 = f12, f6, f0
+  (p7)	sub		r15 = r17, r23
+	;;
+.Lcj6:	getf.sig	r21 = f33
+	cmp.ltu		p6, p7 = r15, r24
+	sub		r18 = r15, r24
+	;;
+	getf.sig	r22 = f34
+	xma.l		f38 = f13, f6, f0
+  (p6)	sub		r15 = r18, r25, 1
+	st8		[r32] = r18, 8
+	xma.hu		f39 = f13, f6, f0
+  (p7)	sub		r15 = r18, r25
+	;;
+.Lcj5:	getf.sig	r23 = f35
+	cmp.ltu		p6, p7 = r15, r26
+	sub		r19 = r15, r26
+	;;
+	getf.sig	r24 = f36
+  (p6)	sub		r15 = r19, r27, 1
+	st8		[r32] = r19, 8
+  (p7)	sub		r15 = r19, r27
+	;;
+.Lcj4:	getf.sig	r25 = f37
+	cmp.ltu		p6, p7 = r15, r20
+	sub		r16 = r15, r20
+	;;
+	getf.sig	r26 = f38
+  (p6)	sub		r15 = r16, r21, 1
+	st8		[r32] = r16, 8
+  (p7)	sub		r15 = r16, r21
+	;;
+.Lcj3:	getf.sig	r27 = f39
+	cmp.ltu		p6, p7 = r15, r22
+	sub		r17 = r15, r22
+	;;
+  (p6)	sub		r15 = r17, r23, 1
+	st8		[r32] = r17, 8
+  (p7)	sub		r15 = r17, r23
+	;;
+.Lcj2:	cmp.ltu		p6, p7 = r15, r24
+	sub		r18 = r15, r24
+	;;
+  (p6)	sub		r15 = r18, r25, 1
+	st8		[r32] = r18, 8
+  (p7)	sub		r15 = r18, r25
+	;;
+.Lcj1:	cmp.ltu		p6, p7 = r15, r26
+	sub		r19 = r15, r26
+	;;
+  (p6)	sub		r8 = r19, r27, 1
+	st8		[r32] = r19
+  (p7)	sub		r8 = r19, r27
+	mov ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/cnd_aors_n.asm b/third_party/gmp/mpn/ia64/cnd_aors_n.asm
new file mode 100644
index 0000000..edd0552
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/cnd_aors_n.asm
@@ -0,0 +1,264 @@
+dnl  IA-64 mpn_cnd_add_n/mpn_cnd_sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1.5
+
+C INPUT PARAMETERS
+define(`cnd', `r32')
+define(`rp',  `r33')
+define(`up',  `r34')
+define(`vp',  `r35')
+define(`n',   `r36')
+
+ifdef(`OPERATION_cnd_add_n',`
+  define(ADDSUB,	add)
+  define(CND,		ltu)
+  define(INCR,		1)
+  define(LIM,		-1)
+  define(func,    mpn_cnd_add_n)
+')
+ifdef(`OPERATION_cnd_sub_n',`
+  define(ADDSUB,	sub)
+  define(CND,		gtu)
+  define(INCR,		-1)
+  define(LIM,		0)
+  define(func,    mpn_cnd_sub_n)
+')
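+
+C  The condition is first widened to a full-limb mask (cnd != 0 yields -1,
+C  else 0), so every limb step below is branch-free with respect to cnd
+C  (sketch, illustrative only):
+C
+C    x = vp[i] & mask;
+C    rp[i] = up[i] ADDSUB x;    /* carry/borrow via cmpeqor/INCR as usual */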
+
+define(PFDIST, 160)
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`x0',`r20') define(`x1',`r21') define(`x2',`r22') define(`x3',`r23')
+define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
+define(`w0',`r28') define(`w1',`r29') define(`w2',`r30') define(`w3',`r31')
+define(`up1',`up') define(`up2',`r8') define(`upadv',`r1')
+define(`vp1',`vp') define(`vp2',`r9') define(`vpadv',`r11')
+define(`rp1',`rp') define(`rp2',`r10')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4	rp = 0, rp		C				M I
+	addp4	up = 0, up		C				M I
+	nop.i	0
+	addp4	vp = 0, vp		C				M I
+	nop.m	0
+	zxt4	n = n			C				I
+	;;
+')
+ {.mmi;	and	r3 = 3, n		C				M I
+	add	n = -1, n		C				M I
+	mov	r2 = ar.lc		C				I0
+}{.mmi;	cmp.ne	p6, p7 = 0, cnd		C				M I
+	add	vp2 = 8, vp		C				M I
+	add	up2 = 8, up		C				M I
+	;;
+}{.mmi;	add	upadv = PFDIST, up	C				M I
+	add	vpadv = PFDIST, vp	C				M I
+	shr.u	n = n, 2		C				I0
+	.pred.rel "mutex", p6, p7
+}{.mmi;	add	rp2 = 8, rp		C				M I
+   (p6)	mov	cnd = -1		C				M I
+   (p7)	mov	cnd = 0			C				M I
+	;;
+}	cmp.eq	p9, p0 = 1, r3		C				M I
+	cmp.eq	p7, p0 = 2, r3		C				M I
+	cmp.eq	p8, p0 = 3, r3		C				M I
+   (p9)	br	L(b1)			C				B
+   (p7)	br	L(b2)			C				B
+   (p8)	br	L(b3)			C				B
+	;;
+L(b0):
+ {.mmi;	ld8	v2 = [vp1], 16		C				M01
+	ld8	v3 = [vp2], 16		C				M01
+	mov	ar.lc = n		C				I0
+	;;
+}	ld8	u2 = [up1], 16		C				M01
+	ld8	u3 = [up2], 16		C				M01
+	and	x2 = v2, cnd		C				M I
+	and	x3 = v3, cnd		C				M I
+	;;
+	ADDSUB	w2 = u2, x2		C				M I
+	ADDSUB	w3 = u3, x3		C				M I
+	;;
+	ld8	v0 = [vp1], 16		C				M01
+	ld8	v1 = [vp2], 16		C				M01
+	cmp.CND	p8, p0 = w2, u2		C				M I
+	cmp.CND	p9, p0 = w3, u3		C				M I
+	br	L(lo0)
+
+L(b1):	ld8	v1 = [vp1], 8		C				M01
+	add	vp2 = 8, vp2		C				M I
+	add	rp2 = 8, rp2		C				M I
+	;;
+	ld8	u1 = [up1], 8		C				M01
+	add	up2 = 8, up2		C				M I
+	and	x1 = v1, cnd		C				M I
+	;;
+	ADDSUB	w1 = u1, x1		C				M I
+	cmp.ne	p10, p0 = 0, n
+	add	n = -1, n
+	;;
+	cmp.CND	p7, p0 = w1, u1		C				M I
+	st8	[rp1] = w1, 8		C				M23
+  (p10)	br	L(b0)
+	;;
+	mov	r8 = 0			C				M I
+	br	L(e1)
+
+L(b3):	ld8	v3 = [vp1], 8		C				M01
+	add	vp2 = 8, vp2		C				M I
+	add	rp2 = 8, rp2		C				M I
+	;;
+	ld8	u3 = [up1], 8		C				M01
+	add	up2 = 8, up2		C				M I
+	and	x3 = v3, cnd		C				M I
+	;;
+	ADDSUB	w3 = u3, x3		C				M I
+	;;
+	cmp.CND	p9, p0 = w3, u3		C				M I
+	st8	[rp1] = w3, 8		C				M23
+	C fall through
+
+L(b2):
+ {.mmi;	ld8	v0 = [vp1], 16		C				M01
+	ld8	v1 = [vp2], 16		C				M01
+	mov	ar.lc = n		C				I0
+	;;
+}	ld8	u0 = [up1], 16		C				M01
+	ld8	u1 = [up2], 16		C				M01
+	and	x0 = v0, cnd		C				M I
+	and	x1 = v1, cnd		C				M I
+	;;
+	ADDSUB	w0 = u0, x0		C				M I
+	ADDSUB	w1 = u1, x1		C				M I
+	br.cloop.dptk	L(gt2)		C				B
+	;;
+	cmp.CND	p6, p0 = w0, u0		C				M I
+	br		L(e2)		C				B
+L(gt2):
+	ld8	v2 = [vp1], 16		C				M01
+	ld8	v3 = [vp2], 16		C				M01
+	cmp.CND	p6, p0 = w0, u0		C				M I
+	cmp.CND	p7, p0 = w1, u1		C				M I
+	br		L(lo2)		C				B
+
+
+C *** MAIN LOOP START ***
+C	ALIGN(32)
+L(top):
+ {.mmi;	ld8	v2 = [vp1], 16		C				M01
+	ld8	v3 = [vp2], 16		C				M01
+	cmp.CND	p6, p0 = w0, u0		C				M I
+}{.mmi;	st8	[rp1] = w2, 16		C				M23
+	st8	[rp2] = w3, 16		C				M23
+	cmp.CND	p7, p0 = w1, u1		C				M I
+	;;
+}
+L(lo2):
+ {.mmi;	ld8	u2 = [up1], 16		C				M01
+	ld8	u3 = [up2], 16		C				M01
+   (p9)	cmpeqor	p6, p0 = LIM, w0	C				M I
+}{.mmi;	and	x2 = v2, cnd		C				M I
+	and	x3 = v3, cnd		C				M I
+   (p9)	add	w0 = INCR, w0		C				M I
+	;;
+}{.mmi;	ADDSUB	w2 = u2, x2		C				M I
+   (p6)	cmpeqor	p7, p0 = LIM, w1	C				M I
+   (p6)	add	w1 = INCR, w1		C				M I
+}{.mmi;	ADDSUB	w3 = u3, x3		C				M I
+	lfetch	[upadv], 32
+	nop	0
+	;;
+}{.mmi;	ld8	v0 = [vp1], 16		C				M01
+	ld8	v1 = [vp2], 16		C				M01
+	cmp.CND	p8, p0 = w2, u2		C				M I
+}{.mmi;	st8	[rp1] = w0, 16		C				M23
+	st8	[rp2] = w1, 16		C				M23
+	cmp.CND	p9, p0 = w3, u3		C				M I
+	;;
+}
+L(lo0):
+ {.mmi;	ld8	u0 = [up1], 16		C				M01
+	ld8	u1 = [up2], 16		C				M01
+   (p7)	cmpeqor	p8, p0 = LIM, w2	C				M I
+}{.mmi;	and	x0 = v0, cnd		C				M I
+	and	x1 = v1, cnd		C				M I
+   (p7)	add	w2 = INCR, w2		C				M I
+	;;
+}{.mmi;	ADDSUB	w0 = u0, x0		C				M I
+   (p8)	cmpeqor	p9, p0 = LIM, w3	C				M I
+   (p8)	add	w3 = INCR, w3		C				M I
+}{.mmb;	ADDSUB	w1 = u1, x1		C				M I
+	lfetch	[vpadv], 32
+	br.cloop.dptk	L(top)		C				B
+	;;
+}
+C *** MAIN LOOP END ***
+
+
+L(end):
+ {.mmi;	st8	[rp1] = w2, 16		C				M23
+	st8	[rp2] = w3, 16		C				M23
+	cmp.CND	p6, p0 = w0, u0		C				M I
+	;;
+}
+L(e2):
+ {.mmi;	cmp.CND	p7, p0 = w1, u1		C				M I
+   (p9)	cmpeqor	p6, p0 = LIM, w0	C				M I
+   (p9)	add	w0 = INCR, w0		C				M I
+	;;
+}{.mmi;	mov	r8 = 0			C				M I
+   (p6)	cmpeqor	p7, p0 = LIM, w1	C				M I
+   (p6)	add	w1 = INCR, w1		C				M I
+	;;
+}{.mmi;	st8	[rp1] = w0, 16		C				M23
+	st8	[rp2] = w1, 16		C				M23
+	mov	ar.lc = r2		C				I0
+}
+L(e1):
+ {.mmb;	nop	0
+   (p7)	mov	r8 = 1			C				M I
+	br.ret.sptk.many b0		C				B
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/copyd.asm b/third_party/gmp/mpn/ia64/copyd.asm
new file mode 100644
index 0000000..b94a1af
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/copyd.asm
@@ -0,0 +1,186 @@
+dnl  IA-64 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    1
+C Itanium 2:  0.5
+
+C INPUT PARAMETERS
+C rp = r32
+C sp = r33
+C n = r34
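+
+C  Functionally (sketch): copy n limbs from the top down,
+C    for (i = n - 1; i >= 0; i--) rp[i] = sp[i];
+C  so the copy is safe for overlapping operands with rp >= sp.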
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	.prologue
+	.save ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	addp4		r33 = 0, r33
+	sxt4		r34 = r34
+	;;
+')
+{.mmi
+	shladd		r32 = r34, 3, r32
+	shladd		r33 = r34, 3, r33
+	mov.i		r2 = ar.lc
+}
+{.mmi
+	and		r14 = 3, r34
+	cmp.ge		p14, p15 = 3, r34
+	add		r34 = -4, r34
+	;;
+}
+{.mmi
+	cmp.eq		p8, p0 = 1, r14
+	cmp.eq		p10, p0 = 2, r14
+	cmp.eq		p12, p0 = 3, r14
+}
+{.bbb
+  (p8)	br.dptk		.Lb01
+  (p10)	br.dptk		.Lb10
+  (p12)	br.dptk		.Lb11
+}
+
+.Lb00:	C  n = 0, 4, 8, 12, ...
+	add		r32 = -8, r32
+	add		r33 = -8, r33
+  (p14)	br.dptk		.Ls00
+	;;
+	add		r21 = -8, r33
+	ld8		r16 = [r33], -16
+	shr		r15 = r34, 2
+	;;
+	ld8		r17 = [r21], -16
+	mov.i		ar.lc = r15
+	ld8		r18 = [r33], -16
+	add		r20 = -8, r32
+	;;
+	ld8		r19 = [r21], -16
+	br.cloop.dptk	.Loop
+	;;
+	br.sptk		.Lend
+	;;
+
+.Lb01:	C  n = 1, 5, 9, 13, ...
+	add		r21 = -8, r33
+	add		r20 = -8, r32
+	add		r33 = -16, r33
+	add		r32 = -16, r32
+	;;
+	ld8		r19 = [r21], -16
+	shr		r15 = r34, 2
+  (p14)	br.dptk		.Ls01
+	;;
+	ld8		r16 = [r33], -16
+	mov.i		ar.lc = r15
+	;;
+	ld8		r17 = [r21], -16
+	ld8		r18 = [r33], -16
+	br.sptk		.Li01
+	;;
+
+.Lb10:	C  n = 2, 6, 10, 14, ...
+	add		r21 = -16, r33
+	shr		r15 = r34, 2
+	add		r20 = -16, r32
+	add		r32 = -8, r32
+	add		r33 = -8, r33
+	;;
+	ld8		r18 = [r33], -16
+	ld8		r19 = [r21], -16
+	mov.i		ar.lc = r15
+  (p14)	br.dptk		.Ls10
+	;;
+	ld8		r16 = [r33], -16
+	ld8		r17 = [r21], -16
+	br.sptk		.Li10
+	;;
+
+.Lb11:	C  n = 3, 7, 11, 15, ...
+	add		r21 = -8, r33
+	add		r20 = -8, r32
+	add		r33 = -16, r33
+	add		r32 = -16, r32
+	;;
+	ld8		r17 = [r21], -16
+	shr		r15 = r34, 2
+	;;
+	ld8		r18 = [r33], -16
+	mov.i		ar.lc = r15
+	ld8		r19 = [r21], -16
+  (p14)	br.dptk		.Ls11
+	;;
+	ld8		r16 = [r33], -16
+	br.sptk		.Li11
+	;;
+
+	ALIGN(32)
+.Loop:
+.Li00:
+{.mmb
+	st8		[r32] = r16, -16
+	ld8		r16 = [r33], -16
+	nop.b		0
+}
+.Li11:
+{.mmb
+	st8		[r20] = r17, -16
+	ld8		r17 = [r21], -16
+	nop.b		0
+	;;
+}
+.Li10:
+{.mmb
+	st8		[r32] = r18, -16
+	ld8		r18 = [r33], -16
+	nop.b		0
+}
+.Li01:
+{.mmb
+	st8		[r20] = r19, -16
+	ld8		r19 = [r21], -16
+	br.cloop.dptk	.Loop
+	;;
+}
+.Lend:	st8		[r32] = r16, -16
+.Ls11:	st8		[r20] = r17, -16
+	;;
+.Ls10:	st8		[r32] = r18, -16
+.Ls01:	st8		[r20] = r19, -16
+.Ls00:	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/copyi.asm b/third_party/gmp/mpn/ia64/copyi.asm
new file mode 100644
index 0000000..49ed192
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/copyi.asm
@@ -0,0 +1,182 @@
+dnl  IA-64 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    1
+C Itanium 2:  0.5
+
+C INPUT PARAMETERS
+C rp = r32
+C sp = r33
+C n = r34
+
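+C The code below is a 4-way unrolled, software-pipelined version of this
+C reference loop (a C sketch, not part of the build):
+C
+C   void
+C   copyi_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
+C   {
+C     mp_size_t i;
+C     /* Copying from the low limb upwards also makes the copy safe
+C        for overlapping operands with rp <= up.  */
+C     for (i = 0; i < n; i++)
+C       rp[i] = up[i];
+C   }
+C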
+ASM_START()
+PROLOGUE(mpn_copyi)
+	.prologue
+	.save ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	addp4		r33 = 0, r33
+	sxt4		r34 = r34
+	;;
+')
+{.mmi
+	nop		0
+	nop		0
+	mov.i		r2 = ar.lc
+}
+{.mmi
+	and		r14 = 3, r34
+	cmp.ge		p14, p15 = 3, r34
+	add		r34 = -4, r34
+	;;
+}
+{.mmi
+	cmp.eq		p8, p0 = 1, r14
+	cmp.eq		p10, p0 = 2, r14
+	cmp.eq		p12, p0 = 3, r14
+}
+{.bbb
+  (p8)	br.dptk		.Lb01
+  (p10)	br.dptk		.Lb10
+  (p12)	br.dptk		.Lb11
+}
+
+.Lb00:	C  n = 0, 4, 8, 12, ...
+  (p14)	br.dptk		.Ls00
+	;;
+	add		r21 = 8, r33
+	ld8		r16 = [r33], 16
+	shr		r15 = r34, 2
+	;;
+	ld8		r17 = [r21], 16
+	mov.i		ar.lc = r15
+	ld8		r18 = [r33], 16
+	add		r20 = 8, r32
+	;;
+	ld8		r19 = [r21], 16
+	br.cloop.dptk	.Loop
+	;;
+	br.sptk		.Lend
+	;;
+
+.Lb01:	C  n = 1, 5, 9, 13, ...
+	add		r21 = 0, r33
+	add		r20 = 0, r32
+	add		r33 = 8, r33
+	add		r32 = 8, r32
+	;;
+	ld8		r19 = [r21], 16
+	shr		r15 = r34, 2
+  (p14)	br.dptk		.Ls01
+	;;
+	ld8		r16 = [r33], 16
+	mov.i		ar.lc = r15
+	;;
+	ld8		r17 = [r21], 16
+	ld8		r18 = [r33], 16
+	br.sptk		.Li01
+	;;
+
+.Lb10:	C  n = 2, 6, 10, 14, ...
+	add		r21 = 8, r33
+	add		r20 = 8, r32
+	ld8		r18 = [r33], 16
+	shr		r15 = r34, 2
+	;;
+	ld8		r19 = [r21], 16
+	mov.i		ar.lc = r15
+  (p14)	br.dptk		.Ls10
+	;;
+	ld8		r16 = [r33], 16
+	ld8		r17 = [r21], 16
+	br.sptk		.Li10
+	;;
+
+.Lb11:	C  n = 3, 7, 11, 15, ...
+	add		r21 = 0, r33
+	add		r20 = 0, r32
+	add		r33 = 8, r33
+	add		r32 = 8, r32
+	;;
+	ld8		r17 = [r21], 16
+	shr		r15 = r34, 2
+	;;
+	ld8		r18 = [r33], 16
+	mov.i		ar.lc = r15
+	ld8		r19 = [r21], 16
+  (p14)	br.dptk		.Ls11
+	;;
+	ld8		r16 = [r33], 16
+	br.sptk		.Li11
+	;;
+
+	ALIGN(32)
+.Loop:
+.Li00:
+{.mmb
+	st8		[r32] = r16, 16
+	ld8		r16 = [r33], 16
+	nop.b		0
+}
+.Li11:
+{.mmb
+	st8		[r20] = r17, 16
+	ld8		r17 = [r21], 16
+	nop.b		0
+	;;
+}
+.Li10:
+{.mmb
+	st8		[r32] = r18, 16
+	ld8		r18 = [r33], 16
+	nop.b		0
+}
+.Li01:
+{.mmb
+	st8		[r20] = r19, 16
+	ld8		r19 = [r21], 16
+	br.cloop.dptk	.Loop
+	;;
+}
+.Lend:	st8		[r32] = r16, 16
+.Ls11:	st8		[r20] = r17, 16
+	;;
+.Ls10:	st8		[r32] = r18, 16
+.Ls01:	st8		[r20] = r19, 16
+.Ls00:	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/dive_1.asm b/third_party/gmp/mpn/ia64/dive_1.asm
new file mode 100644
index 0000000..5e4a273
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/dive_1.asm
@@ -0,0 +1,236 @@
+dnl  IA-64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
+
+dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C Itanium:      16
+C Itanium 2:     8
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`divisor', `r35')
+
+define(`lshift', `r24')
+define(`rshift', `r25')
+
+C This code is a bit messy, and not as similar to mode1o.asm as desired.
+
+C The critical path during initialization is for computing the inverse of the
+C divisor.  Since odd divisors are probably common, we conditionally execute
+C the initial count_trailing_zeros code and the downshift.
+
+C Possible improvement: Merge more of the feed-in code into the inverse
+C computation.
+
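+C In outline the method is the usual exact-division scheme (a C sketch,
+C not part of the build; all arithmetic is mod 2^64):
+C
+C   inv = tab[d & 0xFF];        /* 8-bit inverse; .Ltab below holds the
+C                                  odd entries, even slots are padding */
+C   inv = 2*inv - inv*inv*d;    /* Newton step: 16 correct bits        */
+C   inv = 2*inv - inv*inv*d;    /* 32 bits                             */
+C   inv = 2*inv - inv*inv*d;    /* 64 bits, so inv*d == 1 mod 2^64     */
+C
+C   c = 0;
+C   for (i = 0; i < n; i++)
+C     {
+C       q = (up[i] - c) * inv;  /* exact quotient limb, mod 2^64       */
+C       rp[i] = q;
+C       c = highpart (q*d + c); /* xma.hu: high 64 bits                */
+C     }
+C
+C Even divisors are first shifted right by their trailing zeros, and the
+C up[] limbs are re-assembled from shifted pairs as the loop runs.
+C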
+ASM_START()
+	.text
+	.align	32
+.Ltab:
+data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
+data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
+data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
+data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
+data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
+data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
+data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
+data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
+data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
+data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
+data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
+data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
+data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
+data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
+data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
+data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
+
+
+PROLOGUE(mpn_divexact_1)
+	.prologue
+	.save		ar.lc, r2
+	.body
+
+ {.mmi;	add		r8 = -1, divisor	C M0
+	nop		0			C M1
+	tbit.z		p8, p9 = divisor, 0	C I0
+}
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M2  rp extend
+	addp4		up = 0, up		C M3  up extend
+	sxt4		n = n')			C I1  size extend
+	;;
+.Lhere:
+ {.mmi;	ld8		r20 = [up], 8		C M0  up[0]
+  (p8)	andcm		r8 = r8, divisor	C M1
+	mov		r15 = ip		C I0  .Lhere
+	;;
+}{.mii
+	.pred.rel "mutex", p8, p9
+  (p9)	mov		rshift = 0		C M0
+  (p8)	popcnt		rshift = r8		C I0 r8 = cnt_lo_zeros(divisor)
+	cmp.eq		p6, p10 = 1, n		C I1
+	;;
+}{.mii;	add		r9 = .Ltab-.Lhere, r15	C M0
+  (p8)	shr.u		divisor = divisor, rshift C I0
+	nop		0			C I1
+	;;
+}{.mmi;	add		n = -4, n		C M0  size-1
+  (p10)	ld8		r21 = [up], 8		C M1  up[1]
+	mov		r14 = 2			C M1  2
+}{.mfi;	setf.sig	f6 = divisor		C M2  divisor
+	mov		f9 = f0			C M3  carry		FIXME
+	zxt1		r3 = divisor		C I1  divisor low byte
+	;;
+}{.mmi;	add		r3 = r9, r3		C M0  table offset ip and index
+	sub		r16 = 0, divisor	C M1  -divisor
+	mov		r2 = ar.lc		C I0
+}{.mmi;	sub		lshift = 64, rshift	C M2
+	setf.sig	f13 = r14		C M3  2 in significand
+	mov		r17 = -1		C I1  -1
+	;;
+}{.mmi;	ld1		r3 = [r3]		C M0  inverse, 8 bits
+	nop		0			C M1
+	mov		ar.lc = n		C I0  size-1 loop count
+}{.mmi;	setf.sig	f12 = r16		C M2  -divisor
+	setf.sig	f8 = r17		C M3  -1
+	cmp.eq		p7, p0 = -2, n		C I1
+	;;
+}{.mmi;	setf.sig	f7 = r3			C M2  inverse, 8 bits
+	cmp.eq		p8, p0 = -1, n		C M0
+	shr.u		r23 = r20, rshift	C I0
+	;;
+}
+
+	C f6	divisor
+	C f7	inverse, being calculated
+	C f8	-1, will be -inverse
+	C f9	carry
+	C f12	-divisor
+	C f13	2
+	C f14	scratch
+
+	xmpy.l		f14 = f13, f7		C Newton 2*i
+	xmpy.l		f7 = f7, f7		C Newton i*i
+	;;
+	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 16 bits
+	;;
+	setf.sig	f10 = r23		C speculative, used iff n = 1
+	xmpy.l		f14 = f13, f7		C Newton 2*i
+	shl		r22 = r21, lshift	C speculative, used iff n > 1
+	xmpy.l		f7 = f7, f7		C Newton i*i
+	;;
+	or		r31 = r22, r23		C speculative, used iff n > 1
+	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 32 bits
+	shr.u		r23 = r21, rshift	C speculative, used iff n > 1
+	;;
+	setf.sig	f11 = r31		C speculative, used iff n > 1
+	xmpy.l		f14 = f13, f7		C Newton 2*i
+	xmpy.l		f7 = f7, f7		C Newton i*i
+	;;
+	xma.l		f7 = f7, f12, f14	C Newton i*i*-d + 2*i, 64 bits
+
+  (p7)	br.cond.dptk	.Ln2
+  (p10)	br.cond.dptk	.grt3
+	;;
+
+.Ln1:	xmpy.l		f12 = f10, f7		C q = ulimb * inverse
+	br		.Lx1
+
+.Ln2:
+	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
+	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
+	setf.sig	f11 = r23
+	br		.Lx2
+
+.grt3:
+	ld8		r21 = [up], 8		C up[2]
+	xmpy.l		f8 = f7, f8		C -inverse = inverse * -1
+	;;
+	shl		r22 = r21, lshift
+	;;
+	xmpy.l		f12 = f11, f7		C q = ulimb * inverse
+	;;
+	or		r31 = r22, r23
+	shr.u		r23 = r21, rshift
+	;;
+	setf.sig	f11 = r31
+  (p8)	br.cond.dptk	.Lx3			C branch for n = 3
+	;;
+	ld8		r21 = [up], 8
+	br		.Lent
+
+.Ltop:	ld8		r21 = [up], 8
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	nop.b		0
+	;;
+.Lent:	add		r16 = 160, up
+	shl		r22 = r21, lshift
+	nop.b		0
+	;;
+	stf8		[rp] = f12, 8
+	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
+	nop.b		0
+	nop.m		0
+	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
+	nop.b		0
+	;;
+	or		r31 = r22, r23
+	shr.u		r23 = r21, rshift
+	nop.b		0
+	;;
+	lfetch		[r16]
+	setf.sig	f11 = r31
+	br.cloop.sptk.few.clr .Ltop
+
+
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	;;
+.Lx3:	stf8		[rp] = f12, 8
+	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
+	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
+	;;
+	setf.sig	f11 = r23
+	;;
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	;;
+.Lx2:	stf8		[rp] = f12, 8
+	xma.hu		f9 = f12, f6, f9	C c = high(q * divisor + c)
+	xmpy.l		f10 = f11, f7		C si = ulimb * inverse
+	;;
+	xma.l		f12 = f9, f8, f10	C q = c * -inverse + si
+	;;
+.Lx1:	stf8		[rp] = f12, 8
+	mov		ar.lc = r2		C I0
+	br.ret.sptk.many b0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/divrem_1.asm b/third_party/gmp/mpn/ia64/divrem_1.asm
new file mode 100644
index 0000000..e887820
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/divrem_1.asm
@@ -0,0 +1,477 @@
+dnl  IA-64 mpn_divrem_1 and mpn_preinv_divrem_1 -- Divide an mpn number by an
+dnl  unnormalized limb.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2002, 2004, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C Itanium:    40-42
+C Itanium 2:  29-30
+
+C This was generated by gcc, then the loops were optimized.  The preinv entry
+C point was shoehorned into the file.  Lots of things outside the loops could
+C be streamlined.  It would probably be a good idea to merge the loops for
+C normalized and unnormalized divisor, since the shifting stuff is done for
+C free in parallel with other operations.  It would even be possible to merge
+C all loops, if the ld8 were made conditional.
+
+C TODO
+C  * Consider delaying inversion for normalized mpn_divrem_1 entry till after
+C    computing leading limb.
+C  * Inline and interleave limb inversion code with loop setup code.
+
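+C The per-limb work in the quotient loops below is essentially this (a C
+C sketch, not part of the build; B = 2^64, r is the running remainder
+C with r < d, d is normalized at this point, and
+C dinv = mpn_invert_limb(d) = floor((B*B - 1)/d) - B):
+C
+C   q = highpart (r * dinv) + r;   /* estimate, never above the true q */
+C   {p1,p0} = q * d;               /* two-limb product                 */
+C   {r1,r0} = {r,n0} - {p1,p0};    /* two-limb remainder               */
+C   while (r1 != 0 || r0 >= d)     /* at most a few rounds             */
+C     { q++; {r1,r0} -= d; }
+C   *--qp = q; r = r0;
+C
+C The unnormalized loop does the same on numerator limbs pre-shifted
+C left by cnt.
+C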
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+	.global	mpn_invert_limb
+	.type	mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C rp    = r32
+C qxn   = r33
+C up    = r34
+C n     = r35
+C vl    = r36
+C vlinv = r37  (preinv only)
+C cnt = r38    (preinv only)
+
+PROLOGUE(mpn_preinv_divrem_1)
+	.prologue
+	.save	ar.pfs, r42
+	alloc		r42 = ar.pfs, 7, 8, 1, 0
+	.save	ar.lc, r44
+	mov		r44 = ar.lc
+	.save	rp, r41
+	mov		r41 = b0
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	sxt4		r33 = r33
+	addp4		r34 = 0, r34
+	sxt4		r35 = r35
+	;;
+')
+	mov		r40 = r38
+	shladd		r34 = r35, 3, r34
+	;;
+	adds		r34 = -8, r34
+	;;
+	ld8		r39 = [r34], -8
+	;;
+
+	add		r15 = r35, r33
+	;;
+	mov		r8 = r37
+	shladd		r32 = r15, 3, r32	C r32 = rp + n + qxn
+	cmp.le		p8, p0 = 0, r36
+	;;
+	adds		r32 = -8, r32		C r32 = rp + n + qxn - 1
+	cmp.leu		p6, p7 = r36, r39
+   (p8)	br.cond.dpnt	.Lpunnorm
+	;;
+
+   (p6)	addl		r15 = 1, r0
+   (p7)	mov		r15 = r0
+	;;
+   (p6)	sub		r38 = r39, r36
+   (p7)	mov		r38 = r39
+	st8		[r32] = r15, -8
+	adds		r35 = -2, r35		C un -= 2
+	br	.Lpn
+
+.Lpunnorm:
+   (p6)	add		r34 = 8, r34
+	mov		r38 = 0			C r = 0
+	shl		r36 = r36, r40
+   (p6)	br.cond.dptk	.Lpu
+	;;
+	shl		r38 = r39, r40		C r = ahigh << cnt
+	cmp.ne		p8, p0 = 1, r35
+	st8		[r32] = r0, -8
+	adds		r35 = -1, r35		C un--
+   (p8)	br.cond.dpnt	.Lpu
+
+	mov		r23 = 1
+	;;
+	setf.sig	f6 = r8
+	setf.sig	f12 = r23
+	br		.L435
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+	.prologue
+	.save	ar.pfs, r42
+	alloc		r42 = ar.pfs, 5, 8, 1, 0
+	.save	ar.lc, r44
+	mov		r44 = ar.lc
+	.save	rp, r41
+	mov		r41 = b0
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32
+	sxt4		r33 = r33
+	addp4		r34 = 0, r34
+	sxt4		r35 = r35
+	;;
+')
+	mov		r38 = r0
+	add		r15 = r35, r33
+	;;
+	cmp.ne		p6, p7 = 0, r15
+	;;
+   (p7)	mov		r8 = r0
+   (p7)	br.cond.dpnt	.Lret
+	shladd		r14 = r15, 3, r32	C r14 = rp + n + qxn
+	cmp.le		p6, p7 = 0, r36
+	;;
+	adds		r32 = -8, r14		C r32 = rp + n + qxn - 1
+   (p6)	br.cond.dpnt	.Lunnorm
+	cmp.eq		p6, p7 = 0, r35
+   (p6)	br.cond.dpnt	.L179
+	shladd		r14 = r35, 3, r34
+	;;
+	adds		r14 = -8, r14
+	adds		r35 = -1, r35
+	;;
+	ld8		r38 = [r14]
+	;;
+	cmp.leu		p6, p7 = r36, r38
+	;;
+   (p6)	addl		r15 = 1, r0
+   (p7)	mov		r15 = r0
+	;;
+	st8		[r32] = r15, -8
+  (p6)	sub		r38 = r38, r36
+
+.L179:
+	mov		r45 = r36
+	adds		r35 = -1, r35
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+	shladd		r34 = r35, 3, r34
+.Lpn:
+	mov		r23 = 1
+	;;
+	setf.sig	f6 = r8
+	setf.sig	f12 = r23
+	cmp.le		p6, p7 = 0, r35
+	mov		r40 = 0
+   (p7)	br.cond.dpnt	.L435
+	setf.sig	f10 = r36
+	mov		ar.lc = r35
+	setf.sig	f7 = r38
+	;;
+	sub		r28 = -1, r36
+C Develop quotient limbs for normalized divisor
+.Loop1:		C 00				C q=r18 nh=r38/f7
+	ld8		r20 = [r34], -8
+	xma.hu		f11 = f7, f6, f0
+	;;	C 04
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;	C 08
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	xma.l		f8 = f8, f10, f0
+	;;	C 12
+	getf.sig	r16 = f9
+		C 13
+	getf.sig	r15 = f8
+	;;	C 18
+	cmp.ltu		p6, p7 = r20, r15
+	sub		r15 = r20, r15
+	sub		r16 = r38, r16
+	;;	C 19
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;	C 20
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;	C 21
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	cmp.ltu		p6, p7 = r15, r36	C speculative
+	sub		r28 = r15, r36		C speculative, just for cmp
+	;;	C 22
+   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
+   (p8)	mov		r15 = r28
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;	C 23
+   (p6)	setf.sig	f7 = r15
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;	C 24
+   (p7)	setf.sig	f7 = r15
+	st8		[r32] = r18, -8
+	mov		r38 = r15
+	br.cloop.dptk	.Loop1
+		C 29/30
+	br.sptk		.L435
+	;;
+.Lunnorm:
+	mux1		r16 = r36, @rev
+	cmp.eq		p6, p7 = 0, r35
+   (p6)	br.cond.dpnt	.L322
+	shladd		r34 = r35, 3, r34
+	;;
+	adds		r34 = -8, r34
+	;;
+	ld8		r39 = [r34]
+	;;
+	cmp.leu		p6, p7 = r36, r39
+   (p6)	br.cond.dptk	.L322
+	adds		r34 = -8, r34
+	;;
+	mov		r38 = r39
+	;;
+	cmp.ne		p6, p7 = 1, r15
+	st8		[r32] = r0, -8
+	;;
+   (p7)	mov		r8 = r38
+   (p7)	br.cond.dpnt	.Lret
+	adds		r35 = -1, r35
+.L322:
+	sub		r14 = r0, r16
+	;;
+	or		r14 = r16, r14
+	;;
+	mov		r16 = -8
+	czx1.l		r14 = r14
+	;;
+	shladd		r16 = r14, 3, r16
+	;;
+	shr.u		r14 = r36, r16
+	;;
+	cmp.geu		p6, p7 = 15, r14
+	;;
+   (p7)	shr.u		r14 = r14, 4
+   (p7)	adds		r16 = 4, r16
+	;;
+	cmp.geu		p6, p7 = 3, r14
+	;;
+   (p7)	shr.u		r14 = r14, 2
+   (p7)	adds		r16 = 2, r16
+	;;
+	tbit.nz		p6, p7 = r14, 1
+	;;
+	.pred.rel "mutex",p6,p7
+  (p6)	sub		r40 = 62, r16
+  (p7)	sub		r40 = 63, r16
+	;;
+	shl		r45 = r36, r40
+	shl		r36 = r36, r40
+	shl		r38 = r38, r40
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+.Lpu:
+	mov		r23 = 1
+	;;
+	setf.sig	f6 = r8
+	setf.sig	f12 = r23
+	cmp.eq		p6, p7 = 0, r35
+   (p6)	br.cond.dpnt	.L435
+	sub		r16 = 64, r40
+	adds		r35 = -2, r35
+	;;
+	ld8		r39 = [r34], -8
+	cmp.le		p6, p7 = 0, r35
+	;;
+	shr.u		r14 = r39, r16
+	;;
+	or		r38 = r14, r38
+   (p7)	br.cond.dpnt	.Lend3
+	;;
+	mov		r22 = r16
+	setf.sig	f10 = r36
+	setf.sig	f7 = r38
+	mov		ar.lc = r35
+	;;
+C Develop quotient limbs for unnormalized divisor
+.Loop3:
+	ld8		r14 = [r34], -8
+	xma.hu		f11 = f7, f6, f0
+	;;
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	shl		r20 = r39, r40
+	xma.l		f8 = f8, f10, f0
+	shr.u		r24 = r14, r22
+	;;
+	getf.sig	r16 = f9
+	getf.sig	r15 = f8
+	or		r20 = r24, r20
+	;;
+	cmp.ltu		p6, p7 = r20, r15
+	sub		r15 = r20, r15
+	sub		r16 = r38, r16
+	;;
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	cmp.ltu		p6, p7 = r15, r36	C speculative
+	sub		r28 = r15, r36		C speculative, just for cmp
+	;;
+   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
+   (p8)	mov		r15 = r28
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p6)	setf.sig	f7 = r15
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p7)	setf.sig	f7 = r15
+	st8		[r32] = r18, -8
+	mov		r39 = r14
+	mov		r38 = r15
+	br.cloop.dptk	.Loop3
+	;;
+.Lend3:
+	setf.sig	f10 = r36
+	setf.sig	f7 = r38
+	;;
+	xma.hu		f11 = f7, f6, f0
+	;;
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	shl		r20 = r39, r40
+	xma.l		f8 = f8, f10, f0
+	;;
+	getf.sig	r16 = f9
+	getf.sig	r15 = f8
+	;;
+	cmp.ltu		p6, p7 = r20, r15
+	sub		r15 = r20, r15
+	sub		r16 = r38, r16
+	;;
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	;;
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	cmp.ltu		p6, p7 = r15, r36
+	;;
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	st8		[r32] = r18, -8
+	mov		r38 = r15
+.L435:
+	adds		r35 = -1, r33
+	cmp.le		p6, p7 = 1, r33
+   (p7)	br.cond.dpnt	.Lend4
+	;;
+	setf.sig	f7 = r38
+	setf.sig	f10 = r36
+	mov		ar.lc = r35
+	;;
+.Loop4:
+	xma.hu		f11 = f7, f6, f0
+	;;
+	xma.l		f8 = f11, f12, f7	C q = q + nh
+	;;
+	getf.sig	r18 = f8
+	xma.hu		f9 = f8, f10, f0
+	xma.l		f8 = f8, f10, f0
+	;;
+	getf.sig	r16 = f9
+	getf.sig	r15 = f8
+	;;
+	cmp.ltu		p6, p7 = 0, r15
+	sub		r15 = 0, r15
+	sub		r16 = r38, r16
+	;;
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0?
+   (p6)	add		r16 = -1, r16
+   (p0)	cmp.ne.unc	p6, p7 = r0, r0
+	;;
+   (p8)	cmp.ltu		p6, p7 = r15, r36
+   (p8)	sub		r15 = r15, r36
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.ne		p8, p9 = 1, r16		C is rH != 0 still?
+   (p7)	cmp.ne		p8, p9 = 0, r16		C is rH != 0 still?
+	cmp.ltu		p6, p7 = r15, r36	C speculative
+	sub		r28 = r15, r36		C speculative, just for cmp
+	;;
+   (p8)	cmp.ltu		p6, p7 = r28, r36	C redo last cmp if needed
+   (p8)	mov		r15 = r28
+   (p8)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p6)	setf.sig	f7 = r15
+   (p7)	sub		r15 = r15, r36
+   (p7)	add		r18 = 1, r18		C q = q + 1;	done if: rH > 0
+	;;
+   (p7)	setf.sig	f7 = r15
+	st8		[r32] = r18, -8
+	mov		r38 = r15
+	br.cloop.dptk	.Loop4
+	;;
+.Lend4:
+	shr.u		r8 = r38, r40
+.Lret:
+	mov		ar.pfs = r42
+	mov		ar.lc = r44
+	mov		b0 = r41
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/divrem_2.asm b/third_party/gmp/mpn/ia64/divrem_2.asm
new file mode 100644
index 0000000..9864311
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/divrem_2.asm
@@ -0,0 +1,280 @@
+dnl  IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               norm   frac
+C Itanium:       ?      ?
+C Itanium 2:    29     29
+
+
+C TODO
+C  * Inline and interleave limb inversion code with loop setup code.
+C  * We should use explicit bundling in much of the code, since it typically
+C    cuts some cycles with the GNU assembler.
+
+
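+C The loop below produces one quotient limb per iteration with the 3-by-2
+C division step of Moller and Granlund, "Improved division by invariant
+C integers".  The compare/subtract chain after the mpn_invert_limb call
+C apparently refines di from a 2/1 inverse of d1 into the 3/2 inverse
+C that step needs.  Roughly, as a C sketch (not part of the build; dlimb
+C is an unsigned 128-bit type, d = d1*B + d0 is normalized, {n2,n1} < d,
+C di = floor((B*B*B - 1)/d) - B):
+C
+C   dlimb q = (dlimb) n2 * di + (((dlimb) n2 << 64) | n1);
+C   limb q1 = q >> 64, q0 = q;
+C   dlimb r = (((dlimb) n1 << 64) | n0) - (dlimb) q1 * d - d; /* mod B^2 */
+C   q1++;
+C   if ((limb) (r >> 64) >= q0)    /* common adjustment                 */
+C     { q1--; r += d; }
+C   if (r >= d)                    /* rare; the L(fix) path below       */
+C     { q1++; r -= d; }
+C   /* q1 is the quotient limb; r is the new {n2,n1} */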
+ASM_START()
+
+C HP's assembler requires these declarations for importing mpn_invert_limb
+	.global	mpn_invert_limb
+	.type	mpn_invert_limb,@function
+
+C INPUT PARAMETERS
+C qp   = r32
+C fn   = r33
+C np   = r34
+C nn   = r35
+C dp   = r36
+
+define(`f0x1', `f15')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4		r32 = 0, r32		C M I
+	addp4		r34 = 0, r34		C M I
+	zxt4		r35 = r35		C I
+	addp4		r36 = 0, r36		C M I
+	nop.m		0
+	zxt4		r33 = r33		C I
+	;;
+')
+	.save ar.pfs, r42
+	alloc	 r42 = ar.pfs, 5, 9, 1, 0
+	shladd	 r34 = r35, 3, r34
+	adds	 r14 = 8, r36
+	mov	 r43 = r1
+	;;
+	adds	 r15 = -8, r34
+	ld8	 r39 = [r14]
+	.save ar.lc, r45
+	mov	 r45 = ar.lc
+	adds	 r14 = -16, r34
+	mov	 r40 = r0
+	adds	 r34 = -24, r34
+	;;
+	ld8	 r38 = [r15]
+	.save rp, r41
+	mov	 r41 = b0
+	.body
+	ld8	 r36 = [r36]
+	ld8	 r37 = [r14]
+	;;
+	cmp.gtu	 p6, p7 = r39, r38
+  (p6)	br.cond.dptk .L8
+	;;
+	cmp.leu	 p8, p9 = r36, r37
+	cmp.geu	 p6, p7 = r39, r38
+	;;
+  (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
+  (p7)	br.cond.dptk .L51
+.L8:
+	add	 r14 = r33, r35		// un + fn
+	mov	 r46 = r39		// argument to mpn_invert_limb
+	;;
+	adds	 r35 = -3, r14
+	;;
+	cmp.gt	 p12, p0 = r0, r35
+  (p12)	br.cond.dpnt L(end)
+	br.call.sptk.many b0 = mpn_invert_limb
+	;;
+	setf.sig f11 = r8		// di (non-final)
+	setf.sig f34 = r39		// d1
+	setf.sig f33 = r36		// d0
+	mov	 r1 = r43
+	;;
+	mov	 r17 = 1
+	setf.sig f9 = r38		// n2
+	xma.l	 f6 = f11, f34, f0	// t0 = LO(di * d1)
+	;;
+	setf.sig f10 = r37		// n1
+	setf.sig f15 = r17		// 1
+	xma.hu	 f8 = f11, f33, f0	// s0 = HI(di * d0)
+	;;
+	getf.sig r17 = f6
+	getf.sig r16 = f8
+	mov	 ar.lc = r35
+	;;
+	sub	 r18 = r0, r39		// -d1
+	add	 r14 = r17, r36
+	;;
+	setf.sig f14 = r18		// -d1
+	cmp.leu	 p8, p9 = r17, r14
+	add	 r16 = r14, r16
+	;;
+  (p9)	adds	 r19 = 0, r0
+  (p8)	adds	 r19 = -1, r0
+	cmp.gtu	 p6, p7 = r14, r16
+	;;
+  (p6)	adds	 r19 = 1, r19
+	;;
+ifelse(1,1,`
+	cmp.gt	 p7, p6 = r0, r19
+	;;
+  (p6)	adds	 r8 = -1, r8		// di--
+  (p6)	sub	 r14 = r16, r39		// t0 -= d1
+  (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
+	;;
+  (p6)	cmp.gt	 p9, p8 = 1, r19
+  (p7)	cmp.gt	 p9, p8 = 0, r19
+  (p6)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+	;;
+  (p8)	adds	 r8 = -1, r8		// di--
+  (p8)	sub	 r14 = r16, r39		// t0 -= d1
+  (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
+	;;
+  (p8)	cmp.gt	 p7, p6 = 1, r19
+  (p9)	cmp.gt	 p7, p6 = 0, r19
+  (p8)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+	;;
+  (p6)	adds	 r8 = -1, r8		// di--
+  (p6)	sub	 r14 = r16, r39		// t0 -= d1
+  (p6)	cmp.ltu	 p6, p7 = r16, r39	// cy for: t0 - d1
+	;;
+  (p6)	cmp.gt	 p9, p8 = 1, r19
+  (p7)	cmp.gt	 p9, p8 = 0, r19
+  (p6)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+	;;
+  (p8)	adds	 r8 = -1, r8		// di--
+  (p8)	sub	 r14 = r16, r39		// t0 -= d1
+  (p8)	cmp.ltu	 p8, p9 = r16, r39	// cy for: t0 - d1
+	;;
+  (p8)	adds	 r19 = -1, r19		// t1 -= cy
+	mov	 r16 = r14
+',`
+	cmp.gt	 p8, p9 = r0, r19
+  (p8)	br.cond.dpnt .L46
+.L52:
+	cmp.leu	 p6, p7 = r39, r16
+	sub	 r14 = r16, r39
+	adds	 r8 = -1, r8
+	;;
+  (p7)	adds	 r19 = -1, r19
+	mov	 r16 = r14
+	;;
+  (p7)	cmp.gt	 p8, p9 = r0, r19
+  (p9)	br.cond.dptk .L52
+.L46:
+')
+	setf.sig f32 = r8		// di
+	shladd	 r32 = r35, 3, r32
+	;;
+
+	ALIGN(16)
+L(top):	nop 0
+	nop 0
+	cmp.gt	 p8, p9 = r33, r35
+	;;
+ (p8)	mov	 r37 = r0
+ (p9)	ld8	 r37 = [r34], -8
+	xma.hu	 f8 = f9, f32, f10	//				0,29
+	xma.l	 f12 = f9, f32, f10	//				0
+	;;
+	getf.sig r20 = f12		// q0				4
+	xma.l	 f13 = f15, f8, f9	// q += n2			4
+	sub	 r8 = -1, r36		// bitnot d0
+	;;
+	getf.sig r18 = f13		//				8
+	xma.l	 f7 = f14, f13, f10	//				8
+	xma.l	 f6 = f33, f13, f33	// t0 = LO(d0*q+d0)		8
+	xma.hu	 f9 = f33, f13, f33	// t1 = HI(d0*q+d0)		9
+	;;
+	getf.sig r38 = f7		// n1				12
+	getf.sig r16 = f6		//				13
+	getf.sig r19 = f9		//				14
+	;;
+	sub	 r38 = r38, r39		// n1 -= d1			17
+	;;
+	cmp.ne	 p9, p0 = r0, r0	// clear p9
+	cmp.leu	 p10, p11 = r16, r37	// cy for: n0 - t0		18
+	;;
+	sub	 r37 = r37, r16		// n0 -= t0			19
+  (p11)	sub	 r38 = r38, r19, 1	// n1 -= t1 - cy		19
+  (p10)	sub	 r38 = r38, r19		// n1 -= t1			19
+	;;
+	cmp.gtu	 p6, p7 = r20, r38	// n1 >= q0			20
+	;;
+  (p7)	cmp.ltu	 p9, p0 = r8, r37	//				21
+  (p6)	add	 r18 = 1, r18		//
+  (p7)	add	 r37 = r37, r36		//				21
+  (p7)	add	 r38 = r38, r39		//				21
+	;;
+	setf.sig f10 = r37		// n1				22
+  (p9)	add	 r38 = 1, r38		//				22
+	;;
+	setf.sig f9 = r38		// n2				23
+	cmp.gtu	 p6, p7 = r39, r38	//				23
+  (p7)	br.cond.spnt L(fix)
+L(bck):	st8	 [r32] = r18, -8
+	adds	 r35 = -1, r35
+	br.cloop.sptk.few L(top)
+	;;
+
+L(end):	add	r14 = 8, r34
+	add	r15 = 16, r34
+	mov	 b0 = r41
+	;;
+	st8	[r14] = r37
+	st8	[r15] = r38
+	mov	 ar.pfs = r42
+	mov	 r8 = r40
+	mov	 ar.lc = r45
+	br.ret.sptk.many b0
+	;;
+.L51:
+	.pred.rel "mutex", p8, p9
+	sub	 r37 = r37, r36
+  (p9)	sub	 r38 = r38, r39, 1
+  (p8)	sub	 r38 = r38, r39
+	adds	 r40 = 1, r0
+	br .L8
+	;;
+
+L(fix):	cmp.geu	 p6, p7 = r39, r38
+	cmp.leu	 p8, p9 = r36, r37
+	;;
+  (p8)	cmp4.ne.and.orcm p6, p7 = 0, r0
+  (p6)	br.cond.dptk L(bck)
+	sub	 r37 = r37, r36
+  (p9)	sub	 r38 = r38, r39, 1
+  (p8)	sub	 r38 = r38, r39
+	adds	 r18 = 1, r18
+	;;
+	setf.sig f9 = r38		// n2
+	setf.sig f10 = r37		// n1
+	br	 L(bck)
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/gcd_11.asm b/third_party/gmp/mpn/ia64/gcd_11.asm
new file mode 100644
index 0000000..6137227
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/gcd_11.asm
@@ -0,0 +1,110 @@
+dnl  Itanium-2 mpn_gcd_11
+
+dnl  Copyright 2002-2005, 2012, 2013, 2015, 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C           cycles/bitpair (1x1 gcd)
+C Itanium:       ?
+C Itanium 2:     4.5
+
+
+ASM_START()
+
+C ctz_table[n] is one less than the number of trailing zeros on n (the
+C main loop always gets one shift for free), or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+	.rodata
+	ALIGN(m4_lshift(1,MAXSHIFT))	C align table to allow using dep
+ctz_table:
+	data1	MAXSHIFT
+forloop(i,1,MASK,
+`	data1	m4_count_trailing_zeros(i)-1
+')
+
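+C The loop below is a right-shift binary gcd on two odd operands (a C
+C sketch, not part of the build):
+C
+C   while (x != y)
+C     {
+C       if (x > y)
+C         x = x - y;
+C       else
+C         { t = y - x; y = x; x = t; }
+C       x >>= ctz (x);   /* via ctz_table, or the popcnt path when the
+C                           low MAXSHIFT bits are all zero */
+C     }
+C   return y;
+C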
+define(`x0', r32)
+define(`y0', r33)
+
+PROLOGUE(mpn_gcd_11)
+	.prologue
+	.body
+		addl	r22 = @ltoff(ctz_table), r1
+	;;
+		ld8	r22 = [r22]
+		br	L(ent)
+	;;
+
+	ALIGN(32)
+L(top):
+	.pred.rel "mutex", p6,p7
+ {.mmi;	(p7)	mov	y0 = x0
+	(p6)	sub	x0 = x0, y0
+		dep	r21 = r19, r22, 0, MAXSHIFT	C concat(table,lowbits)
+}{.mmi;		and	r20 = MASK, r19
+	(p7)	mov	x0 = r19
+		and	r23 = 6, r19
+	;;
+}{.mmi;		cmp.eq	p6,p0 = 4, r23
+		cmp.eq	p7,p0 = 0, r23
+		shr.u	x0 = x0, 1		C shift-by-1, always OK
+}{.mmb;		ld1	r16 = [r21]
+		cmp.eq	p10,p0 = 0, r20
+	(p10)	br.spnt.few.clr	 L(count_better)
+	;;
+}
+L(bck):
+	.pred.rel "mutex", p6,p7
+ {.mii;		nop	0
+	(p6)	shr.u	x0 = x0, 1		C u was ...100 before shift-by-1 above
+	(p7)	shr.u	x0 = x0, r16		C u was ...000 before shift-by-1 above
+	;;
+}
+L(ent):
+ {.mmi;		sub	r19 = y0, x0
+		cmp.gtu	p6,p7 = x0, y0
+		cmp.ne	p8,p0 = x0, y0
+}{.mmb;		nop	0
+		nop	0
+	(p8)	br.sptk.few.clr L(top)
+}
+
+L(end):		mov	r8 = y0
+		br.ret.sptk.many b0
+
+L(count_better):
+		add	r20 = -1, x0
+	;;
+		andcm	r23 = r20, x0
+	;;
+		popcnt	r16 = r23
+		br	L(bck)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/gmp-mparam.h b/third_party/gmp/mpn/ia64/gmp-mparam.h
new file mode 100644
index 0000000..34d2bf3
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/gmp-mparam.h
@@ -0,0 +1,212 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 900MHz Itanium2 (olympic.gmplib.org) */
+/* FFT tuning limit = 59,194,709 */
+/* Generated by tuneup.c, 2019-10-13, gcc 4.2 */
+
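+/* These constants gate algorithm selection in mpn/generic, in the style
+   of (a sketch of the idea, not actual GMP code):
+
+     if (n < MUL_TOOM22_THRESHOLD)
+       mpn_mul_basecase (...);          // schoolbook
+     else if (n < MUL_TOOM33_THRESHOLD)
+       mpn_toom22_mul (...);            // Karatsuba
+     else
+       ...
+
+   In the FFT tables, an entry {size,k} means: from that operand size (in
+   limbs) upwards, use an FFT split into 2^k pieces.  */
+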
+#define MOD_1_1P_METHOD                      2  /* 17.40% faster than 1 */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 1.35% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              10
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define DIV_1_VS_MUL_1_PERCENT             316
+
+#define MUL_TOOM22_THRESHOLD                47
+#define MUL_TOOM33_THRESHOLD                89
+#define MUL_TOOM44_THRESHOLD               220
+#define MUL_TOOM6H_THRESHOLD               327
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     143
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     153
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     226
+
+#define SQR_BASECASE_THRESHOLD              11
+#define SQR_TOOM2_THRESHOLD                 98
+#define SQR_TOOM3_THRESHOLD                135
+#define SQR_TOOM4_THRESHOLD                272
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                490
+
+#define MULMID_TOOM42_THRESHOLD             99
+
+#define MULMOD_BNM1_THRESHOLD               23
+#define SQRMOD_BNM1_THRESHOLD               27
+
+#define MUL_FFT_MODF_THRESHOLD             840  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    840, 5}, {     30, 6}, {     16, 5}, {     33, 6}, \
+    {     17, 5}, {     36, 6}, {     35, 7}, {     18, 6}, \
+    {     37, 7}, {     19, 6}, {     42, 7}, {     37, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     27, 8}, \
+    {     57, 9}, {     31, 8}, {     63, 9}, {     35, 8}, \
+    {     71, 9}, {     43,10}, {     23, 9}, {     55,10}, \
+    {     31, 9}, {     71,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     99,10}, {     55,11}, {     31,10}, \
+    {     87,11}, {     47,10}, {    111,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    167,11}, \
+    {     95,10}, {    191,11}, {    111,12}, {     63,11}, \
+    {    143,10}, {    287,11}, {    159,12}, {     95,11}, \
+    {    207,13}, {     63,12}, {    127,11}, {    271,12}, \
+    {    159,11}, {    335,10}, {    671,12}, {    191,10}, \
+    {    799,12}, {    223,13}, {    127,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    671,13}, {    191,12}, \
+    {    383,11}, {    799,10}, {   1599,12}, {    415,11}, \
+    {    863,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1119,12}, {    607,13}, {    319,12}, {    735,11}, \
+    {   1471,12}, {    863,13}, {    447,12}, {    927,11}, \
+    {   1855,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1055,11}, {   2111,12}, {   1119,13}, {    575,12}, \
+    {   1247,13}, {    639,12}, {   1311,13}, {    703,12}, \
+    {   1471,13}, {    831,12}, {   1727,13}, {    895,12}, \
+    {   1791,13}, {    959,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2239,13}, {   1215,14}, {    639,13}, \
+    {   1471,14}, {    767,13}, {   1727,14}, {    895,13}, \
+    {   1855,12}, {   3711,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2111,12}, {   4223,13}, {   2175,14}, \
+    {   1151,13}, {   2495,14}, {   1279,13}, {   2623,14}, \
+    {   1407,15}, {    767,14}, {   1663,13}, {   3455,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4479,14}, {   2431,15}, {   1279,14}, {   2943,15}, \
+    {   1535,14}, {   3455,15}, {   1791,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 149
+#define MUL_FFT_THRESHOLD                 8576
+
+#define SQR_FFT_MODF_THRESHOLD             765  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    765, 5}, {     36, 6}, {     37, 7}, {     19, 6}, \
+    {     42, 7}, {     43, 8}, {     37, 9}, {     19, 8}, \
+    {     43, 9}, {     23, 8}, {     49, 9}, {     27, 8}, \
+    {     57, 9}, {     43,10}, {     23, 9}, {     55,10}, \
+    {     31, 9}, {     71,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     99,10}, {     55,11}, {     31,10}, \
+    {     87,11}, {     47,10}, {    111,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    175,11}, \
+    {     95,10}, {    199,11}, {    111,12}, {     63,11}, \
+    {    159,12}, {     95,11}, {    191,10}, {    399,11}, \
+    {    207,13}, {     63,12}, {    127,10}, {    511, 9}, \
+    {   1023,10}, {    527,11}, {    271,12}, {    159,10}, \
+    {    703,12}, {    191,11}, {    399,10}, {    799,11}, \
+    {    431,12}, {    223,13}, {    127,12}, {    255,11}, \
+    {    527,10}, {   1055,11}, {    559,12}, {    287,11}, \
+    {    607,10}, {   1215,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    799,12}, {    415,11}, {    863,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1055,12}, {    543,11}, {   1119,12}, {    607,11}, \
+    {   1215,12}, {    735,13}, {    383,12}, {    799,11}, \
+    {   1599,12}, {    863,13}, {    447,12}, {    991,14}, \
+    {    255,13}, {    511,12}, {   1055,11}, {   2111,12}, \
+    {   1119,13}, {    575,12}, {   1215,13}, {    639,12}, \
+    {   1311,13}, {    703,12}, {   1407,14}, {    383,13}, \
+    {    767,12}, {   1599,13}, {    831,12}, {   1727,13}, \
+    {    895,12}, {   1791,13}, {    959,12}, {   1919,15}, \
+    {    255,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2239,13}, {   1151,12}, {   2303,13}, \
+    {   1215,14}, {    639,13}, {   1279,12}, {   2559,13}, \
+    {   1471,14}, {    767,13}, {   1727,14}, {    895,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2239,14}, \
+    {   1151,13}, {   2495,14}, {   1279,13}, {   2623,14}, \
+    {   1407,15}, {    767,14}, {   1663,13}, {   3455,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4479,14}, {   2431,15}, {   1279,14}, {   2943,15}, \
+    {   1535,14}, {   3455,15}, {   1791,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 153
+#define SQR_FFT_THRESHOLD                 6272
+
+#define MULLO_BASECASE_THRESHOLD            39
+#define MULLO_DC_THRESHOLD                   0  /* never mpn_mullo_basecase */
+#define MULLO_MUL_N_THRESHOLD            17050
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 134
+#define SQRLO_SQR_THRESHOLD              12322
+
+#define DC_DIV_QR_THRESHOLD                 73
+#define DC_DIVAPPR_Q_THRESHOLD             262
+#define DC_BDIV_QR_THRESHOLD               111
+#define DC_BDIV_Q_THRESHOLD                315
+
+#define INV_MULMOD_BNM1_THRESHOLD           92
+#define INV_NEWTON_THRESHOLD                15
+#define INV_APPR_THRESHOLD                  17
+
+#define BINV_NEWTON_THRESHOLD              280
+#define REDC_1_TO_REDC_2_THRESHOLD           0  /* always */
+#define REDC_2_TO_REDC_N_THRESHOLD         172
+
+#define MU_DIV_QR_THRESHOLD               1470
+#define MU_DIVAPPR_Q_THRESHOLD            1210
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD              1566
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define POWM_SEC_TABLE  3,22,139,1867
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        42
+#define SET_STR_DC_THRESHOLD              1339
+#define SET_STR_PRECOMPUTE_THRESHOLD      3934
+
+#define FAC_DSC_THRESHOLD                  866
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    3  /* 13.73% faster than 1 */
+#define HGCD_THRESHOLD                     129
+#define HGCD_APPR_THRESHOLD                202
+#define HGCD_REDUCE_THRESHOLD             4455
+#define GCD_DC_THRESHOLD                   658
+#define GCDEXT_DC_THRESHOLD                469
+#define JACOBI_BASE_METHOD                   2  /* 0.62% faster than 4 */
+
+/* Tuneup completed successfully, took 199042 seconds */
diff --git a/third_party/gmp/mpn/ia64/hamdist.asm b/third_party/gmp/mpn/ia64/hamdist.asm
new file mode 100644
index 0000000..477df4c
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/hamdist.asm
@@ -0,0 +1,365 @@
+dnl  IA-64 mpn_hamdist -- mpn Hamming distance.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       2
+C Itanium 2:     1
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`vp', `r33')
+define(`n', `r34')
+
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
+define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
+define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
+define(`s',`r8')
+
+
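+C The unrolled loop below computes, in effect (a C sketch, not part of
+C the build):
+C
+C   s = 0;
+C   for (i = 0; i < n; i++)
+C     s += popcount (up[i] ^ vp[i]);
+C   return s;
+C
+C The 4-way unrolling keeps the I0-unit popcnt issuing every cycle while
+C the xor, add and load work fills the M and I slots.
+C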
+ASM_START()
+PROLOGUE(mpn_hamdist)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4		up = 0, up		C			M I
+	addp4		vp = 0, vp		C			M I
+	zxt4		n = n			C			I
+	;;
+')
+
+ {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
+	ld8		r11 = [vp], 8		C load first vlimb	M01
+	mov.i		r2 = ar.lc		C save ar.lc		I0
+}{.mmi;	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p0 = 4, n		C small count?		M I
+	add		n = -5, n		C			M I
+	;;
+}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk		.Lb01			C			B
+  (p7)	br.dptk		.Lb10			C			B
+  (p8)	br.dptk		.Lb11			C			B
+}
+
+
+.Lb00:	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	xor		x0 = r10, r11		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x2 = u2, v2		C			M I
+	mov		s = 0			C			M I
+  (p15)	br.cond.dptk	.grt4			C			B
+	;;
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c1 = x1			C			I0
+	;;
+	popcnt		c2 = x2			C			I0
+	br		.Lcj4			C			B
+
+.grt4:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	xor		x2 = u2, v2		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dpnt	.grt8			C			B
+
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	br		.Lcj8			C			B
+
+.grt8:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	br		.LL00			C			B
+
+
+.Lb01:	xor		x3 = r10, r11		C			M I
+	shr.u		n = n, 2		C			I0
+  (p15)	br.cond.dptk	.grt1			C			B
+	;;
+	popcnt		r8 = x3			C			I0
+	br.ret.sptk.many b0			C			B
+
+.grt1:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dpnt	.grt5			C			B
+
+	xor		x1 = u1, v1		C			M I
+	;;
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c1 = x1			C			I0
+	br		.Lcj5			C			B
+
+.grt5:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dpnt	.Loop			C			B
+	br		.Lend			C			B
+
+
+.Lb10:	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x2 = r10, r11		C			M I
+  (p15)	br.cond.dptk	.grt2			C			B
+	;;
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c2 = x2			C			I0
+	;;
+	popcnt		c3 = x3			C			I0
+	;;
+	add		s = c2, c3		C			M I
+	br.ret.sptk.many b0			C			B
+
+.grt2:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dptk	.grt6			C			B
+
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	br		.Lcj6			C			B
+
+.grt6:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	xor		x3 = u3, v3		C			M I
+	br		.LL10			C			B
+
+
+.Lb11:	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	xor		x1 = r10, r11		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	xor		x2 = u2, v2		C			M I
+  (p15)	br.cond.dptk	.grt3			C			B
+	;;
+	xor		x3 = u3, v3		C			M I
+	;;
+	popcnt		c1 = x1			C			I0
+	;;
+	popcnt		c2 = x2			C			I0
+	;;
+	popcnt		c3 = x3			C			I0
+	;;
+	add		s = c1, c2		C			M I
+	;;
+	add		s = s, c3		C			M I
+	br.ret.sptk.many b0			C			B
+
+.grt3:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	xor		x3 = u3, v3		C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dptk	.grt7			C			B
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	br		.Lcj7			C			B
+
+.grt7:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	xor		x1 = u1, v1		C			M I
+	;;
+	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	xor		x2 = u2, v2		C			M I
+	br		.LL11			C			B
+
+
+	ALIGN(32)
+.Loop:	ld8		u0 = [up], 8		C			M01
+	ld8		v0 = [vp], 8		C			M01
+	popcnt		c2 = x2			C			I0
+	add		s = s, c3		C			M I
+	xor		x1 = u1, v1		C			M I
+	nop.b		1			C			-
+	;;
+.LL00:	ld8		u1 = [up], 8		C			M01
+	ld8		v1 = [vp], 8		C			M01
+	popcnt		c3 = x3			C			I0
+	add		s = s, c0		C			M I
+	xor		x2 = u2, v2		C			M I
+	nop.b		1			C			-
+	;;
+.LL11:	ld8		u2 = [up], 8		C			M01
+	ld8		v2 = [vp], 8		C			M01
+	popcnt		c0 = x0			C			I0
+	add		s = s, c1		C			M I
+	xor		x3 = u3, v3		C			M I
+	nop.b		1			C			-
+	;;
+.LL10:	ld8		u3 = [up], 8		C			M01
+	ld8		v3 = [vp], 8		C			M01
+	popcnt		c1 = x1			C			I0
+	add		s = s, c2		C			M I
+	xor		x0 = u0, v0		C			M I
+	br.cloop.dptk	.Loop			C			B
+	;;
+
+.Lend:	popcnt		c2 = x2			C			I0
+	add		s = s, c3		C			M I
+	xor		x1 = u1, v1		C			M I
+	;;
+.Lcj8:	popcnt		c3 = x3			C			I0
+	add		s = s, c0		C			M I
+	xor		x2 = u2, v2		C			M I
+	;;
+.Lcj7:	popcnt		c0 = x0			C			I0
+	add		s = s, c1		C			M I
+	xor		x3 = u3, v3		C			M I
+	;;
+.Lcj6:	popcnt		c1 = x1			C			I0
+	add		s = s, c2		C			M I
+	;;
+.Lcj5:	popcnt		c2 = x2			C			I0
+	add		s = s, c3		C			M I
+	;;
+.Lcj4:	popcnt		c3 = x3			C			I0
+	add		s = s, c0		C			M I
+	;;
+	add		s = s, c1		C			M I
+	;;
+	add		s = s, c2		C			M I
+	;;
+	add		s = s, c3		C			M I
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/ia64-defs.m4 b/third_party/gmp/mpn/ia64/ia64-defs.m4
new file mode 100644
index 0000000..f71d280
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/ia64-defs.m4
@@ -0,0 +1,147 @@
+divert(-1)
+
+
+dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  ia64 assembler comments are C++ style "//" to the end of line.  gas
+dnl  also accepts "#" as a comment, if it's the first non-blank on a line.
+dnl
+dnl  BSD m4 can't handle a multi-character comment like "//" (see notes in
+dnl  mpn/asm-defs.m4).  For now the default "#" is left, but with care taken
+dnl  not to put any macros after "foo#" (since of course they won't expand).
+
+
+define(`ASM_START',
+m4_assert_numargs(0)
+`')
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  32-byte alignment is used for the benefit of itanium-2, where the
+dnl  code fetcher takes 2 bundles per cycle only from a 32-byte aligned
+dnl  target; at 16 mod 32 alignment it reads just 1 bundle in the first
+dnl  cycle.  This might not make any difference if the rotate buffers are
+dnl  full or there's other work holding up execution, but we use 32 bytes
+dnl  to give the best chance of peak throughput.
+dnl
+dnl  We can use .align here despite the gas bug noted in mpn/ia64/README,
+dnl  since we're not expecting to execute across a PROLOGUE(), at least not
+dnl  currently.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+	`
+	.text
+	.align	32
+	.global	$1#
+	.proc	$1#
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+	`
+	.endp	$1#
+')
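+
+dnl  Typical use, via the generic PROLOGUE/EPILOGUE wrappers from
+dnl  mpn/asm-defs.m4, as in the mpn files themselves:
+dnl
+dnl          PROLOGUE(mpn_foo)
+dnl                  ...function body...
+dnl          EPILOGUE()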
+
+define(`DATASTART',
+	`dnl
+	DATA
+$1:')
+define(`DATAEND',`dnl')
+
+define(`ASM_END',`dnl')
+
+
+dnl  Usage: ALIGN(bytes)
+dnl
+dnl  Emit a ".align" directive.  "bytes" is eval()ed, so can be an
+dnl  expression.
+dnl
+dnl  This version overrides the definition in mpn/asm-defs.m4.  We
+dnl  suppress any .align if the gas byte-swapped-nops bug was detected by
+dnl  the configure test GMP_ASM_IA64_ALIGN_OK.
+
+define(`ALIGN',
+m4_assert_numargs(1)
+m4_assert_defined(`IA64_ALIGN_OK')
+`ifelse(IA64_ALIGN_OK,no,,
+`.align	eval($1)')')
+
+
+dnl  Usage: ASSERT([pr] [,code])
+dnl
+dnl  Require that the given predicate register is true after executing the
+dnl  test code.  For example,
+dnl
+dnl         ASSERT(p6,
+dnl         `       cmp.eq  p6,p0 = r3, r4')
+dnl
+dnl  If the predicate register argument is empty then nothing is tested, the
+dnl  code is just executed.  This can be used for setups required by later
+dnl  ASSERTs.  The code argument can be omitted to just test a predicate
+dnl  with no special setup code.
+dnl
+dnl  For convenience, stops are inserted before and after the code emitted.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`	;;
+ifelse(`$2',,,
+`$2
+	;;
+')
+ifelse(`$1',,,
+`($1)	br	.LASSERTok`'ASSERT_label_counter ;;
+	cmp.ne	p6,p6 = r0, r0	C illegal instruction
+	;;
+.LASSERTok`'ASSERT_label_counter:
+define(`ASSERT_label_counter',eval(ASSERT_label_counter+1))
+')
+')')
+define(`ASSERT_label_counter',1)
+
+define(`getfsig', `getf.sig')
+define(`setfsig', `setf.sig')
+define(`cmpeq',   `cmp.eq')
+define(`cmpne',   `cmp.ne')
+define(`cmpltu',  `cmp.ltu')
+define(`cmpleu',  `cmp.leu')
+define(`cmpgtu',  `cmp.gtu')
+define(`cmpgeu',  `cmp.geu')
+define(`cmple',   `cmp.le')
+define(`cmpgt',   `cmp.gt')
+define(`cmpeqor', `cmp.eq.or')
+define(`cmpequc', `cmp.eq.unc')
+
+divert
diff --git a/third_party/gmp/mpn/ia64/invert_limb.asm b/third_party/gmp/mpn/ia64/invert_limb.asm
new file mode 100644
index 0000000..5effdda
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/invert_limb.asm
@@ -0,0 +1,105 @@
+dnl  IA-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Kevin Ryde.
+
+dnl  Copyright 2000, 2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C d = r32
+
+C           cycles
+C Itanium:    74
+C Itanium 2:  50+6
+
+C It should be possible to avoid the xmpy.hu and the following tests by
+C explicitly chopping in the last fma.  That would save about 10 cycles.
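+
+C As a reference, a small C sketch of the value computed (our
+C illustration, not GMP code; assumes a compiler with unsigned __int128).
+C The cast discards the implicit high limb, i.e. subtracts 2^64:
+C
+C	uint64_t invert_limb_ref (uint64_t d)  /* d normalized: bit 63 set */
+C	{
+C	  /* floor((2^128 - 1)/d) - 2^64 */
+C	  return (uint64_t) ((~(unsigned __int128) 0) / d);
+C	}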
+
+ASM_START()
+	.sdata
+	.align 16
+ifdef(`HAVE_DOUBLE_IEEE_LITTLE_ENDIAN',`
+.LC0:	data4 0x00000000, 0x80000000, 0x0000403f, 0x00000000	C 2^64
+.LC1:	data4 0x00000000, 0x80000000, 0x0000407f, 0x00000000	C 2^128
+
+',`ifdef(`HAVE_DOUBLE_IEEE_BIG_ENDIAN',`
+.LC0:	data4 0x403f8000, 0x00000000, 0x00000000, 0x00000000	C 2^64
+.LC1:	data4 0x407f8000, 0x00000000, 0x00000000, 0x00000000	C 2^128
+
+',`m4_error(`Oops, need to know float endianness
+')')')
+
+
+PROLOGUE(mpn_invert_limb)
+		C 00
+	addl		r14 = @gprel(.LC0), gp
+	addl		r15 = @gprel(.LC1), gp
+	setf.sig	f7 = r32
+	add		r9 = r32, r32		C check for d = 2^63
+	;;	C 01
+	ldfe		f10 = [r14]		C 2^64
+	ldfe		f8 = [r15]		C 2^128
+	cmp.eq		p6, p0 = 0, r9		C check for d = 2^63
+	mov		r8 = -1			C retval for 2^63
+   (p6)	br.ret.spnt.many b0
+	;;	C 07
+	fmpy.s1		f11 = f7, f10		C f11 = d * 2^64
+	fnma.s1		f6 = f7, f10, f8	C f6 = 2^128 - d * 2^64
+	;;	C 11
+	frcpa.s1	f8, p6 = f6, f7
+	;;	C 15
+   (p6)	fnma.s1		f9 = f7, f8, f1
+   (p6)	fmpy.s1		f10 = f6, f8
+	;;	C 19
+   (p6)	fmpy.s1		f11 = f9, f9
+   (p6)	fma.s1		f10 = f9, f10, f10
+	;;	C 23
+   (p6)	fma.s1		f8 = f9, f8, f8
+   (p6)	fma.s1		f9 = f11, f10, f10
+	;;	C 27
+   (p6)	fma.s1		f8 = f11, f8, f8
+   (p6)	fnma.s1		f10 = f7, f9, f6
+	;;	C 31
+   (p6)	fma.s1		f8 = f10, f8, f9
+	;;	C 35
+	fcvt.fxu.trunc.s1 f8 = f8
+	;;	C 39
+	getf.sig	r8 = f8
+	xmpy.hu		f10 = f8, f7		C di * d
+	;;	C 43
+	getf.sig	r14 = f10
+	andcm		r9 = -1, r32		C one's complement
+	;;	C 48
+	cmp.ltu		p6, p0 = r9, r14	C got overflow?
+	;;	C 49
+   (p6)	add		r8 = -1, r8		C adjust di down
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/logops_n.asm b/third_party/gmp/mpn/ia64/logops_n.asm
new file mode 100644
index 0000000..e4a2f61
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/logops_n.asm
@@ -0,0 +1,292 @@
+dnl  IA-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2
+C Itanium 2:    1
+
+C TODO
+C  * Use rp,rpx scheme of aors_n.asm to allow parallel stores (useful in
+C    wind-down code).
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`vp', `r34')
+define(`n', `r35')
+
+ifdef(`OPERATION_and_n',
+`	define(`func',`mpn_and_n')
+	define(`logop',		`and	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_andn_n',
+`	define(`func',`mpn_andn_n')
+	define(`logop',		`andcm	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_nand_n',
+`	define(`func',`mpn_nand_n')
+	define(`logop',		`and	$1 = $2, $3')
+	define(`notormov',	`sub	$1 = -1, $2')')
+ifdef(`OPERATION_ior_n',
+`	define(`func',`mpn_ior_n')
+	define(`logop',		`or	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_iorn_n',
+`	define(`func',`mpn_iorn_n')
+	define(`logop',		`andcm	$1 = $3, $2')
+	define(`notormov',	`sub	$1 = -1, $2')')
+ifdef(`OPERATION_nior_n',
+`	define(`func',`mpn_nior_n')
+	define(`logop',		`or	$1 = $2, $3')
+	define(`notormov',	`sub	$1 = -1, $2')')
+ifdef(`OPERATION_xor_n',
+`	define(`func',`mpn_xor_n')
+	define(`logop',		`xor	$1 = $2, $3')
+	define(`notormov',	`mov	$1 = $2')')
+ifdef(`OPERATION_xnor_n',
+`	define(`func',`mpn_xnor_n')
+	define(`logop',		`xor	$1 = $2, $3')
+	define(`notormov',	`sub	$1 = -1, $2')')
+
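+C As a reference (our illustration, not GMP code), the limb-wise
+C semantics the above selects, taking the iorn case: logop computes
+C vp[i] & ~up[i] and notormov complements it, giving up[i] | ~vp[i].
+C
+C	void iorn_ref (uint64_t *rp, const uint64_t *up,
+C	               const uint64_t *vp, long n)
+C	{
+C	  for (long i = 0; i < n; i++)
+C	    rp[i] = up[i] | ~vp[i];	/* == ~(vp[i] & ~up[i]) */
+C	}
+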
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp			C			M I
+	addp4	up = 0, up			C			M I
+	addp4	vp = 0, vp			C			M I
+	nop.m		0
+	nop.m		0
+	zxt4	n = n				C			I
+	;;
+')
+{.mmi
+	ld8		r10 = [up], 8		C			M
+	ld8		r11 = [vp], 8		C			M
+	mov.i		r2 = ar.lc		C			I0
+}
+{.mmi
+	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p14 = 4, n		C			M I
+	shr.u		n = n, 2		C			I0
+	;;
+}
+{.mmi
+	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}
+{.bbb
+   (p6)	br.dptk		.Lb01			C			B
+   (p7)	br.dptk		.Lb10			C			B
+   (p8)	br.dptk		.Lb11			C			B
+}
+
+.Lb00:	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	add		n = -2, n		C			M I
+	;;
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	;;
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+  (p15)	br.cond.dpnt	.grt4			C			B
+
+	logop(		r14, r10, r11)		C			M I
+	;;
+	logop(		r15, r17, r21)		C			M I
+	notormov(	r8, r14)		C			M I
+	br		.Lcj4			C			B
+
+.grt4:	logop(		r14, r10, r11)		C			M I
+	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	;;
+	logop(		r15, r17, r21)		C			M I
+	ld8		r17 = [up], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	notormov(	r8, r14)		C			M I
+	ld8		r21 = [vp], 8		C			M
+	br		.LL00			C			B
+
+.Lb01:	add		n = -1, n		C			M I
+	logop(		r15, r10, r11)		C			M I
+  (p15)	br.cond.dpnt	.grt1			C			B
+	;;
+
+	notormov(	r9, r15)		C			M I
+	br		.Lcj1			C			B
+
+.grt1:	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	;;
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	;;
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+	br.cloop.dptk	.grt5			C			B
+	;;
+
+	logop(		r14, r16, r20)		C			M I
+	notormov(	r9, r15)		C			M I
+	br		.Lcj5			C			B
+
+.grt5:	logop(		r14, r16, r20)		C			M I
+	ld8		r16 = [up], 8		C			M
+	notormov(	r9, r15)		C			M I
+	ld8		r20 = [vp], 8		C			M
+	br		.LL01			C			B
+
+.Lb10:	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+  (p15)	br.cond.dpnt	.grt2			C			B
+
+	logop(		r14, r10, r11)		C			M I
+	;;
+	logop(		r15, r19, r23)		C			M I
+	notormov(	r8, r14)		C			M I
+	br		.Lcj2			C			B
+
+.grt2:	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	add		n = -1, n		C			M I
+	;;
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	logop(		r14, r10, r11)		C			M I
+	;;
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	;;
+	logop(		r15, r19, r23)		C			M I
+	ld8		r19 = [up], 8		C			M
+	notormov(	r8, r14)		C			M I
+	ld8		r23 = [vp], 8		C			M
+	br.cloop.dptk	.Loop			C			B
+	br		.Lcj6			C			B
+
+.Lb11:	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	add		n = -1, n		C			M I
+	;;
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+	logop(		r15, r10, r11)		C			M I
+  (p15)	br.cond.dpnt	.grt3			C			B
+	;;
+
+	logop(		r14, r18, r22)		C			M I
+	notormov(	r9, r15)		C			M I
+	br		.Lcj3			C			B
+
+.grt3:	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	;;
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	mov.i		ar.lc = n		C			I0
+	;;
+	logop(		r14, r18, r22)		C			M I
+	ld8		r18 = [up], 8		C			M
+	notormov(	r9, r15)		C			M I
+	ld8		r22 = [vp], 8		C			M
+	br		.LL11			C			B
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Loop:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r16, r20)		C			M I
+	notormov(	r9, r15)		C			M I
+	ld8		r16 = [up], 8		C			M
+	ld8		r20 = [vp], 8		C			M
+	nop.b		0
+	;;
+.LL01:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r17, r21)		C			M I
+	notormov(	r8, r14)		C			M I
+	ld8		r17 = [up], 8		C			M
+	ld8		r21 = [vp], 8		C			M
+	nop.b		0
+	;;
+.LL00:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r18, r22)		C			M I
+	notormov(	r9, r15)		C			M I
+	ld8		r18 = [up], 8		C			M
+	ld8		r22 = [vp], 8		C			M
+	nop.b		0
+	;;
+.LL11:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r19, r23)		C			M I
+	notormov(	r8, r14)		C			M I
+	ld8		r19 = [up], 8		C			M
+	ld8		r23 = [vp], 8		C			M
+	br.cloop.dptk	.Loop	;;		C			B
+C *** MAIN LOOP END ***
+
+.Lcj6:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r16, r20)		C			M I
+	notormov(	r9, r15)		C			M I
+	;;
+.Lcj5:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r17, r21)		C			M I
+	notormov(	r8, r14)		C			M I
+	;;
+.Lcj4:	st8		[rp] = r8, 8		C			M
+	logop(		r14, r18, r22)		C			M I
+	notormov(	r9, r15)		C			M I
+	;;
+.Lcj3:	st8		[rp] = r9, 8		C			M
+	logop(		r15, r19, r23)		C			M I
+	notormov(	r8, r14)		C			M I
+	;;
+.Lcj2:	st8		[rp] = r8, 8		C			M
+	notormov(	r9, r15)		C			M I
+	;;
+.Lcj1:	st8		[rp] = r9, 8		C			M
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/lorrshift.asm b/third_party/gmp/mpn/ia64/lorrshift.asm
new file mode 100644
index 0000000..694aaf0
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/lorrshift.asm
@@ -0,0 +1,358 @@
+dnl  IA-64 mpn_lshift/mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2
+C Itanium 2:    1
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
+C these instructions causes a 10 cycle replay trap on Itanium.
+
+C The ld8 scheduling should probably be decreased to make the function smaller.
+C Good lfetch will make sure we never stall anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
+
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`cnt',`r35')
+
+define(`tnc',`r9')
+
+ifdef(`OPERATION_lshift',`
+	define(`FSH',`shl')
+	define(`BSH',`shr.u')
+	define(`UPD',`-8')
+	define(`POFF',`-512')
+	define(`PUPD',`-32')
+	define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+	define(`FSH',`shr.u')
+	define(`BSH',`shl')
+	define(`UPD',`8')
+	define(`POFF',`512')
+	define(`PUPD',`32')
+	define(`func',`mpn_rshift')
+')
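+
+C As a reference (our illustration, not GMP code), the lshift case in C;
+C limbs are processed from the top, as the UPD = -8 updates above do, and
+C the bits shifted out of the top limb are returned (1 <= cnt <= 63):
+C
+C	uint64_t lshift_ref (uint64_t *rp, const uint64_t *up,
+C	                     long n, unsigned cnt)
+C	{
+C	  uint64_t ret = up[n - 1] >> (64 - cnt);
+C	  for (long i = n - 1; i > 0; i--)
+C	    rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
+C	  rp[0] = up[0] << cnt;
+C	  return ret;
+C	}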
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp		C			M I
+	addp4	up = 0, up		C		M I
+	sxt4	n = n			C		M I
+	nop.m		0
+	nop.m		0
+	zxt4	cnt = cnt		C		I
+	;;
+')
+
+ {.mmi;	cmp.lt	p14, p15 = 4, n		C		M I
+	and	r14 = 3, n		C		M I
+	mov.i	r2 = ar.lc		C		I0
+}{.mmi;	add	r15 = -1, n		C		M I
+	sub	tnc = 64, cnt		C		M I
+	add	r16 = -5, n
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C		M I
+	cmp.eq	p7, p0 = 2, r14		C		M I
+	shr.u	n = r16, 2		C		I0
+}{.mmi;	cmp.eq	p8, p0 = 3, r14		C		M I
+ifdef(`OPERATION_lshift',
+`	shladd	up = r15, 3, up		C		M I
+	shladd	rp = r15, 3, rp')	C		M I
+	;;
+}{.mmi;	add	r11 = POFF, up		C		M I
+	ld8	r10 = [up], UPD		C		M01
+	mov.i	ar.lc = n		C		I0
+}{.bbb;
+   (p6)	br.dptk	.Lb01
+   (p7)	br.dptk	.Lb10
+   (p8)	br.dptk	.Lb11
+	;; }
+
+.Lb00:	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	FSH	r24 = r10, cnt
+	BSH	r25 = r19, tnc
+  (p14)	br.cond.dptk	.grt4
+	;;
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	BSH	r23 = r10, tnc
+	br	.Lr4
+
+.grt4:	ld8	r18 = [up], UPD
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r17, cnt
+	BSH	r23 = r18, tnc
+	;;
+	or	r14 = r25, r24
+	ld8	r17 = [up], UPD
+	br.cloop.dpnt	.Ltop
+	br	.Lbot
+
+.Lb01:
+  (p15)	BSH	r8 = r10, tnc		C function return value	I
+  (p15)	FSH	r22 = r10, cnt		C		I
+  (p15)	br.cond.dptk	.Lr1		C return	B
+
+.grt1:	ld8	r18 = [up], UPD
+	;;
+	ld8	r19 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r10, cnt
+	BSH	r23 = r18, tnc
+	;;
+	ld8	r17 = [up], UPD
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	br.cloop.dpnt	.grt5
+	;;
+	or	r15 = r23, r22
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	br	.Lr5
+
+.grt5:	ld8	r18 = [up], UPD
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r15 = r23, r22
+	ld8	r16 = [up], UPD
+	br	.LL01
+
+
+.Lb10:	ld8	r17 = [up], UPD
+  (p14)	br.cond.dptk	.grt2
+
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	FSH	r20 = r10, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r21, r20
+	FSH	r22 = r17, cnt
+	br	.Lr2			C return
+
+.grt2:	ld8	r18 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r10, cnt
+	BSH	r21 = r17, tnc
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r17, cnt
+	BSH	r23 = r18, tnc
+	;;
+ {.mmi;	ld8	r17 = [up], UPD
+	or	r14 = r21, r20
+	FSH	r24 = r18, cnt
+}{.mib;	nop	0
+	BSH	r25 = r19, tnc
+	br.cloop.dpnt	.grt6
+	;; }
+
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	br	.Lr6
+
+.grt6:	ld8	r18 = [up], UPD
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	br	.LL10
+
+
+.Lb11:	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc		C function return value
+  (p14)	br.cond.dptk	.grt3
+	;;
+
+	FSH	r26 = r10, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r15 = r27, r26
+	FSH	r22 = r17, cnt
+	br	.Lr3			C return
+
+.grt3:	ld8	r18 = [up], UPD
+	FSH	r26 = r10, cnt
+	BSH	r27 = r16, tnc
+	;;
+	ld8	r19 = [up], UPD
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	ld8	r16 = [up], UPD
+	FSH	r22 = r17, cnt
+	BSH	r23 = r18, tnc
+	;;
+	ld8	r17 = [up], UPD
+	br.cloop.dpnt	.grt7
+
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	br	.Lr7
+
+.grt7:	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	ld8	r18 = [up], UPD
+	br	.LL11
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Ltop:
+ {.mmi;	st8	[rp] = r14, UPD		C M2
+	or	r15 = r27, r26		C M3
+	FSH	r24 = r18, cnt		C I0
+}{.mmi;	ld8	r18 = [up], UPD		C M1
+	lfetch	[r11], PUPD
+	BSH	r25 = r19, tnc		C I1
+	;; }
+.LL11:
+ {.mmi;	st8	[rp] = r15, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop.m	0
+	BSH	r27 = r16, tnc
+	;; }
+.LL10:
+ {.mmi;	st8	[rp] = r14, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop.m	0
+	BSH	r21 = r17, tnc
+	;; }
+.LL01:
+ {.mmi;	st8	[rp] = r15, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	.Ltop
+	;; }
+C *** MAIN LOOP END ***
+
+.Lbot:
+ {.mmi;	st8	[rp] = r14, UPD
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mib;	nop	0
+	BSH	r25 = r19, tnc
+	nop	0
+	;; }
+.Lr7:
+ {.mmi;	st8	[rp] = r15, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mib;	nop	0
+	BSH	r27 = r16, tnc
+	nop	0
+	;; }
+.Lr6:
+ {.mmi;	st8	[rp] = r14, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mib;	nop	0
+	BSH	r21 = r17, tnc
+	nop	0
+	;; }
+.Lr5:	st8	[rp] = r15, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	;;
+.Lr4:	st8	[rp] = r14, UPD
+	or	r15 = r27, r26
+	;;
+.Lr3:	st8	[rp] = r15, UPD
+	or	r14 = r21, r20
+	;;
+.Lr2:	st8	[rp] = r14, UPD
+	;;
+.Lr1:	st8	[rp] = r22, UPD		C		M23
+	mov	ar.lc = r2		C		I0
+	br.ret.sptk.many b0		C		B
+EPILOGUE(func)
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/lshiftc.asm b/third_party/gmp/mpn/ia64/lshiftc.asm
new file mode 100644
index 0000000..e8cec87
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/lshiftc.asm
@@ -0,0 +1,463 @@
+dnl  IA-64 mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1.25
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
+C these instructions causes a 10 cycle replay trap on Itanium.
+
+C The ld8 scheduling should probably be decreased to make the function smaller.
+C Good lfetch will make sure we never stall anyway.
+
+C We should actually issue the first ld8 at cycle 0, and the first BSH/FSH pair
+C at cycle 2.  Judicious use of predicates could allow us to issue more ld8's
+C in the prologue.
+
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`cnt',`r35')
+
+define(`tnc',`r9')
+
+define(`FSH',`shl')
+define(`BSH',`shr.u')
+define(`UPD',`-8')
+define(`POFF',`-512')
+define(`PUPD',`-32')
+define(`func',`mpn_lshiftc')
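+
+C As a reference (our illustration, not GMP code): like lshift, but the
+C stored limbs are complemented; the returned out-shifted bits are not
+C complemented, matching the plain BSH into r8 below (1 <= cnt <= 63):
+C
+C	uint64_t lshiftc_ref (uint64_t *rp, const uint64_t *up,
+C	                      long n, unsigned cnt)
+C	{
+C	  uint64_t ret = up[n - 1] >> (64 - cnt);
+C	  for (long i = n - 1; i > 0; i--)
+C	    rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
+C	  rp[0] = ~(up[0] << cnt);
+C	  return ret;
+C	}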
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',
+`	addp4	rp = 0, rp		C				M I
+	addp4	up = 0, up		C				M I
+	sxt4	n = n			C				M I
+	nop.m		0
+	nop.m		0
+	zxt4	cnt = cnt		C				I
+	;;
+')
+
+ {.mmi;	nop	0			C				M I
+	and	r14 = 3, n		C				M I
+	mov.i	r2 = ar.lc		C				I0
+}{.mmi;	add	r15 = -1, n		C				M I
+	sub	tnc = 64, cnt		C				M I
+	nop	0
+	;;
+}{.mmi;	cmp.eq	p6, p0 = 1, r14		C				M I
+	cmp.eq	p7, p0 = 2, r14		C				M I
+	shr.u	n = r15, 2		C				I0
+}{.mmi;	cmp.eq	p8, p0 = 3, r14		C				M I
+	shladd	up = r15, 3, up		C				M I
+	shladd	rp = r15, 3, rp		C				M I
+	;;
+}{.mmi;	add	r11 = POFF, up		C				M I
+	ld8	r10 = [up], UPD		C				M01
+	mov.i	ar.lc = n		C				I0
+}{.bbb;
+   (p6)	br.dptk	.Lb01
+   (p7)	br.dptk	.Lb10
+   (p8)	br.dptk	.Lb11
+	;; }
+
+.Lb00:
+	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc
+	br.cloop.dptk	L(gt4)
+	;;
+	FSH	r24 = r10, cnt
+	BSH	r25 = r19, tnc
+	;;
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	;;
+	or	r15 = r27, r26
+	sub	r31 = -1, r14
+	br	.Lr4
+
+L(gt4):
+ {.mmi;	nop	0
+	nop	0
+	FSH	r24 = r10, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	nop	0
+	nop	0
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop	0
+	BSH	r27 = r16, tnc
+	;; }
+ {.mmi;	nop	0
+	nop	0
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop	0
+	BSH	r21 = r17, tnc
+	;; }
+ {.mmi;	nop	0
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt8)
+	;; }
+ {.mmi;	nop	0
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r25 = r19, tnc
+	br	.Lr8 }
+
+L(gt8):
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	ld8	r18 = [up], UPD
+	sub	r31 = -1, r14
+	BSH	r25 = r19, tnc
+	br	.LL00
+
+.Lb01:
+	br.cloop.dptk	L(gt1)
+	;;
+	BSH	r8 = r10, tnc
+	FSH	r22 = r10, cnt
+	;;
+	sub	r31 = -1, r22
+	br	.Lr1
+	;;
+L(gt1):
+	ld8	r18 = [up], UPD
+	BSH	r8 = r10, tnc
+	FSH	r22 = r10, cnt
+	;;
+	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt5)
+	;;
+	nop	0
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	;;
+	nop	0
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	sub	r31 = -1, r15
+	br	.Lr5
+
+L(gt5):
+ {.mmi;	nop	0
+	nop	0
+	FSH	r24 = r18, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	nop	0
+	nop	0
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop	0
+	BSH	r27 = r16, tnc
+	;; }
+ {.mmi;	nop	0
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop	0
+	BSH	r21 = r17, tnc
+	;; }
+ {.mmi;	or	r14 = r25, r24
+	sub	r31 = -1, r15
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br	L(end)
+	;; }
+
+.Lb10:
+	ld8	r17 = [up], UPD
+	br.cloop.dptk	L(gt2)
+	;;
+	BSH	r8 = r10, tnc
+	FSH	r20 = r10, cnt
+	;;
+	BSH	r21 = r17, tnc
+	FSH	r22 = r17, cnt
+	;;
+	or	r14 = r21, r20
+	;;
+	sub	r31 = -1, r14
+	br	.Lr2
+	;;
+L(gt2):
+	ld8	r18 = [up], UPD
+	BSH	r8 = r10, tnc
+	FSH	r20 = r10, cnt
+	;;
+	ld8	r19 = [up], UPD
+	;;
+	ld8	r16 = [up], UPD
+	BSH	r21 = r17, tnc
+	FSH	r22 = r17, cnt
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt6)
+	;;
+	nop	0
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	;;
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+	BSH	r27 = r16, tnc
+	;;
+ {.mmi;	nop	0
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r21 = r17, tnc
+	br	.Lr6
+	;; }
+L(gt6):
+ {.mmi;	nop	0
+	nop	0
+	FSH	r24 = r18, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi; nop   0
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	nop	0
+	BSH	r27 = r16, tnc
+	;; }
+ {.mmi;	or	r15 = r23, r22
+	sub	r31 = -1, r14
+	FSH	r20 = r16, cnt
+}{.mib;	ld8	r16 = [up], UPD
+	BSH	r21 = r17, tnc
+	br	.LL10
+}
+
+.Lb11:
+	ld8	r16 = [up], UPD
+	;;
+	ld8	r17 = [up], UPD
+	BSH	r8 = r10, tnc
+	FSH	r26 = r10, cnt
+	br.cloop.dptk	L(gt3)
+	;;
+	BSH	r27 = r16, tnc
+	;;
+	FSH	r20 = r16, cnt
+	BSH	r21 = r17, tnc
+	;;
+	FSH	r22 = r17, cnt
+	;;
+	or	r15 = r27, r26
+	;;
+	or	r14 = r21, r20
+	sub	r31 = -1, r15
+	br	.Lr3
+	;;
+L(gt3):
+	ld8	r18 = [up], UPD
+	;;
+	ld8	r19 = [up], UPD
+	BSH	r27 = r16, tnc
+	;;
+ {.mmi;	nop	0
+	nop	0
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	nop	0
+	BSH	r21 = r17, tnc
+	;;
+}{.mmi;	nop	0
+	nop	0
+	FSH	r22 = r17, cnt
+}{.mib;	ld8	r17 = [up], UPD
+	BSH	r23 = r18, tnc
+	br.cloop.dptk	L(gt7)
+	;; }
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+	BSH	r25 = r19, tnc
+	;;
+ {.mmi;	nop	0
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mib;	sub	r31 = -1, r15
+	BSH	r27 = r16, tnc
+	br	.Lr7
+}
+L(gt7):
+ {.mmi;	nop	0
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mmi;	ld8	r18 = [up], UPD
+	nop	0
+	BSH	r25 = r19, tnc
+	;; }
+ {.mmi;	or	r14 = r21, r20
+	sub	r31 = -1, r15
+	FSH	r26 = r19, cnt
+}{.mib;	ld8	r19 = [up], UPD
+	BSH	r27 = r16, tnc
+	br	.LL11
+}
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):
+.LL01:
+ {.mmi;	st8	[rp] = r31, UPD		C M2
+	or	r15 = r27, r26		C M3
+	FSH	r24 = r18, cnt		C I0
+}{.mmi;	ld8	r18 = [up], UPD		C M0
+	sub	r31 = -1, r14		C M1
+	BSH	r25 = r19, tnc		C I1
+	;; }
+.LL00:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mmi;	ld8	r19 = [up], UPD
+	sub	r31 = -1, r15
+	BSH	r27 = r16, tnc
+	;; }
+.LL11:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mmi;	ld8	r16 = [up], UPD
+	sub	r31 = -1, r14
+	BSH	r21 = r17, tnc
+	;; }
+.LL10:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+}{.mmi;	ld8	r17 = [up], UPD
+	sub	r31 = -1, r15
+	BSH	r23 = r18, tnc
+	;; }
+L(end):	lfetch		[r11], PUPD
+	br.cloop.dptk	L(top)
+C *** MAIN LOOP END ***
+
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r15 = r27, r26
+	FSH	r24 = r18, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r25 = r19, tnc
+	nop	0
+	;; }
+.Lr8:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r14 = r21, r20
+	FSH	r26 = r19, cnt
+}{.mib;	sub	r31 = -1, r15
+	BSH	r27 = r16, tnc
+	nop	0
+	;; }
+.Lr7:
+ {.mmi;	st8	[rp] = r31, UPD
+	or	r15 = r23, r22
+	FSH	r20 = r16, cnt
+}{.mib;	sub	r31 = -1, r14
+	BSH	r21 = r17, tnc
+	nop	0
+	;; }
+.Lr6:	st8	[rp] = r31, UPD
+	or	r14 = r25, r24
+	FSH	r22 = r17, cnt
+	sub	r31 = -1, r15
+	;;
+.Lr5:	st8	[rp] = r31, UPD
+	or	r15 = r27, r26
+	sub	r31 = -1, r14
+	;;
+.Lr4:	st8	[rp] = r31, UPD
+	or	r14 = r21, r20
+	sub	r31 = -1, r15
+	;;
+.Lr3:	st8	[rp] = r31, UPD
+	sub	r31 = -1, r14
+	;;
+.Lr2:	st8	[rp] = r31, UPD
+	sub	r31 = -1, r22
+	;;
+.Lr1:	st8	[rp] = r31, UPD		C				M23
+	mov	ar.lc = r2		C				I0
+	br.ret.sptk.many b0		C				B
+EPILOGUE(func)
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/mod_34lsub1.asm b/third_party/gmp/mpn/ia64/mod_34lsub1.asm
new file mode 100644
index 0000000..7789117
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mod_34lsub1.asm
@@ -0,0 +1,237 @@
+dnl  IA-64 mpn_mod_34lsub1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    1
+
+
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n',  `r33')
+
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
+define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
+define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
+
+C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
+C with a more sophisticated implementation.  If we're really crazy, we could
+C super-unroll, storing carries just in predicate registers, then copy them to
+C a general register, and population count them from there.  That'd bring us
+C close to 3 insn/limb, for nearly 0.5 c/l.
+
+C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
+C We therefore use a plain while-style loop:
+C	add		n = -3, n
+C	cmp.le		p9, p0 = 3, n
+C  (p9)	br.cond		.Loop
+C Alternatively, we could table n/3 for, say, n < 256, and predicate the
+C 16-cycle code.
+
+C The summing-up code at the end was written quickly, and could surely be
+C vastly improved.
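+
+C As a reference (our illustration, not GMP code): each limb h*2^48 + l
+C contributes h + l, since 2^48 == 1 (mod 2^48 - 1).  The return value
+C need only be congruent to {up,n}, not fully reduced; the sketch below
+C happens to reduce fully (and assumes unsigned __int128):
+C
+C	uint64_t mod_34lsub1_ref (const uint64_t *up, long n)
+C	{
+C	  unsigned __int128 acc = 0;
+C	  for (long i = 0; i < n; i++)
+C	    acc += (up[i] & 0xffffffffffff) + (up[i] >> 48);
+C	  return (uint64_t) (acc % 0xffffffffffff);
+C	}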
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4		up = 0, up		C			M I
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+
+ifelse(0,1,`
+	movl		r14 = 0xAAAAAAAAAAAAAAAB
+	;;
+	setf.sig	f6 = r14
+	setf.sig	f7 = r33
+	;;
+	xmpy.hu		f6 = f6, f7
+	;;
+	getf.sig	r8 = f6
+	;;
+	shr.u		r8 = r8, 1		C Loop count
+	;;
+	mov.i		ar.lc = r8
+')
+
+	ld8	u0 = [up], 8
+	cmp.ne	p9, p0 = 1, n
+  (p9)	br	L(gt1)
+	;;
+	shr.u	r8 = u0, 48
+	dep.z	r27 = u0, 0, 48
+	;;
+	add	r8 = r8, r27
+	br.ret.sptk.many b0
+
+
+L(gt1):
+ {.mmi;	nop.m	0
+	mov	a0 = 0
+	add	n = -2, n
+}{.mmi;	mov	c0 = 0
+	mov	c1 = 0
+	mov	c2 = 0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+	mov	a1 = 0
+	cmp.ltu	p6, p0 = r0, r0		C clear p6
+}{.mmb;	cmp.gt	p9, p0 = 3, n
+	mov	a2 = 0
+  (p9)	br.cond.dptk	L(end)
+	;;
+}
+	ALIGN(32)
+L(top):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	add	n = -3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+}{.mmb;	sub	a1 = a1, u1
+	cmp.le	p9, p0 = 3, n
+	nop.b	0
+	;;
+}{.mmi;	ld8	u1 = [up], 8
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+}{.mmb;	sub	a2 = a2, u2
+	nop.m	0
+dnl	br.cloop.dptk	L(top)
+  (p9)	br.cond.dptk	L(top)
+	;;
+}
+L(end):
+	cmp.eq	p10, p0 = 0, n
+	cmp.eq	p11, p0 = 1, n
+  (p10)	br	L(0)
+
+L(2):
+ {.mmi;	ld8	u2 = [up], 8
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+}{.mmb;	sub	a0 = a0, u0
+	nop.m	0
+  (p11)	br	L(1)
+	;;
+}	ld8	u0 = [up], 8
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	br	L(com)
+
+
+L(1):
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+	cmp.ltu	p6, p0 = a2, u2
+	sub	a2 = a2, u2
+	;;
+  (p6)	add	c0 = 1, c0
+	br	L(com)
+
+
+L(0):
+  (p6)	add	c0 = 1, c0
+	cmp.ltu	p7, p0 = a0, u0
+	sub	a0 = a0, u0
+	;;
+  (p7)	add	c1 = 1, c1
+	cmp.ltu	p8, p0 = a1, u1
+	sub	a1 = a1, u1
+	;;
+  (p8)	add	c2 = 1, c2
+
+L(com):
+C |     a2    |     a1    |     a0    |
+C |        |        |        |        |
+	shr.u	r24 = a0, 48		C 16 bits
+	shr.u	r25 = a1, 32		C 32 bits
+	shr.u	r26 = a2, 16		C 48 bits
+	;;
+	shr.u	r10 = c0, 48		C 16 bits, always zero
+	shr.u	r11 = c1, 32		C 32 bits
+	shr.u	r30 = c2, 16		C 48 bits
+	;;
+	dep.z	r27 = a0,  0, 48	C 48 bits
+	dep.z	r28 = a1, 16, 32	C 48 bits
+	dep.z	r29 = a2, 32, 16	C 48 bits
+	dep.z	r31 = c0,  0, 48	C 48 bits
+	dep.z	r14 = c1, 16, 32	C 48 bits
+	dep.z	r15 = c2, 32, 16	C 48 bits
+	;;
+ {.mmi;	add	r24 = r24, r25
+	add	r26 = r26, r27
+	add	r28 = r28, r29
+}{.mmi;	add	r10 = r10, r11
+	add	r30 = r30, r31
+	add	r14 = r14, r15
+	;;
+}
+	movl	r8 = 0xffffffffffff0
+	add	r24 = r24, r26
+	add	r10 = r10, r30
+	;;
+	add	r24 = r24, r28
+	add	r10 = r10, r14
+	;;
+	sub	r8 = r8, r24
+	;;
+	add	r8 = r8, r10
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/mode1o.asm b/third_party/gmp/mpn/ia64/mode1o.asm
new file mode 100644
index 0000000..14d5e81
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mode1o.asm
@@ -0,0 +1,342 @@
+dnl  Itanium-2 mpn_modexact_1c_odd -- mpn by 1 exact remainder.
+
+dnl  Contributed to the GNU project by Kevin Ryde.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C            cycles/limb
+C Itanium:      15
+C Itanium 2:     8
+
+
+dnl  Usage: ABI32(`code')
+dnl
+dnl  Emit the given code only under HAVE_ABI_32.
+dnl
+define(ABI32,
+m4_assert_onearg()
+`ifdef(`HAVE_ABI_32',`$1')')
+
+
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C The modexact algorithm is usually conceived as a dependent chain
+C
+C	l = src[i] - c
+C	q = low(l * inverse)
+C	c = high(q*divisor) + (src[i]<c)
+C
+C but we can work the src[i]-c into an xma by calculating si=src[i]*inverse
+C separately (off the dependent chain) and using
+C
+C	q = low(c * inverse + si)
+C	c = high(q*divisor + c)
+C
+C This means the dependent chain is simply xma.l followed by xma.hu, for a
+C total of 8 cycles/limb on itanium-2.
+C
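+C As a reference (our illustration, not GMP code), the whole function in
+C the first style above; "binvert" stands for whatever computes the
+C inverse of d mod 2^64 and is assumed given here:
+C
+C	uint64_t modexact_1c_odd_ref (const uint64_t *src, long size,
+C	                              uint64_t d, uint64_t c)  /* d odd */
+C	{
+C	  uint64_t inv = binvert (d);	/* hypothetical: inv*d == 1 mod 2^64 */
+C	  for (long i = 0; i < size; i++)
+C	    {
+C	      uint64_t s = src[i];
+C	      uint64_t q = (s - c) * inv;
+C	      c = (uint64_t) (((unsigned __int128) q * d) >> 64) + (s < c);
+C	    }
+C	  return c;
+C	}
+C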
+C The reason xma.hu works for the new c is that the low of q*divisor is
+C src[i]-c (being the whole purpose of the q generated, and it can be
+C verified algebraically).  If there was an underflow from src[i]-c, then
+C there will be an overflow from (src-c)+c, thereby adding 1 to the new c
+C the same as the borrow bit (src[i]<c) gives in the first style shown.
+C
+C Incidentally, fcmp is not an option for treating src[i]-c, since it
+C apparently traps to the kernel for unnormalized operands like those used
+C and generated by ldf8 and xma.  On one GNU/Linux system it took about 1200
+C cycles.
+C
+C
+C First Limb:
+C
+C The first limb uses q = (src[0]-c) * inverse shown in the first style.
+C This lets us get the first q as soon as the inverse is ready, without
+C going through si=s*inverse.  Basically at the start we have c and can use
+C it while waiting for the inverse, whereas for the second and subsequent
+C limbs it's the other way around, ie. we have the inverse and are waiting
+C for c.
+C
+C At .Lentry the first two instructions in the loop have been done already.
+C The load of f11=src[1] at the start (predicated on size>=2), and the
+C calculation of q by the initial different scheme.
+C
+C
+C Entry Sequence:
+C
+C In the entry sequence, the critical path is the calculation of the
+C inverse, so this is begun first and optimized.  Apart from that, ar.lc is
+C established nice and early so the br.cloop's should predict perfectly.
+C And the load for the low limbs src[0] and src[1] can be initiated long
+C ahead of where they're needed.
+C
+C
+C Inverse Calculation:
+C
+C The initial 8-bit inverse is calculated using a table lookup.  If it hits
+C L1 (which is likely if we're called several times) then it should take a
+C total of 4 cycles, otherwise hopefully L2 for 9 cycles.  This is considered
+C the best approach, on balance.  It could be done bitwise, but that would
+C probably be about 14 cycles (2 per bit beyond the first couple).  Or it
+C could be taken from 4 bits to 8 with xmpy doubling as used beyond 8 bits,
+C but that would be about 11 cycles.
+C
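+C A minimal sketch (ours, not GMP code) of each doubling step used for
+C the 8-bit-to-64-bit part: if i*d == 1 (mod 2^k) then
+C
+C	i2 = 2*i - i*i*d;	/* == i*(2 - i*d), an inverse mod 2^(2k) */
+C
+C which the code below forms as the xma i*i*(-d) + 2*i.
+C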
+C The table is not the same as binvert_limb_table, instead it's 256 bytes,
+C designed to be indexed by the low byte of the divisor.  The divisor is
+C always odd, so the relevant data is every second byte in the table.  The
+C padding lets us use zxt1 instead of extr.u, the latter would cost an extra
+C cycle because it must go down I0, and we're using the first I0 slot to get
+C ip.  The extra 128 bytes of padding should be insignificant compared to
+C typical ia64 code bloat.
+C
+C Having the table in .text allows us to use IP-relative addressing,
+C avoiding a fetch from ltoff.  .rodata is apparently not suitable for
+C IP-relative use; it gets a linker relocation overflow on GNU/Linux.
+C
+C
+C Load Scheduling:
+C
+C In the main loop, the data loads are scheduled for an L2 hit, which means
+C 6 cycles for the data ready to use.  In fact we end up 7 cycles ahead.  In
+C any case that scheduling is achieved simply by doing the load (and xmpy.l
+C for "si") in the immediately preceding iteration.
+C
+C The main loop requires size >= 2, and we handle size==1 by an initial
+C br.cloop to enter the loop only if size>1.  Since ar.lc is established
+C early, this should predict perfectly.
+C
+C
+C Not done:
+C
+C Consideration was given to using a plain "(src[0]-c) % divisor" for
+C size==1, but cycle counting suggests about 50 for the sort of approach
+C taken by gcc __umodsi3, versus about 47 for the modexact.  (Both assuming
+C L1 hits for their respective fetching.)
+C
+C Consideration was given to a test for high<divisor and replacing the last
+C loop iteration with instead c-=src[size-1] followed by c+=d if underflow.
+C Branching on high<divisor wouldn't be good since a mispredict would cost
+C more than the loop iteration saved, and the condition is of course data
+C dependent.  So the theory would be to shorten the loop count if
+C high<divisor, and predicate extra operations at the end.  That would mean
+C a gain of 6 when high<divisor, or a cost of 2 if not.
+C
+C Whether such a tradeoff is a win on average depends on assumptions about
+C how many bits in the high and the divisor.  If both are uniformly
+C distributed then high<divisor about 50% of the time.  But smallish
+C divisors (less chance of high<divisor) might be more likely from
+C applications (mpz_divisible_ui, mpz_gcd_ui, etc).  Though biggish divisors
+C would be normal internally from say mpn/generic/perfsqr.c.  On balance,
+C for the moment, it's felt the gain is not really enough to be worth the
+C trouble.
+C
+C
+C Enhancement:
+C
+C Process two source limbs per iteration using a two-limb inverse and a
+C sequence like
+C
+C	ql  = low (c * il + sil)	quotient low limb
+C	qlc = high(c * il + sil)
+C	qh1 = low (c * ih + sih)	quotient high, partial
+C
+C	cl = high (ql * d + c)		carry out of low
+C	qh = low (qlc * 1 + qh1)	quotient high limb
+C
+C	new c = high (qh * d + cl)	carry out of high
+C
+C This would be 13 cycles/iteration, giving 6.5 cycles/limb.  The two limb
+C s*inverse as sih:sil = sh:sl * ih:il would be calculated off the dependent
+C chain with 4 multiplies.  The bigger inverse would take extra time to
+C calculate, but a one limb iteration to handle an odd size could be done as
+C soon as 64-bits of inverse were ready.
+C
+C Perhaps this could even extend to a 3 limb inverse, which might promise 17
+C or 18 cycles for 3 limbs, giving 5.66 or 6.0 cycles/limb.
+C
+
+ASM_START()
+	.explicit
+
+	.text
+	.align	32
+.Ltable:
+data1	0,0x01, 0,0xAB, 0,0xCD, 0,0xB7, 0,0x39, 0,0xA3, 0,0xC5, 0,0xEF
+data1	0,0xF1, 0,0x1B, 0,0x3D, 0,0xA7, 0,0x29, 0,0x13, 0,0x35, 0,0xDF
+data1	0,0xE1, 0,0x8B, 0,0xAD, 0,0x97, 0,0x19, 0,0x83, 0,0xA5, 0,0xCF
+data1	0,0xD1, 0,0xFB, 0,0x1D, 0,0x87, 0,0x09, 0,0xF3, 0,0x15, 0,0xBF
+data1	0,0xC1, 0,0x6B, 0,0x8D, 0,0x77, 0,0xF9, 0,0x63, 0,0x85, 0,0xAF
+data1	0,0xB1, 0,0xDB, 0,0xFD, 0,0x67, 0,0xE9, 0,0xD3, 0,0xF5, 0,0x9F
+data1	0,0xA1, 0,0x4B, 0,0x6D, 0,0x57, 0,0xD9, 0,0x43, 0,0x65, 0,0x8F
+data1	0,0x91, 0,0xBB, 0,0xDD, 0,0x47, 0,0xC9, 0,0xB3, 0,0xD5, 0,0x7F
+data1	0,0x81, 0,0x2B, 0,0x4D, 0,0x37, 0,0xB9, 0,0x23, 0,0x45, 0,0x6F
+data1	0,0x71, 0,0x9B, 0,0xBD, 0,0x27, 0,0xA9, 0,0x93, 0,0xB5, 0,0x5F
+data1	0,0x61, 0,0x0B, 0,0x2D, 0,0x17, 0,0x99, 0,0x03, 0,0x25, 0,0x4F
+data1	0,0x51, 0,0x7B, 0,0x9D, 0,0x07, 0,0x89, 0,0x73, 0,0x95, 0,0x3F
+data1	0,0x41, 0,0xEB, 0,0x0D, 0,0xF7, 0,0x79, 0,0xE3, 0,0x05, 0,0x2F
+data1	0,0x31, 0,0x5B, 0,0x7D, 0,0xE7, 0,0x69, 0,0x53, 0,0x75, 0,0x1F
+data1	0,0x21, 0,0xCB, 0,0xED, 0,0xD7, 0,0x59, 0,0xC3, 0,0xE5, 0,0x0F
+data1	0,0x11, 0,0x3B, 0,0x5D, 0,0xC7, 0,0x49, 0,0x33, 0,0x55, 0,0xFF
+
+
+PROLOGUE(mpn_modexact_1c_odd)
+
+	C r32	src
+	C r33	size
+	C r34	divisor
+	C r35	carry
+
+	.prologue
+.Lhere:
+{ .mmi;	add	r33 = -1, r33		C M0  size-1
+	mov	r14 = 2			C M1  2
+	mov	r15 = ip		C I0  .Lhere
+}{.mmi;	setf.sig f6 = r34		C M2  divisor
+	setf.sig f9 = r35		C M3  carry
+	zxt1	r3 = r34		C I1  divisor low byte
+}	;;
+
+{ .mmi;	add	r3 = .Ltable-.Lhere, r3	C M0  table offset ip and index
+	sub	r16 = 0, r34		C M1  -divisor
+	.save	ar.lc, r2
+	mov	r2 = ar.lc		C I0
+}{.mmi;	.body
+	setf.sig f13 = r14		C M2  2 in significand
+	mov	r17 = -1		C M3  -1
+ABI32(`	zxt4	r33 = r33')		C I1  size extend
+}	;;
+
+{ .mmi;	add	r3 = r3, r15		C M0  table entry address
+ABI32(` addp4	r32 = 0, r32')		C M1  src extend
+	mov	ar.lc = r33		C I0  size-1 loop count
+}{.mmi;	setf.sig f12 = r16		C M2  -divisor
+	setf.sig f8 = r17		C M3  -1
+}	;;
+
+{ .mmi;	ld1	r3 = [r3]		C M0  inverse, 8 bits
+	ldf8	f10 = [r32], 8		C M1  src[0]
+	cmp.ne	p6,p0 = 0, r33		C I0  test size!=1
+}	;;
+
+	C Wait for table load.
+	C Hope for an L1 hit of 1 cycle to ALU, but could be more.
+	setf.sig f7 = r3		C M2  inverse, 8 bits
+(p6)	ldf8	f11 = [r32], 8		C M1  src[1], if size!=1
+	;;
+
+	C 5 cycles
+
+	C f6	divisor
+	C f7	inverse, being calculated
+	C f8	-1, will be -inverse
+	C f9	carry
+	C f10	src[0]
+	C f11	src[1]
+	C f12	-divisor
+	C f13	2
+	C f14	scratch
+
+	xmpy.l	f14 = f13, f7		C 2*i
+	xmpy.l	f7 = f7, f7		C i*i
+	;;
+	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 16 bits
+	;;
+
+	xmpy.l	f14 = f13, f7		C 2*i
+	xmpy.l	f7 = f7, f7		C i*i
+	;;
+	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 32 bits
+	;;
+
+	xmpy.l	f14 = f13, f7		C 2*i
+	xmpy.l	f7 = f7, f7		C i*i
+	;;
+
+	xma.l	f7 = f7, f12, f14	C i*i*-d + 2*i, inverse 64 bits
+	xma.l	f10 = f9, f8, f10	C sc = c * -1 + src[0]
+	;;
+ASSERT(p6, `
+	xmpy.l	f15 = f6, f7 ;;	C divisor*inverse
+	getf.sig r31 = f15 ;;
+	cmp.eq	p6,p0 = 1, r31	C should == 1
+')
+
+	xmpy.l	f10 = f10, f7		C q = sc * inverse
+	xmpy.l	f8 = f7, f8		C -inverse = inverse * -1
+	br.cloop.sptk.few.clr .Lentry	C main loop, if size > 1
+	;;
+
+	C size==1, finish up now
+	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
+	mov	ar.lc = r2		C I0
+	;;
+	getf.sig r8 = f9		C M2  return c
+	br.ret.sptk.many b0
+
+
+
+.Ltop:
+	C r2	saved ar.lc
+	C f6	divisor
+	C f7	inverse
+	C f8	-inverse
+	C f9	carry
+	C f10	src[i] * inverse
+	C f11	scratch src[i+1]
+
+	add	r16 = 160, r32
+	ldf8	f11 = [r32], 8		C src[i+1]
+	;;
+	C 2 cycles
+
+	lfetch	[r16]
+	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
+	;;
+	C 3 cycles
+
+.Lentry:
+	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
+	xmpy.l	f10 = f11, f7		C si = src[i] * inverse
+	br.cloop.sptk.few.clr .Ltop
+	;;
+
+
+
+	xma.l	f10 = f9, f8, f10	C q = c * -inverse + si
+	mov	ar.lc = r2		C I0
+	;;
+	xma.hu	f9 = f10, f6, f9	C c = high(q * divisor + c)
+	;;
+	getf.sig r8 = f9		C M2  return c
+	br.ret.sptk.many b0
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/mul_1.asm b/third_party/gmp/mpn/ia64/mul_1.asm
new file mode 100644
index 0000000..21bf6d0
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mul_1.asm
@@ -0,0 +1,584 @@
+dnl  IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
+dnl  store the result in a second limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4.0
+C Itanium 2:  2.0
+
+C TODO
+C  * Further optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Use 1 c/l carry propagation scheme in wind-down code.
+C  * Use extra pointer register for `up' to speed up feed-in loads.
+C  * Work out final differences with addmul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n', `r34')
+define(`vl', `r35')
+define(`cy', `r36')	C for mpn_mul_1c
+
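+C As a reference (our illustration, not GMP code; assumes unsigned
+C __int128), the contract of both entry points: rp gets the n low limbs
+C of {up,n} * vl (+ cy for mpn_mul_1c), and the high limb is returned.
+C
+C	uint64_t mul_1_ref (uint64_t *rp, const uint64_t *up,
+C	                    long n, uint64_t vl, uint64_t cy)
+C	{
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
+C	      rp[i] = (uint64_t) p;
+C	      cy = (uint64_t) (p >> 64);
+C	    }
+C	  return cy;	/* mpn_mul_1 is the cy == 0 case */
+C	}
+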
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mfi
+	adds		r15 = -1, n		C M I
+	mov		f9 = f0			C F
+	mov.i		r2 = ar.lc		C I0
+}
+{.mmi
+	ldf8		f7 = [up], 8		C M
+	nop.m		0			C M
+	and		r14 = 3, n		C M I
+	;;
+}
+.Lcommon:
+{.mii
+	setf.sig	f6 = vl			C M2 M3
+	shr.u		r31 = r15, 2		C I0
+	cmp.eq		p10, p0 = 0, r14	C M I
+}
+{.mii
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	nop.i		0			C I
+	;;
+}
+{.mii
+	cmp.ne		p6, p7 = r0, r0		C M I
+	mov.i		ar.lc = r31		C I0
+	cmp.ne		p8, p9 = r0, r0		C M I
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	mov		r20 = 0
+	br.cloop.dptk	.grt1			C B
+
+	xma.l		f39 = f7, f6, f9	C F
+	xma.hu		f43 = f7, f6, f9	C F
+	;;
+	getf.sig	r8 = f43		C M2
+	stf8		[rp] = f39		C M2 M3
+	mov.i		ar.lc = r2		C I0
+	br.ret.sptk.many b0			C B
+
+.grt1:
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f33 = [up], 8
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f7, f6, f9
+	xma.hu		f43 = f7, f6, f9
+	;;
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt5
+
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	stf8		[rp] = f39, 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	getf.sig	r16 = f38
+	br		.Lcj5
+
+.grt5:
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r17 = f39
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	;;
+	getf.sig	r18 = f36
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	;;
+	getf.sig	r19 = f37
+	xma.hu		f43 = f35, f6, f0
+	br		.LL01
+
+
+.Lb10:	ldf8		f35 = [up], 8
+	mov		r23 = 0
+	br.cloop.dptk	.grt2
+
+	xma.l		f38 = f7, f6, f9
+	xma.hu		f42 = f7, f6, f9
+	;;
+	stf8		[rp] = f38, 8
+	xma.l		f39 = f35, f6, f42
+	xma.hu		f43 = f35, f6, f42
+	;;
+	getf.sig	r8 = f43
+	stf8		[rp] = f39
+	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+
+
+.grt2:
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f7, f6, f9
+	xma.hu		f42 = f7, f6, f9
+	;;
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt6
+
+	stf8		[rp] = f38, 8
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r20 = f42
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	br		.Lcj6
+
+.grt6:
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r20 = f42
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	;;
+	getf.sig	r17 = f39
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r21 = f43
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	;;
+	getf.sig	r18 = f36
+	xma.hu		f42 = f34, f6, f0
+	br		.LL10
+
+
+.Lb11:	ldf8		f34 = [up], 8
+	mov		r22 = 0
+	;;
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt3
+	;;
+
+	xma.l		f37 = f7, f6, f9
+	xma.hu		f41 = f7, f6, f9
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	stf8		[rp] = f37, 8
+	getf.sig	r16 = f38
+	getf.sig	r20 = f42
+	getf.sig	r17 = f39
+	getf.sig	r8 = f43
+	br		.Lcj3
+
+.grt3:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f7, f6, f9
+	xma.hu		f41 = f7, f6, f9
+	;;
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r19 = f37
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt7
+
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	getf.sig	r20 = f42
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+	getf.sig	r21 = f43
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r18 = f36
+	st8		[rp] = r19, 8
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	br		.Lcj7
+
+.grt7:
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r20 = f42
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	;;
+	getf.sig	r17 = f39
+	xma.hu		f41 = f33, f6, f0
+	br		.LL11
+
+
+.Lb00:	ldf8		f33 = [up], 8
+	mov		r21 = 0
+	;;
+	ldf8		f34 = [up], 8
+	;;
+	ldf8		f35 = [up], 8
+	xma.l		f36 = f7, f6, f9
+	xma.hu		f40 = f7, f6, f9
+	br.cloop.dptk	.grt4
+
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	stf8		[rp] = f36, 8
+	xma.l		f39 = f35, f6, f0
+	getf.sig	r19 = f37
+	xma.hu		f43 = f35, f6, f0
+	;;
+	getf.sig	r23 = f41
+	getf.sig	r16 = f38
+	getf.sig	r20 = f42
+	getf.sig	r17 = f39
+	br		.Lcj4
+
+.grt4:
+	ldf8		f32 = [up], 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	;;
+	getf.sig	r18 = f36
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f0
+	xma.hu		f42 = f34, f6, f0
+	;;
+	getf.sig	r22 = f40
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f0
+	;;
+	getf.sig	r19 = f37
+	getf.sig	r23 = f41
+	xma.hu		f43 = f35, f6, f0
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt8
+
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	getf.sig	r20 = f42
+	xma.hu		f40 = f32, f6, f0
+	;;
+	getf.sig	r17 = f39
+	st8		[rp] = r18, 8
+	xma.l		f37 = f33, f6, f0
+	xma.hu		f41 = f33, f6, f0
+	br		.Lcj8
+
+.grt8:
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+	xma.hu		f40 = f32, f6, f0
+	br		.LL00
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Loop:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r17
+	st8		[rp] = r24, 8
+	xma.hu		f40 = f32, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	;;
+.LL00:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r20 = f42
+   (p8)	add		r24 = r18, r21, 1
+	nop.b		0
+	ldf8		f32 = [up], 8
+   (p9)	add		r24 = r18, r21
+	nop.b		0
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r18
+	st8		[rp] = r24, 8
+	xma.hu		f41 = f33, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r18
+	;;
+.LL11:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r21 = f43
+   (p6)	add		r24 = r19, r22, 1
+	nop.b		0
+	ldf8		f33 = [up], 8
+   (p7)	add		r24 = r19, r22
+	nop.b		0
+	;;
+	.pred.rel "mutex",p6,p7
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r19
+	st8		[rp] = r24, 8
+	xma.hu		f42 = f34, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r19
+	;;
+.LL10:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r22 = f40
+   (p8)	add		r24 = r16, r23, 1
+	nop.b		0
+	ldf8		f34 = [up], 8
+   (p9)	add		r24 = r16, r23
+	nop.b		0
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r16
+	st8		[rp] = r24, 8
+	xma.hu		f43 = f35, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r16
+	;;
+.LL01:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r23 = f41
+   (p6)	add		r24 = r17, r20, 1
+	nop.b		0
+	ldf8		f35 = [up], 8
+   (p7)	add		r24 = r17, r20
+	br.cloop.dptk	.Loop
+C *** MAIN LOOP END ***
+	;;
+
+.Lcj9:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r16 = f38
+	xma.l		f36 = f32, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r17
+	st8		[rp] = r24, 8
+	xma.hu		f40 = f32, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r20 = f42
+   (p8)	add		r24 = r18, r21, 1
+   (p9)	add		r24 = r18, r21
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r17 = f39
+	xma.l		f37 = f33, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r18
+	st8		[rp] = r24, 8
+	xma.hu		f41 = f33, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r18
+	;;
+.Lcj8:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r21 = f43
+   (p6)	add		r24 = r19, r22, 1
+   (p7)	add		r24 = r19, r22
+	;;
+	.pred.rel "mutex",p6,p7
+	getf.sig	r18 = f36
+	xma.l		f38 = f34, f6, f0
+   (p6)	cmp.leu		p8, p9 = r24, r19
+	st8		[rp] = r24, 8
+	xma.hu		f42 = f34, f6, f0
+   (p7)	cmp.ltu		p8, p9 = r24, r19
+	;;
+.Lcj7:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r22 = f40
+   (p8)	add		r24 = r16, r23, 1
+   (p9)	add		r24 = r16, r23
+	;;
+	.pred.rel "mutex",p8,p9
+	getf.sig	r19 = f37
+	xma.l		f39 = f35, f6, f0
+   (p8)	cmp.leu		p6, p7 = r24, r16
+	st8		[rp] = r24, 8
+	xma.hu		f43 = f35, f6, f0
+   (p9)	cmp.ltu		p6, p7 = r24, r16
+	;;
+.Lcj6:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r23 = f41
+   (p6)	add		r24 = r17, r20, 1
+   (p7)	add		r24 = r17, r20
+	;;
+	.pred.rel "mutex",p6,p7
+   (p6)	cmp.leu		p8, p9 = r24, r17
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	getf.sig	r16 = f38
+	st8		[rp] = r24, 8
+	;;
+.Lcj5:
+	.pred.rel "mutex",p8,p9
+	getf.sig	r20 = f42
+   (p8)	add		r24 = r18, r21, 1
+   (p9)	add		r24 = r18, r21
+	;;
+	.pred.rel "mutex",p8,p9
+   (p8)	cmp.leu		p6, p7 = r24, r18
+   (p9)	cmp.ltu		p6, p7 = r24, r18
+	getf.sig	r17 = f39
+	st8		[rp] = r24, 8
+	;;
+.Lcj4:
+	.pred.rel "mutex",p6,p7
+	getf.sig	r8 = f43
+   (p6)	add		r24 = r19, r22, 1
+   (p7)	add		r24 = r19, r22
+	;;
+	.pred.rel "mutex",p6,p7
+	st8		[rp] = r24, 8
+   (p6)	cmp.leu		p8, p9 = r24, r19
+   (p7)	cmp.ltu		p8, p9 = r24, r19
+	;;
+.Lcj3:
+	.pred.rel "mutex",p8,p9
+   (p8)	add		r24 = r16, r23, 1
+   (p9)	add		r24 = r16, r23
+	;;
+	.pred.rel "mutex",p8,p9
+	st8		[rp] = r24, 8
+   (p8)	cmp.leu		p6, p7 = r24, r16
+   (p9)	cmp.ltu		p6, p7 = r24, r16
+	;;
+.Lcj2:
+	.pred.rel "mutex",p6,p7
+   (p6)	add		r24 = r17, r20, 1
+   (p7)	add		r24 = r17, r20
+	;;
+	.pred.rel "mutex",p6,p7
+	st8		[rp] = r24, 8
+   (p6)	cmp.leu		p8, p9 = r24, r17
+   (p7)	cmp.ltu		p8, p9 = r24, r17
+	;;
+   (p8)	add		r8 = 1, r8
+	mov.i		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmi
+	adds		r15 = -1, n		C M I
+	setf.sig	f9 = cy			C M2 M3
+	mov.i		r2 = ar.lc		C I0
+}
+{.mmb
+	ldf8		f7 = [up], 8		C M
+	and		r14 = 3, n		C M I
+	br.sptk		.Lcommon
+	;;
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/mul_2.asm b/third_party/gmp/mpn/ia64/mul_2.asm
new file mode 100644
index 0000000..5343f64
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/mul_2.asm
@@ -0,0 +1,625 @@
+dnl  IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
+dnl  store the result to an (n+1)-limb number.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2004, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    ?
+C Itanium 2:  1.5
+
+C TODO
+C  * Clean up variable names, and try to decrease the number of distinct
+C    registers used.
+C  * Clean up feed-in code to not require zeroing several registers.
+C  * Make sure we don't depend on uninitialized predicate registers.
+C  * Could perhaps save a few cycles by using 1 c/l carry propagation in
+C    wind-down code.
+C  * Ultimately rewrite.  The problem with this code is that it first uses a
+C    loaded u value in one xma pair, then leaves it live over several unrelated
+C    xma pairs, before it uses it again.  It should actually be quite possible
+C    to just swap some aligned xma pairs around.  But we should then schedule
+C    u loads further from the first use.
+
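
[Annotation: mpn_mul_2 is internal to GMP, but its contract (per the header
comment above) can be sketched from the public mpn_mul_1/mpn_addmul_1
primitives.  This is an illustrative reference only, not the code this file
implements; ref_mul_2 is our hypothetical name, and rp must have room for
n+1 limbs.]

    #include <gmp.h>

    /* Sketch of the mpn_mul_2 contract: store the low n+1 limbs of
       {up,n} * {vp,2} at rp, return the most significant limb. */
    static mp_limb_t
    ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
    {
      rp[n] = mpn_mul_1 (rp, up, n, vp[0]);        /* rp[0..n] = up * v0 */
      return mpn_addmul_1 (rp + 1, up, n, vp[1]);  /* += up * v1 * B; top limb out */
    }
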
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`vp',`r35')
+
+define(`srp',`r3')
+
+define(`v0',`f6')
+define(`v1',`f7')
+
+define(`s0',`r14')
+define(`acc0',`r15')
+
+define(`pr0_0',`r16') define(`pr0_1',`r17')
+define(`pr0_2',`r18') define(`pr0_3',`r19')
+
+define(`pr1_0',`r20') define(`pr1_1',`r21')
+define(`pr1_2',`r22') define(`pr1_3',`r23')
+
+define(`acc1_0',`r24') define(`acc1_1',`r25')
+define(`acc1_2',`r26') define(`acc1_3',`r27')
+
+dnl define(`',`r28')
+dnl define(`',`r29')
+dnl define(`',`r30')
+dnl define(`',`r31')
+
+define(`fp0b_0',`f8') define(`fp0b_1',`f9')
+define(`fp0b_2',`f10') define(`fp0b_3',`f11')
+
+define(`fp1a_0',`f12') define(`fp1a_1',`f13')
+define(`fp1a_2',`f14') define(`fp1a_3',`f15')
+
+define(`fp1b_0',`f32') define(`fp1b_1',`f33')
+define(`fp1b_2',`f34') define(`fp1b_3',`f35')
+
+define(`fp2a_0',`f36') define(`fp2a_1',`f37')
+define(`fp2a_2',`f38') define(`fp2a_3',`f39')
+
+define(`u_0',`f44') define(`u_1',`f45')
+define(`u_2',`f46') define(`u_3',`f47')
+
+define(`ux',`f49')
+define(`uy',`f51')
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',`
+ {.mmi;		addp4	rp = 0, rp		C			M I
+		addp4	up = 0, up		C			M I
+		addp4	vp = 0, vp		C			M I
+}{.mmi;		nop	1
+		nop	1
+		zxt4	n = n			C			I
+	;;
+}')
+
+ {.mmi;		ldf8	ux = [up], 8		C			M
+		ldf8	v0 = [vp], 8		C			M
+		mov	r2 = ar.lc		C			I0
+}{.mmi;		nop	1			C			M
+		and	r14 = 3, n		C			M I
+		add	n = -2, n		C			M I
+	;;
+}{.mmi;		ldf8	uy = [up], 8		C			M
+		ldf8	v1 = [vp]		C			M
+		shr.u	n = n, 2		C			I0
+}{.mmi;		nop	1			C			M
+		cmp.eq	p10, p0 = 1, r14	C			M I
+		cmp.eq	p11, p0 = 2, r14	C			M I
+	;;
+}{.mmi;		nop	1			C			M
+		cmp.eq	p12, p0 = 3, r14	C			M I
+		mov	ar.lc = n		C			I0
+}{.bbb;	(p10)	br.dptk	L(b01)			C			B
+	(p11)	br.dptk	L(b10)			C			B
+	(p12)	br.dptk	L(b11)			C			B
+	;;
+}
+	ALIGN(32)
+L(b00):		ldf8	u_1 = [up], 8
+		mov	acc1_2 = 0
+		mov	pr1_2 = 0
+		mov	pr0_3 = 0
+		cmp.ne	p8, p9 = r0, r0
+	;;
+		xma.l	fp0b_3 = ux, v0, f0
+		cmp.ne	p12, p13 = r0, r0
+		ldf8	u_2 = [up], 8
+		xma.hu	fp1a_3 = ux, v0, f0
+		br.cloop.dptk	L(gt4)
+
+		xma.l	fp0b_0 = uy, v0, f0
+		xma.hu	fp1a_0 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_3
+		xma.l	fp1b_3 = ux, v1, fp1a_3
+		xma.hu	fp2a_3 = ux, v1, fp1a_3
+	;;
+		xma.l	fp0b_1 = u_1, v0, f0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		getfsig	pr1_3 = fp1b_3
+		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, f0
+		xma.hu	fp1a_2 = u_2, v0, f0
+		br	L(cj4)
+
+L(gt4):		xma.l	fp0b_0 = uy, v0, f0
+		xma.hu	fp1a_0 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_3
+		xma.l	fp1b_3 = ux, v1, fp1a_3
+		ldf8	u_3 = [up], 8
+		xma.hu	fp2a_3 = ux, v1, fp1a_3
+	;;
+		xma.l	fp0b_1 = u_1, v0, f0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = uy, v1, fp1a_0
+		xma.hu	fp2a_0 = uy, v1, fp1a_0
+	;;
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+		xma.l	fp0b_2 = u_2, v0, f0
+	;;
+		getfsig	acc1_3 = fp2a_3
+		xma.hu	fp1a_2 = u_2, v0, f0
+		br	L(00)
+
+
+	ALIGN(32)
+L(b01):		ldf8	u_0 = [up], 8		C M
+		mov	acc1_1 = 0		C M I
+		mov	pr1_1 = 0		C M I
+		mov	pr0_2 = 0		C M I
+		cmp.ne	p6, p7 = r0, r0		C M I
+	;;
+		xma.l	fp0b_2 = ux, v0, f0	C F
+		cmp.ne	p10, p11 = r0, r0	C M I
+		ldf8	u_1 = [up], 8		C M
+		xma.hu	fp1a_2 = ux, v0, f0	C F
+	;;
+		xma.l	fp0b_3 = uy, v0, f0	C F
+		xma.hu	fp1a_3 = uy, v0, f0	C F
+	;;
+		getfsig	acc0 = fp0b_2		C M
+		xma.l	fp1b_2 = ux, v1,fp1a_2	C F
+		ldf8	u_2 = [up], 8		C M
+		xma.hu	fp2a_2 = ux, v1,fp1a_2	C F
+		br.cloop.dptk	L(gt5)
+
+		xma.l	fp0b_0 = u_0, v0, f0	C F
+		xma.hu	fp1a_0 = u_0, v0, f0	C F
+	;;
+		getfsig	pr0_3 = fp0b_3		C M
+		xma.l	fp1b_3 = uy, v1,fp1a_3	C F
+		xma.hu	fp2a_3 = uy, v1,fp1a_3	C F
+	;;
+		getfsig	pr1_2 = fp1b_2		C M
+		getfsig	acc1_2 = fp2a_2		C M
+		xma.l	fp0b_1 = u_1, v0, f0	C F
+		xma.hu	fp1a_1 = u_1, v0, f0	C F
+		br	L(cj5)
+
+L(gt5):		xma.l	fp0b_0 = u_0, v0, f0
+		xma.hu	fp1a_0 = u_0, v0, f0
+	;;
+		getfsig	pr0_3 = fp0b_3
+		xma.l	fp1b_3 = uy, v1, fp1a_3
+		xma.hu	fp2a_3 = uy, v1, fp1a_3
+	;;
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+		xma.l	fp0b_1 = u_1, v0, f0
+	;;
+		getfsig	acc1_2 = fp2a_2
+		xma.hu	fp1a_1 = u_1, v0, f0
+		br	L(01)
+
+
+	ALIGN(32)
+L(b10):		br.cloop.dptk	L(gt2)
+		xma.l	fp0b_1 = ux, v0, f0
+		xma.hu	fp1a_1 = ux, v0, f0
+	;;
+		xma.l	fp0b_2 = uy, v0, f0
+		xma.hu	fp1a_2 = uy, v0, f0
+	;;
+		stf8	[rp] = fp0b_1, 8
+		xma.l	fp1b_1 = ux, v1, fp1a_1
+		xma.hu	fp2a_1 = ux, v1, fp1a_1
+	;;
+		getfsig	acc0 = fp0b_2
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		getfsig	pr1_1 = fp1b_1
+		getfsig	acc1_1 = fp2a_1
+		mov	ar.lc = r2
+		getfsig	pr1_2 = fp1b_2
+		getfsig	r8 = fp2a_2
+	;;
+		add	s0 = pr1_1, acc0
+	;;
+		st8	[rp] = s0, 8
+		cmp.ltu	p8, p9 = s0, pr1_1
+		sub	r31 = -1, acc1_1
+	;;
+	.pred.rel "mutex", p8, p9
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	(p8)	cmp.leu	p10, p0 = r31, pr1_2
+	(p9)	cmp.ltu	p10, p0 = r31, pr1_2
+	;;
+		st8	[rp] = acc0, 8
+	(p10)	add	r8 = 1, r8
+		br.ret.sptk.many b0
+
+L(gt2):		ldf8	u_3 = [up], 8
+		mov	acc1_0 = 0
+		mov	pr1_0 = 0
+	;;
+		mov	pr0_1 = 0
+		xma.l	fp0b_1 = ux, v0, f0
+		ldf8	u_0 = [up], 8
+		xma.hu	fp1a_1 = ux, v0, f0
+	;;
+		xma.l	fp0b_2 = uy, v0, f0
+		xma.hu	fp1a_2 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_1
+		xma.l	fp1b_1 = ux, v1, fp1a_1
+		xma.hu	fp2a_1 = ux, v1, fp1a_1
+	;;
+		ldf8	u_1 = [up], 8
+		xma.l	fp0b_3 = u_3, v0, f0
+		xma.hu	fp1a_3 = u_3, v0, f0
+	;;
+		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = uy, v1, fp1a_2
+		xma.hu	fp2a_2 = uy, v1, fp1a_2
+	;;
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	;;
+ {.mfi;		getfsig	acc1_1 = fp2a_1
+		xma.l	fp0b_0 = u_0, v0, f0
+		cmp.ne	p8, p9 = r0, r0
+}{.mfb;		cmp.ne	p12, p13 = r0, r0
+		xma.hu	fp1a_0 = u_0, v0, f0
+		br	L(10)
+}
+
+	ALIGN(32)
+L(b11):		mov	acc1_3 = 0
+		mov	pr1_3 = 0
+		mov	pr0_0 = 0
+		ldf8	u_2 = [up], 8
+		cmp.ne	p6, p7 = r0, r0
+		br.cloop.dptk	L(gt3)
+	;;
+		xma.l	fp0b_0 = ux, v0, f0
+		xma.hu	fp1a_0 = ux, v0, f0
+	;;
+		cmp.ne	p10, p11 = r0, r0
+		xma.l	fp0b_1 = uy, v0, f0
+		xma.hu	fp1a_1 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_0
+		xma.l	fp1b_0 = ux, v1, fp1a_0
+		xma.hu	fp2a_0 = ux, v1, fp1a_0
+	;;
+		xma.l	fp0b_2 = u_2, v0, f0
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = uy, v1, fp1a_1
+		xma.hu	fp2a_1 = uy, v1, fp1a_1
+	;;
+		getfsig	pr1_0 = fp1b_0
+		getfsig	acc1_0 = fp2a_0
+		br	L(cj3)
+
+L(gt3):		xma.l	fp0b_0 = ux, v0, f0
+		cmp.ne	p10, p11 = r0, r0
+		ldf8	u_3 = [up], 8
+		xma.hu	fp1a_0 = ux, v0, f0
+	;;
+		xma.l	fp0b_1 = uy, v0, f0
+		xma.hu	fp1a_1 = uy, v0, f0
+	;;
+		getfsig	acc0 = fp0b_0
+		xma.l	fp1b_0 = ux, v1, fp1a_0
+		ldf8	u_0 = [up], 8
+		xma.hu	fp2a_0 = ux, v1, fp1a_0
+	;;
+		xma.l	fp0b_2 = u_2, v0, f0
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = uy, v1, fp1a_1
+		xma.hu	fp2a_1 = uy, v1, fp1a_1
+	;;
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	;;
+		getfsig	acc1_0 = fp2a_0
+		xma.l	fp0b_3 = u_3, v0, f0
+		xma.hu	fp1a_3 = u_3, v0, f0
+		br	L(11)
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):						C 00
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_3 = [up], 8
+		getfsig	pr1_2 = fp1b_2
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;					C 01
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_2 = fp2a_2
+		st8	[rp] = s0, 8
+		xma.l	fp0b_1 = u_1, v0, f0
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;					C 02
+L(01):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+		nop	1
+	;;					C 03
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_0 = [up], 8
+		getfsig	pr1_3 = fp1b_3
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;					C 04
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_3 = fp2a_3
+		st8	[rp] = s0, 8
+		xma.l	fp0b_2 = u_2, v0, f0
+	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;					C 05
+L(00):
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+		nop	1
+	;;					C 06
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+		ldf8	u_1 = [up], 8
+		getfsig	pr1_0 = fp1b_0
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;					C 07
+	.pred.rel "mutex", p6, p7
+		getfsig	acc1_0 = fp2a_0
+		st8	[rp] = s0, 8
+		xma.l	fp0b_3 = u_3, v0, f0
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+		xma.hu	fp1a_3 = u_3, v0, f0
+	;;					C 08
+L(11):
+	.pred.rel "mutex", p10, p11
+		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+		nop	1
+	;;					C 09
+	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+		ldf8	u_2 = [up], 8
+		getfsig	pr1_1 = fp1b_1
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;					C 10
+	.pred.rel "mutex", p8, p9
+		getfsig	acc1_1 = fp2a_1
+		st8	[rp] = s0, 8
+		xma.l	fp0b_0 = u_0, v0, f0
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+		xma.hu	fp1a_0 = u_0, v0, f0
+	;;					C 11
+L(10):
+	.pred.rel "mutex", p12, p13
+		getfsig	pr0_3 = fp0b_3
+		xma.l	fp1b_3 = u_3, v1, fp1a_3
+	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		xma.hu	fp2a_3 = u_3, v1, fp1a_3
+		br.cloop.dptk	L(top)
+	;;
+C *** MAIN LOOP END ***
+
+	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mfi;		getfsig	acc1_2 = fp2a_2
+		xma.l	fp0b_1 = u_1, v0, f0
+		nop	1
+}{.mmf;	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+		xma.hu	fp1a_1 = u_1, v0, f0
+	;;
+}
+L(cj5):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_0 = fp0b_0
+		xma.l	fp1b_0 = u_0, v1, fp1a_0
+	(p10)	add	s0 = pr1_1, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_1, acc0
+		xma.hu	fp2a_0 = u_0, v1, fp1a_0
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_3 = fp1b_3
+		st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mfi;		getfsig	acc1_3 = fp2a_3
+		xma.l	fp0b_2 = u_2, v0, f0
+		nop	1
+}{.mmf;	(p8)	add	acc0 = pr0_3, acc1_1, 1
+	(p9)	add	acc0 = pr0_3, acc1_1
+		xma.hu	fp1a_2 = u_2, v0, f0
+	;;
+}
+L(cj4):
+	.pred.rel "mutex", p12, p13
+ {.mfi;		getfsig	pr0_1 = fp0b_1
+		xma.l	fp1b_1 = u_1, v1, fp1a_1
+	(p12)	add	s0 = pr1_2, acc0, 1
+}{.mfi;	(p13)	add	s0 = pr1_2, acc0
+		xma.hu	fp2a_1 = u_1, v1, fp1a_1
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_0 = fp1b_0
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_3
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_3
+	(p12)	cmp.leu	p10, p11 = s0, pr1_2
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_2
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	acc1_0 = fp2a_0
+	(p6)	add	acc0 = pr0_0, acc1_2, 1
+	(p7)	add	acc0 = pr0_0, acc1_2
+	;;
+}
+L(cj3):
+	.pred.rel "mutex", p10, p11
+ {.mfi;		getfsig	pr0_2 = fp0b_2
+		xma.l	fp1b_2 = u_2, v1, fp1a_2
+	(p10)	add	s0 = pr1_3, acc0, 1
+}{.mfi;	(p11)	add	s0 = pr1_3, acc0
+		xma.hu	fp2a_2 = u_2, v1, fp1a_2
+		nop	1
+	;;
+}	.pred.rel "mutex", p6, p7
+	.pred.rel "mutex", p10, p11
+ {.mmi;		getfsig	pr1_1 = fp1b_1
+		st8	[rp] = s0, 8
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_0
+}{.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_0
+	(p10)	cmp.leu	p12, p13 = s0, pr1_3
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_3
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		getfsig	acc1_1 = fp2a_1
+	(p8)	add	acc0 = pr0_1, acc1_3, 1
+	(p9)	add	acc0 = pr0_1, acc1_3
+	;;
+}	.pred.rel "mutex", p12, p13
+ {.mmi;	(p12)	add	s0 = pr1_0, acc0, 1
+	(p13)	add	s0 = pr1_0, acc0
+		nop	1
+	;;
+}	.pred.rel "mutex", p8, p9
+	.pred.rel "mutex", p12, p13
+ {.mmi;		getfsig	pr1_2 = fp1b_2
+		st8	[rp] = s0, 8
+	(p8)	cmp.leu	p6, p7 = acc0, pr0_1
+}{.mmi;	(p9)	cmp.ltu	p6, p7 = acc0, pr0_1
+	(p12)	cmp.leu	p10, p11 = s0, pr1_0
+	(p13)	cmp.ltu	p10, p11 = s0, pr1_0
+	;;
+}	.pred.rel "mutex", p6, p7
+ {.mmi;		getfsig	r8 = fp2a_2
+	(p6)	add	acc0 = pr0_2, acc1_0, 1
+	(p7)	add	acc0 = pr0_2, acc1_0
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p10)	add	s0 = pr1_1, acc0, 1
+	(p11)	add	s0 = pr1_1, acc0
+	(p6)	cmp.leu	p8, p9 = acc0, pr0_2
+	;;
+}	.pred.rel "mutex", p10, p11
+ {.mmi;	(p7)	cmp.ltu	p8, p9 = acc0, pr0_2
+	(p10)	cmp.leu	p12, p13 = s0, pr1_1
+	(p11)	cmp.ltu	p12, p13 = s0, pr1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;		st8	[rp] = s0, 8
+	(p8)	add	acc0 = pr1_2, acc1_1, 1
+	(p9)	add	acc0 = pr1_2, acc1_1
+	;;
+}	.pred.rel "mutex", p8, p9
+ {.mmi;	(p8)	cmp.leu	p10, p11 = acc0, pr1_2
+	(p9)	cmp.ltu	p10, p11 = acc0, pr1_2
+	(p12)	add	acc0 = 1, acc0
+	;;
+}{.mmi;		st8	[rp] = acc0, 8
+	(p12)	cmpeqor	p10, p0 = 0, acc0
+		nop	1
+	;;
+}{.mib;	(p10)	add	r8 = 1, r8
+		mov	ar.lc = r2
+		br.ret.sptk.many b0
+}
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/popcount.asm b/third_party/gmp/mpn/ia64/popcount.asm
new file mode 100644
index 0000000..c0b5c5c
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/popcount.asm
@@ -0,0 +1,200 @@
+dnl  IA-64 mpn_popcount -- mpn population count.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       1.5
+C Itanium 2:     1
+
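
[Annotation: as a semantic cross-check, not this file's algorithm: the
routine returns the total number of set bits in {up,n}.  The builtin below
is a GCC/Clang extension assumed to match a 64-bit nail-free limb.]

    #include <gmp.h>

    /* Sketch of the mpn_popcount contract. */
    static mp_bitcnt_t
    ref_popcount (mp_srcptr up, mp_size_t n)
    {
      mp_bitcnt_t s = 0;
      for (mp_size_t i = 0; i < n; i++)
        s += __builtin_popcountll (up[i]);  /* assumes 64-bit limbs */
      return s;
    }
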
+C INPUT PARAMETERS
+define(`up', `r32')
+define(`n', `r33')
+
+define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
+define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
+define(`s',`r8')
+
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+	.prologue
+ifdef(`HAVE_ABI_32',
+`	addp4		up = 0, up		C			M I
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+
+ {.mmi;	add		r9 = 512, up		C prefetch pointer	M I
+	ld8		r10 = [up], 8		C load first limb	M01
+	mov.i		r2 = ar.lc		C save ar.lc		I0
+}{.mmi;	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p14 = 4, n		C small count?		M I
+	add		n = -5, n		C			M I
+	;;
+}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk		.Lb01			C			B
+  (p7)	br.dptk		.Lb10			C			B
+  (p8)	br.dptk		.Lb11			C			B
+}
+
+
+.Lb00:	ld8		u1 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = r10		C			I0
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u3 = [up], 8		C			M01
+	popcnt		c1 = u1			C			I0
+  (p15)	br.cond.dptk	.grt4			C			B
+	;;
+	nop.m	0				C			-
+	nop.m	0				C			-
+	popcnt		c2 = u2			C			I0
+	;;
+	mov		s = c0			C			M I
+	popcnt		c3 = u3			C			I0
+	br		.Lcj4			C			B
+
+.grt4:	ld8		u0 = [up], 8		C			M01
+	popcnt		c2 = u2			C			I0
+	br		.LL00			C			B
+
+
+.Lb01:
+	popcnt		s = r10			C			I0
+  (p14)	br.ret.sptk.many b0			C			B
+
+.grt1:	ld8		u0 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = u0			C			I0
+	mov		c3 = 0			C			I0
+
+	;;
+	ld8		u3 = [up], 8		C			M01
+	popcnt		c1 = u1			C			I0
+	br.cloop.dptk	.Loop			C			B
+	br		.Lend			C			B
+
+
+.Lb10:	ld8		u3 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+  (p15)	br.cond.dptk	.grt2			C			B
+
+	popcnt		s = r10			C			I0
+	;;
+	popcnt		c3 = u3			C			I0
+	br		.Lcj2			C			B
+
+.grt2:	ld8		u0 = [up], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	popcnt		c2 = r10		C			I0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	popcnt		c3 = u3			C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = u0			C			I0
+	br		.LL10			C			B
+
+
+.Lb11:	ld8		u2 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	mov		s = 0			C			M I
+	;;
+	ld8		u3 = [up], 8		C			M01
+	popcnt		s = r10			C			I0
+  (p15)	br.cond.dptk	.grt3			C			B
+
+	popcnt		c2 = u2			C			I0
+	;;
+	popcnt		c3 = u3			C			I0
+	br		.Lcj3			C			B
+
+.grt3:	ld8		u0 = [up], 8		C			M01
+	popcnt		c2 = u2			C			I0
+	mov.i		ar.lc = n		C			I0
+	mov		c1 = 0
+	;;
+	ld8		u1 = [up], 8		C			M01
+	popcnt		c3 = u3			C			I0
+	br		.LL11			C			B
+
+
+.Loop:	ld8		u0 = [up], 8		C			M01
+	popcnt		c2 = u2			C			I0
+	add		s = s, c3		C			M I
+	;;
+.LL00:	ld8		u1 = [up], 8		C			M01
+	popcnt		c3 = u3			C			I0
+	add		s = s, c0		C			M I
+	;;
+.LL11:	ld8		u2 = [up], 8		C			M01
+	popcnt		c0 = u0			C			I0
+	add		s = s, c1		C			M I
+	;;
+.LL10:	ld8		u3 = [up], 8		C			M01
+	popcnt		c1 = u1			C			I0
+	add		s = s, c2		C			M I
+	lfetch		[r9], 32		C			M01
+	nop.m		0			C			-
+	br.cloop.dptk	.Loop			C			B
+	;;
+
+.Lend:	popcnt		c2 = u2			C			I0
+	add		s = s, c3		C			M I
+	;;
+	popcnt		c3 = u3			C			I0
+	add		s = s, c0		C			M I
+	;;
+.Lcj4:	add		s = s, c1		C			M I
+	;;
+.Lcj3:	add		s = s, c2		C			M I
+	;;
+.Lcj2:	add		s = s, c3		C			M I
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/ia64/rsh1aors_n.asm b/third_party/gmp/mpn/ia64/rsh1aors_n.asm
new file mode 100644
index 0000000..3c7defb
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/rsh1aors_n.asm
@@ -0,0 +1,447 @@
+dnl  IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    2.5
+C Itanium 2:  1.5
+
+C TODO
+C  * Rewrite function entry code using aorslsh1_n.asm style.
+C  * Micro-optimize feed-in and wind-down code.
+
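
[Annotation: the intended result, sketched with public mpn calls for the add
case (rsh1sub is analogous with borrow): the carry out of the n-limb
addition becomes the new top bit, and the bit shifted out is returned.  A
rough reference assuming nail-free limbs; ref_rsh1add_n is our name.]

    #include <gmp.h>

    /* Sketch: rp[] = (up[] + vp[]) >> 1, returning the shifted-out bit. */
    static mp_limb_t
    ref_rsh1add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
    {
      mp_limb_t cy = mpn_add_n (rp, up, vp, n);   /* n-limb sum, carry cy */
      mp_limb_t lo = rp[0] & 1;                   /* bit that falls off */
      mpn_rshift (rp, rp, n, 1);                  /* in-place shift is allowed */
      rp[n - 1] |= cy << (GMP_NUMB_BITS - 1);     /* carry -> new top bit */
      return lo;
    }
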
+C INPUT PARAMETERS
+define(`rp',`r32')
+define(`up',`r33')
+define(`vp',`r34')
+define(`n',`r35')
+
+ifdef(`OPERATION_rsh1add_n',`
+  define(ADDSUB,       add)
+  define(PRED,	       ltu)
+  define(INCR,	       1)
+  define(LIM,	       -1)
+  define(func, mpn_rsh1add_n)
+')
+ifdef(`OPERATION_rsh1sub_n',`
+  define(ADDSUB,       sub)
+  define(PRED,	       gtu)
+  define(INCR,	       -1)
+  define(LIM,	       0)
+  define(func, mpn_rsh1sub_n)
+')
+
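
[Annotation: the four m4 parameters above encode one carry scheme for both
variants: PRED detects a carry/borrow from the plain ADDSUB, and an incoming
carry is folded in afterwards, with LIM catching the one limb value where
that extra INCR itself wraps.  A plain-C illustration of the add case, with
our own names; it mirrors the "(px) cmp.eq.or ... = LIM, w" /
"(px) add w = INCR, w" pairs in the code below.]

    /* One limb step of the ADDSUB/PRED/LIM/INCR scheme, add flavour. */
    static unsigned long
    add_step (unsigned long u, unsigned long v, int *cy)
    {
      unsigned long w = u + v;                 /* ADDSUB */
      int c = w < u;                           /* PRED = ltu */
      if (*cy)
        {
          c |= (w == (unsigned long) -1);      /* LIM = -1: the +1 below wraps */
          w += 1;                              /* INCR = 1 */
        }
      *cy = c;
      return w;
    }
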
+C Some useful aliases for registers we use
+define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
+define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
+define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
+define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+	addp4		rp = 0, rp		C			M I
+	addp4		up = 0, up		C			M I
+	addp4		vp = 0, vp		C			M I
+	nop.m		0
+	nop.m		0
+	zxt4		n = n			C			I
+	;;
+')
+ {.mmi;	ld8		r11 = [vp], 8		C			M01
+	ld8		r10 = [up], 8		C			M01
+	mov.i		r2 = ar.lc		C			I0
+}{.mmi;	and		r14 = 3, n		C			M I
+	cmp.lt		p15, p0 = 4, n		C			M I
+	add		n = -4, n		C			M I
+	;;
+}{.mmi;	cmp.eq		p6, p0 = 1, r14		C			M I
+	cmp.eq		p7, p0 = 2, r14		C			M I
+	cmp.eq		p8, p0 = 3, r14		C			M I
+}{.bbb
+  (p6)	br.dptk		.Lb01			C			B
+  (p7)	br.dptk		.Lb10			C			B
+  (p8)	br.dptk		.Lb11			C			B
+}
+
+.Lb00:	ld8		v0 = [vp], 8		C			M01
+	ld8		u0 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	ld8		u1 = [up], 8		C			M01
+	ADDSUB		w3 = r10, r11		C			M I
+	;;
+	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+  (p15)	br.dpnt		.grt4			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, r10	C			M I
+	and		r8 = 1, w3		C			M I
+	ADDSUB		w0 = u0, v0		C			M I
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+	shrp		x3 = w0, w3, 1		C			I0
+	ADDSUB		w2 = u2, v2		C			M I
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	br		.Lcj4			C			B
+
+.grt4:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, r10	C			M I
+	ld8		u3 = [up], 8		C			M01
+	and		r8 = 1, w3		C			M I
+	;;
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+	add		n = -1, n
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+	shrp		x3 = w0, w3, 1		C			I0
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	br		.LL00			C			B
+
+
+.Lb01:	ADDSUB		w2 = r10, r11		C			M I
+	shr.u		n = n, 2		C			I0
+  (p15)	br.dpnt		.grt1			C			B
+	;;
+
+	cmp.PRED	p6, p7 = w2, r10	C			M I
+	shr.u		x2 = w2, 1		C			I0
+	and		r8 = 1, w2		C			M I
+	;;
+   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
+	br		.Lcj1			C			B
+
+.grt1:	ld8		v3 = [vp], 8		C			M01
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ld8		v0 = [vp], 8		C			M01
+	ld8		u0 = [up], 8		C			M01
+	mov.i		ar.lc = n		C FIXME swap with next	I0
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	ld8		u1 = [up], 8		C			M01
+	;;
+	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+	cmp.PRED	p6, p0 = w2, r10	C			M I
+	and		r8 = 1, w2		C			M I
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.grt5			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	;;
+	ADDSUB		w0 = u0, v0		C			M I
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	br		.Lcj5			C			B
+
+.grt5:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	br		.LL01			C			B
+
+
+.Lb10:	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	ADDSUB		w1 = r10, r11		C			M I
+  (p15)	br.dpnt		.grt2			C			B
+	;;
+
+	cmp.PRED	p9, p0 = w1, r10	C			M I
+	and		r8 = 1, w1		C			M I
+	ADDSUB		w2 = u2, v2		C			M I
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	;;
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+	shrp		x1 = w2, w1, 1		C			I0
+	shr.u		x2 = w2, 1		C			I0
+	br		.Lcj2			C			B
+
+.grt2:	ld8		v3 = [vp], 8		C			M01
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ld8		v0 = [vp], 8		C			M01
+	ld8		u0 = [up], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, r10	C			M I
+	ld8		u1 = [up], 8		C			M01
+	and		r8 = 1, w1		C			M I
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.grt6			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	br		.Lcj6			C			B
+
+.grt6:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	br		.LL10			C			B
+
+
+.Lb11:	ld8		v1 = [vp], 8		C			M01
+	ld8		u1 = [up], 8		C			M01
+	shr.u		n = n, 2		C			I0
+	;;
+	ld8		v2 = [vp], 8		C			M01
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w0 = r10, r11		C			M I
+  (p15)	br.dpnt		.grt3			C			B
+	;;
+
+	cmp.PRED	p8, p0 = w0, r10	C			M I
+	ADDSUB		w1 = u1, v1		C			M I
+	and		r8 = 1, w0		C			M I
+	;;
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	;;
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	br		.Lcj3			C			B
+
+.grt3:	ld8		v3 = [vp], 8		C			M01
+	ld8		u3 = [up], 8		C			M01
+	;;
+	ld8		v0 = [vp], 8		C			M01
+	mov.i		ar.lc = n		C			I0
+	cmp.PRED	p8, p0 = w0, r10	C			M I
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	and		r8 = 1, w0		C			M I
+	;;
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+	;;
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.grt7			C			B
+	;;
+
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	br		.Lcj7			C			B
+
+.grt7:	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	br		.LL11			C			B
+
+
+C *** MAIN LOOP START ***
+	ALIGN(32)
+.Loop:	st8		[rp] = x3, 8		C			M23
+	ld8		v3 = [vp], 8		C			M01
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+	ld8		u3 = [up], 8		C			M01
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+.LL11:	st8		[rp] = x0, 8		C			M23
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+	ld8		v0 = [vp], 8		C			M01
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+.LL10:	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	nop.b		0
+	ld8		u0 = [up], 8		C			M01
+	ADDSUB		w1 = u1, v1		C			M I
+	nop.b		0
+	;;
+	st8		[rp] = x1, 8		C			M23
+	ld8		v1 = [vp], 8		C			M01
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+	ld8		u1 = [up], 8		C			M01
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+.LL01:	st8		[rp] = x2, 8		C			M23
+	shrp		x3 = w0, w3, 1		C			I0
+	ADDSUB		w2 = u2, v2		C			M I
+	ld8		v2 = [vp], 8		C			M01
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+.LL00:	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	nop.b		0
+	ld8		u2 = [up], 8		C			M01
+	ADDSUB		w3 = u3, v3		C			M I
+	br.cloop.dptk	.Loop			C			B
+	;;
+C *** MAIN LOOP END ***
+
+.Lskip:	st8		[rp] = x3, 8		C			M23
+	cmp.PRED	p7, p0 = w3, u3		C			M I
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+.Lcj7:	st8		[rp] = x0, 8		C			M23
+	shrp		x1 = w2, w1, 1		C			I0
+	ADDSUB		w0 = u0, v0		C			M I
+   (p6)	cmp.eq.or	p7, p0 = LIM, w3	C			M I
+   (p6)	add		w3 = INCR, w3		C			M I
+	;;
+.Lcj6:	cmp.PRED	p8, p0 = w0, u0		C			M I
+	shrp		x2 = w3, w2, 1		C			I0
+	ADDSUB		w1 = u1, v1		C			M I
+	;;
+	st8		[rp] = x1, 8		C			M23
+	cmp.PRED	p9, p0 = w1, u1		C			M I
+   (p7)	cmp.eq.or	p8, p0 = LIM, w0	C			M I
+   (p7)	add		w0 = INCR, w0		C			M I
+	;;
+.Lcj5:	st8		[rp] = x2, 8		C			M23
+	shrp		x3 = w0, w3, 1		C			I0
+	ADDSUB		w2 = u2, v2		C			M I
+   (p8)	cmp.eq.or	p9, p0 = LIM, w1	C			M I
+   (p8)	add		w1 = INCR, w1		C			M I
+	;;
+.Lcj4:	cmp.PRED	p6, p0 = w2, u2		C			M I
+	shrp		x0 = w1, w0, 1		C			I0
+	;;
+	st8		[rp] = x3, 8		C			M23
+   (p9)	cmp.eq.or	p6, p0 = LIM, w2	C			M I
+   (p9)	add		w2 = INCR, w2		C			M I
+	;;
+.Lcj3:	st8		[rp] = x0, 8		C			M23
+	shrp		x1 = w2, w1, 1		C			I0
+	shr.u		x2 = w2, 1		C			I0
+	;;
+.Lcj2:	st8		[rp] = x1, 8		C			M23
+   (p6)	dep		x2 = -1, x2, 63, 1	C			I0
+	;;
+.Lcj1:	st8		[rp] = x2		C			M23
+	mov.i		ar.lc = r2		C			I0
+	br.ret.sptk.many b0			C			B
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/sec_tabselect.asm b/third_party/gmp/mpn/ia64/sec_tabselect.asm
new file mode 100644
index 0000000..9b11cde
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/sec_tabselect.asm
@@ -0,0 +1,148 @@
+dnl  IA-64 mpn_sec_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:       ?
+C Itanium 2:     2.5
+
+C NOTES
+C  * Using software pipelining could trivially yield 2 c/l without unrolling,
+C    or 1+epsilon with unrolling.  (This code was modelled after the powerpc64
+C    code, for simplicity.)
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `r32')
+define(`tp',     `r33')
+define(`n',      `r34')
+define(`nents',  `r35')
+define(`which',  `r36')
+
+define(`mask',   `r8')
+
+define(`rp1',     `r32')
+define(`tp1',     `r33')
+define(`rp2',     `r14')
+define(`tp2',     `r15')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	.prologue
+	.save	ar.lc, r2
+	.body
+ifdef(`HAVE_ABI_32',`
+ {.mmi;	addp4	rp = 0, rp		C			M I
+	addp4	tp = 0, tp		C			M I
+	zxt4	n = n			C			I
+}{.mii;	nop	0
+	zxt4	nents = nents		C			I
+	zxt4	which = which		C			I
+	;;
+}')
+ {.mmi;	add	rp2 = 8, rp1
+	add	tp2 = 8, tp1
+	add	r6 = -2, n
+	;;
+}{.mmi;	cmp.eq	p10, p0 = 1, n
+	and	r9 = 1, n		C low bit of n, for even/odd test below
+	shr.u	r6 = r6, 1		C inner loop count
+	;;
+}{.mmi;	cmp.eq	p8, p0 = 0, r9
+	sub	which = nents, which
+	shl	n = n, 3
+	;;
+}
+L(outer):
+ {.mmi;	cmp.eq	p6, p7 = which, nents	C are we at the selected table entry?
+	nop	0
+	mov	ar.lc = r6		C			I0
+	;;
+}{.mmb;
+  (p6)	mov	mask = -1
+  (p7)	mov	mask = 0
+  (p8)	br.dptk	L(top)			C branch to loop entry if n even
+	;;
+}{.mmi;	ld8	r16 = [tp1], 8
+	add	tp2 = 8, tp2
+	nop	0
+	;;
+}{.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask
+	nop	0
+	;;
+}{.mmi;	andcm	r18 = r18, mask
+	;;
+	or	r16 = r16, r18
+	nop	0
+	;;
+}{.mmb;	st8	[rp1] = r16, 8
+	add	rp2 = 8, rp2
+  (p10)	br.dpnt	L(end)
+}
+	ALIGN(32)
+L(top):
+ {.mmi;	ld8	r16 = [tp1], 16
+	ld8	r17 = [tp2], 16
+	nop	0
+	;;
+}{.mmi;	ld8	r18 = [rp1]
+	and	r16 = r16, mask
+	nop	0
+}{.mmi;	ld8	r19 = [rp2]
+	and	r17 = r17, mask
+	nop	0
+	;;
+}{.mmi;	andcm	r18 = r18, mask
+	andcm	r19 = r19, mask
+	nop	0
+	;;
+}{.mmi;	or	r16 = r16, r18
+	or	r17 = r17, r19
+	nop	0
+	;;
+}{.mmb;	st8	[rp1] = r16, 16
+	st8	[rp2] = r17, 16
+	br.cloop.dptk	L(top)
+	;;
+}
+L(end):
+ {.mmi;	sub	rp1 = rp1, n		C move rp back to beginning
+	sub	rp2 = rp2, n		C move rp back to beginning
+	cmp.ne	p9, p0 = 1, nents
+}{.mmb;	add	nents = -1, nents
+	nop	0
+  (p9)	br.dptk	L(outer)
+	;;
+}{.mib;	nop	0
+	nop	0
+	br.ret.sptk.many b0
+}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/ia64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..727f489
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/sqr_diag_addlsh1.asm
@@ -0,0 +1,156 @@
+dnl  IA-64 mpn_sqr_diag_addlsh1
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      ?
+C Itanium 2:    2	Unrolling could bring it to 1.5 + epsilon
+
+C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
+C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
+C few cycles better since we can mitigate the many I0 instructions.
+C
+C 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
+C -  20  22  24  26  28  30  32  34  36  38  40  42  44  46  48  50  52  54  56 Needs updating
+C -  13  16  17  18  20  21  23  25  26  30  31  31  33  34  36  38  39  42  43
+
+C We should keep in mind that this code takes linear time in a O(n^2) context
+C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
+C around 60.  Keeping overhead down for smallish operands (< 10) is more
+C important than optimal cycle counts.
+
+C TODO
+C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
+C    p-registers.
+C  * Optimise by doing first two loop iterations in function header.
+
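
[Annotation: what the pipelined loop computes, sketched with public mpn
calls.  An illustration under the usual sqr_basecase convention, which we
assume here: n >= 2 and tp holds the 2n-2 limb sum of the cross products
u[i]*u[j], i < j, taken at limb offset 1.]

    #include <gmp.h>
    #include <stdlib.h>

    /* Sketch: {rp,2n} = 2*B*{tp,2n-2} + sum of up[i]^2 at even limb
       positions, i.e. the full square recovered from the half product. */
    static void
    ref_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
    {
      mp_ptr t2 = malloc ((2 * n - 2) * sizeof (mp_limb_t));
      mp_limb_t cy;
      for (mp_size_t i = 0; i < n; i++)
        mpn_sqr (rp + 2 * i, up + i, 1);            /* diagonal squares */
      cy = mpn_lshift (t2, tp, 2 * n - 2, 1);       /* 2*tp */
      cy += mpn_add_n (rp + 1, rp + 1, t2, 2 * n - 2);
      rp[2 * n - 1] += cy;       /* exact square: cannot overflow */
      free (t2);
    }
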
+C INPUT PARAMETERS
+define(`rp_param', `r32')  define(`rp', `r14')		C size: 2n
+define(`tp_param', `r33')  define(`tp', `r15')		C size: 2n - 2
+define(`up_param', `r34')  define(`up', `r31')		C size: n
+define(`n',  `r35')
+
+ifdef(`HAVE_ABI_32',`
+	define(`ABI64', `')
+	define(`ABI32', `$1')
+',`
+	define(`ABI64', `$1')
+	define(`ABI32', `')
+')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+
+	.prologue
+	.save	ar.pfs, r2
+	.save	ar.lc, r3
+	.body
+
+ {.mii;		alloc	r2 = ar.pfs, 4,24,0,24	C			M
+		mov	r3 = ar.lc		C			I0
+	ABI64(`	nop	4711		')
+	ABI32(`	zxt4	n = n		')
+}{.mmi;	ABI64(`	mov	tp = tp_param	')	C			M I
+	ABI32(`	addp4	tp = 0, tp_param')	C			M I
+	ABI64(`	mov	up = up_param	')	C			M I
+	ABI32(`	addp4	up = 0, up_param')	C			M I
+	ABI64(`	mov	rp = rp_param	')	C			M I
+	ABI32(`	addp4	rp = 0, rp_param')	C			M I
+	;;
+}{.mmi;		ld8	r36 = [tp], 8		C			M
+		add	r20 = -2, n		C			M I
+		mov	r9 = ar.ec		C			I0
+	;;
+}{.mmi;		ld8	r32 = [tp], 8		C			M
+		mov	r16 = 0			C			M I
+		mov	ar.ec = 7		C			I0
+	;;
+}{.mmi;		nop	4711
+		mov	r44 = 0			C			M I
+		mov	ar.lc = r20		C			I0
+	;;
+}{.mii;		mov	r33 = 0
+		mov	r10 = pr		C			I0
+		mov	pr.rot = 0x30000	C			I0
+	;;
+}		br.cexit.spnt.few.clr	L(end)
+
+dnl *** MAIN LOOP START ***
+	ALIGN(32)
+L(top):
+ {.mfi;	(p18)	ldf8	f33 = [up], 8		C			M
+	(p20)	xma.l	f36 = f35, f35, f42	C			F
+	(p41)	cmpequc	p50, p0 = -1, r44	C			M I
+}{.mfi;		setfsig	f40 = r16		C			M23
+	(p20)	xma.hu	f38 = f35, f35, f42	C			F
+	(p23)	add	r50 = r41, r49		C			M I
+	;;
+}{.mmi;	(p16)	ld8	r36 = [tp], 8		C			M
+	(p23)	cmpltu	p40, p0 = r50, r41	C cyout hi		M I
+	(p19)	shrp	r45 = r38, r35, 63	C non-critical		I0
+}{.mmi;	(p21)	getfsig	r39 = f39		C hi			M2
+	(p24)	st8	[rp] = r51, 8		C hi			M23
+	(p41)	add	r44 = 1, r44		C			M I
+	;;
+}{.mmi;	(p16)	ld8	r32 = [tp], 8		C			M
+	(p50)	cmpeqor	p40, p0 = -1, r50	C cyout hi		M I
+	(p17)	shrp	r16 = r33, r37, 63	C critical		I0
+}{.mmi;	(p21)	getfsig	r42 = f37		C lo			M2
+	(p23)	st8	[rp] = r44, 8		C lo			M23
+	(p50)	add	r50 = 1, r50		C			M I
+	;;
+}		br.ctop.sptk.few.clr L(top)	C			B
+dnl *** MAIN LOOP END ***
+	;;
+L(end):
+ {.mmi;		nop	4711
+	(p41)	add	r44 = 1, r44		C			M I
+		shr.u	r48 = r39, 63		C			I0
+	;;
+}{.mmi;		st8	[rp] = r51, 8		C			M23
+	(p41)	cmpequc	p6, p0 = 0, r44		C			M I
+		add	r50 = r41, r48		C			M I
+	;;
+}{.mmi;		st8	[rp] = r44, 8		C			M23
+	(p6)	add	r50 = 1, r50		C			M I
+		mov	ar.lc = r3		C			I0
+	;;
+}{.mii;		st8	[rp] = r50		C			M23
+		mov	ar.ec = r9		C			I0
+		mov	pr = r10		C			I0
+	;;
+}{.mib;		nop	4711
+		mov	ar.pfs = r2		C			I0
+		br.ret.sptk.many b0		C			B
+}
+EPILOGUE()
diff --git a/third_party/gmp/mpn/ia64/submul_1.asm b/third_party/gmp/mpn/ia64/submul_1.asm
new file mode 100644
index 0000000..cb2a552
--- /dev/null
+++ b/third_party/gmp/mpn/ia64/submul_1.asm
@@ -0,0 +1,647 @@
+dnl  IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl  result from a second limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2000-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4.0
+C Itanium 2:  2.25 (alignment dependent, sometimes it seems to need 3 c/l)
+
+C TODO
+C  * Optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
+C    2nd bundle.  This will allow the bbb bundle to be one cycle earlier and
+C    save a cycle.
+
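
[Annotation: for reference, the contract being implemented (mpn_submul_1 is
public API): subtract {up,n}*vl from {rp,n} and return the most significant
product limb plus the borrow.  A naive two-pass sketch using scratch space;
the assembly below instead negates vl up front (see the entry code) and
fuses everything into one pass.]

    #include <gmp.h>
    #include <stdlib.h>

    /* Naive reference for the mpn_submul_1 contract. */
    static mp_limb_t
    ref_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl)
    {
      mp_ptr tp = malloc (n * sizeof (mp_limb_t));
      mp_limb_t hi = mpn_mul_1 (tp, up, n, vl);  /* {tp,n} + hi*B^n = up*vl */
      hi += mpn_sub_n (rp, rp, tp, n);           /* fold borrow into return */
      free (tp);
      return hi;
    }
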
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`vl', `r35')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	.prologue
+	.save	ar.lc, r2
+	.body
+
+ifdef(`HAVE_ABI_32',
+`	addp4		rp = 0, rp		C M I
+	addp4		up = 0, up		C M I
+	zxt4		n = n			C I
+	;;
+')
+{.mmi
+	mov		r10 = rp		C M I
+	mov		r9 = up			C M I
+	sub		vl = r0, vl		C M I	negate vl
+}
+{.mmi
+	ldf8		f8 = [rp], 8		C M
+	ldf8		f7 = [up], 8		C M
+	add		r19 = -1, n		C M I	n - 1
+	;;
+}
+{.mmi
+	cmp.eq		p6, p0 = 0, vl		C M I
+	mov		r8 = 0			C M I	zero cylimb
+	mov		r2 = ar.lc		C I0
+}
+{.mmi
+	setf.sig	f6 = vl			C M2 M3
+	and		r14 = 3, n		C M I
+	shr.u		r19 = r19, 2		C I0
+	;;
+}
+{.mmb
+	nop		0
+	cmp.eq		p10, p0 = 0, r14	C M I
+   (p6)	br.spnt		.Ldone			C B	vl == 0
+}
+{.mmi
+	cmp.eq		p11, p0 = 2, r14	C M I
+	cmp.eq		p12, p0 = 3, r14	C M I
+	mov		ar.lc = r19		C I0
+}
+{.bbb
+  (p10)	br.dptk		.Lb00			C B
+  (p11)	br.dptk		.Lb10			C B
+  (p12)	br.dptk		.Lb11			C B
+	;;
+}
+
+.Lb01:	br.cloop.dptk	.grt1
+
+	xma.l		f39 = f7, f6, f8
+	xma.hu		f43 = f7, f6, f8
+	;;
+	getf.sig	r27 = f39			C lo
+	getf.sig	r31 = f43			C hi
+	ld8		r20 = [r9], 8
+	br		.Lcj1
+
+.grt1:	ldf8		f44 = [rp], 8
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	;;
+	ldf8		f46 = [rp], 8
+	xma.l		f39 = f7, f6, f8
+	ldf8		f34 = [up], 8
+	xma.hu		f43 = f7, f6, f8
+	;;
+	ldf8		f47 = [rp], 8
+	xma.l		f36 = f32, f6, f44
+	ldf8		f35 = [up], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt5
+	;;
+
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43			C hi
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40			C hi
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41			C hi
+	getf.sig	r26 = f38			C lo
+	ld8		r23 = [r9], 8
+	br		.Lcj5
+
+.grt5:	ldf8		f44 = [rp], 8
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.Loop
+	br		.Lend
+
+
+.Lb10:	ldf8		f47 = [rp], 8
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt2
+
+	xma.l		f38 = f7, f6, f8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r26 = f38			C lo
+	getf.sig	r30 = f42			C hi
+	ld8		r23 = [r9], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	getf.sig	r31 = f43			C hi
+	ld8		r20 = [r9], 8
+	br		.Lcj2
+
+.grt2:	ldf8		f44 = [rp], 8
+	ldf8		f32 = [up], 8
+	;;
+	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f7, f6, f8
+	xma.hu		f42 = f7, f6, f8
+	;;
+	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt6
+
+	getf.sig	r30 = f42			C hi
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43			C hi
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40			C hi
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	br		.Lcj6
+
+.grt6:	ldf8		f44 = [rp], 8
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	br		.LL10
+
+
+.Lb11:	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	;;
+	ldf8		f47 = [rp], 8
+	ldf8		f35 = [up], 8
+	br.cloop.dptk	.grt3
+
+	xma.l		f37 = f7, f6, f8
+	xma.hu		f41 = f7, f6, f8
+	;;
+	xma.l		f38 = f34, f6, f46
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41			C hi
+	ld8		r22 = [r9], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	getf.sig	r30 = f42			C hi
+	ld8		r23 = [r9], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	getf.sig	r31 = f43			C hi
+	ld8		r20 = [r9], 8
+	br		.Lcj3
+
+.grt3:	ldf8		f44 = [rp], 8
+	xma.l		f37 = f7, f6, f8
+	ldf8		f32 = [up], 8
+	xma.hu		f41 = f7, f6, f8
+	;;
+	ldf8		f45 = [rp], 8
+	xma.l		f38 = f34, f6, f46
+	ldf8		f33 = [up], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	;;
+	getf.sig	r25 = f37			C lo
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt7
+	;;
+
+	getf.sig	r30 = f42			C hi
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r31 = f43			C hi
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	br		.Lcj7
+
+.grt7:	ldf8		f44 = [rp], 8
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	br		.LL11
+
+
+.Lb00:	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	;;
+	ldf8		f46 = [rp], 8
+	ldf8		f34 = [up], 8
+	;;
+	ldf8		f47 = [rp], 8
+	xma.l		f36 = f7, f6, f8
+	ldf8		f35 = [up], 8
+	xma.hu		f40 = f7, f6, f8
+	br.cloop.dptk	.grt4
+
+	xma.l		f37 = f33, f6, f45
+	xma.hu		f41 = f33, f6, f45
+	;;
+	getf.sig	r24 = f36			C lo
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	getf.sig	r28 = f40			C hi
+	xma.l		f39 = f35, f6, f47
+	getf.sig	r25 = f37			C lo
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	getf.sig	r29 = f41			C hi
+	getf.sig	r26 = f38			C lo
+	ld8		r23 = [r9], 8
+	;;
+	getf.sig	r30 = f42			C hi
+	getf.sig	r27 = f39			C lo
+	ld8		r20 = [r9], 8
+	br		.Lcj4
+
+.grt4:	ldf8		f44 = [rp], 8
+	xma.l		f37 = f33, f6, f45
+	ldf8		f32 = [up], 8
+	xma.hu		f41 = f33, f6, f45
+	;;
+	ldf8		f45 = [rp], 8
+	ldf8		f33 = [up], 8
+	xma.l		f38 = f34, f6, f46
+	getf.sig	r24 = f36			C lo
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+	;;
+	ldf8		f46 = [rp], 8
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	xma.l		f39 = f35, f6, f47
+	getf.sig	r25 = f37			C lo
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+	;;
+	ldf8		f47 = [rp], 8
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	;;
+	getf.sig	r26 = f38			C lo
+	xma.l		f36 = f32, f6, f44
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+	br.cloop.dptk	.grt8
+	;;
+
+	getf.sig	r30 = f42			C hi
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	br		.Lcj8
+
+.grt8:	ldf8		f44 = [rp], 8
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	;;
+	getf.sig	r27 = f39			C lo
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+	br		.LL00
+
+	ALIGN(32)
+.Loop:
+{.mmi
+	ldf8		f44 = [rp], 8
+	cmp.ltu		p6, p0 = r27, r8	C lo cmp
+	sub		r14 = r27, r8		C lo sub
+}
+{.mmi
+	getf.sig	r30 = f42			C hi
+	ldf8		f32 = [up], 8
+	sub		r8 = r20, r31		C hi sub
+	;;				C 01
+}
+{.mmf
+	getf.sig	r27 = f39			C lo
+	st8		[r10] = r14, 8
+	xma.l		f37 = f33, f6, f45
+}
+{.mfi
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+   (p6)	add		r8 = 1, r8
+	;;				C 02
+}
+{.mmi
+.LL00:	ldf8		f45 = [rp], 8
+	cmp.ltu		p6, p0 = r24, r8
+	sub		r14 = r24, r8
+}
+{.mmi
+	getf.sig	r31 = f43			C hi
+	ldf8		f33 = [up], 8
+	sub		r8 = r21, r28
+	;;				C 03
+}
+{.mmf
+	getf.sig	r24 = f36			C lo
+	st8		[r10] = r14, 8
+	xma.l		f38 = f34, f6, f46
+}
+{.mfi
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+   (p6)	add		r8 = 1, r8
+	;;				C 04
+}
+{.mmi
+.LL11:	ldf8		f46 = [rp], 8
+	cmp.ltu		p6, p0 = r25, r8
+	sub		r14 = r25, r8
+}
+{.mmi
+	getf.sig	r28 = f40			C hi
+	ldf8		f34 = [up], 8
+	sub		r8 = r22, r29
+	;;				C 05
+}
+{.mmf
+	getf.sig	r25 = f37			C lo
+	st8		[r10] = r14, 8
+	xma.l		f39 = f35, f6, f47
+}
+{.mfi
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+   (p6)	add		r8 = 1, r8
+	;;				C 06
+}
+{.mmi
+.LL10:	ldf8		f47 = [rp], 8
+	cmp.ltu		p6, p0 = r26, r8
+	sub		r14 = r26, r8
+}
+{.mmi
+	getf.sig	r29 = f41			C hi
+	ldf8		f35 = [up], 8
+	sub		r8 = r23, r30
+	;;				C 07
+}
+{.mmf
+	getf.sig	r26 = f38			C lo
+	st8		[r10] = r14, 8
+	xma.l		f36 = f32, f6, f44
+}
+{.mfi
+	ld8		r23 = [r9], 8
+	xma.hu		f40 = f32, f6, f44
+   (p6)	add		r8 = 1, r8
+}
+	br.cloop.dptk	.Loop
+	;;
+
+.Lend:
+	cmp.ltu		p6, p0 = r27, r8
+	sub		r14 = r27, r8
+	getf.sig	r30 = f42
+	sub		r8 = r20, r31
+	;;
+	getf.sig	r27 = f39
+	st8		[r10] = r14, 8
+	xma.l		f37 = f33, f6, f45
+	ld8		r20 = [r9], 8
+	xma.hu		f41 = f33, f6, f45
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj8:
+	cmp.ltu		p6, p0 = r24, r8
+	sub		r14 = r24, r8
+	getf.sig	r31 = f43
+	sub		r8 = r21, r28
+	;;
+	getf.sig	r24 = f36
+	st8		[r10] = r14, 8
+	xma.l		f38 = f34, f6, f46
+	ld8		r21 = [r9], 8
+	xma.hu		f42 = f34, f6, f46
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj7:
+	cmp.ltu		p6, p0 = r25, r8
+	sub		r14 = r25, r8
+	getf.sig	r28 = f40
+	sub		r8 = r22, r29
+	;;
+	getf.sig	r25 = f37
+	st8		[r10] = r14, 8
+	xma.l		f39 = f35, f6, f47
+	ld8		r22 = [r9], 8
+	xma.hu		f43 = f35, f6, f47
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj6:
+	cmp.ltu		p6, p0 = r26, r8
+	sub		r14 = r26, r8
+	getf.sig	r29 = f41
+	sub		r8 = r23, r30
+	;;
+	getf.sig	r26 = f38
+	st8		[r10] = r14, 8
+	ld8		r23 = [r9], 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj5:
+	cmp.ltu		p6, p0 = r27, r8
+	sub		r14 = r27, r8
+	getf.sig	r30 = f42
+	sub		r8 = r20, r31
+	;;
+	getf.sig	r27 = f39
+	st8		[r10] = r14, 8
+	ld8		r20 = [r9], 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj4:
+	cmp.ltu		p6, p0 = r24, r8
+	sub		r14 = r24, r8
+	getf.sig	r31 = f43
+	sub		r8 = r21, r28
+	;;
+	st8		[r10] = r14, 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj3:
+	cmp.ltu		p6, p0 = r25, r8
+	sub		r14 = r25, r8
+	sub		r8 = r22, r29
+	;;
+	st8		[r10] = r14, 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj2:
+	cmp.ltu		p6, p0 = r26, r8
+	sub		r14 = r26, r8
+	sub		r8 = r23, r30
+	;;
+	st8		[r10] = r14, 8
+   (p6)	add		r8 = 1, r8
+	;;
+.Lcj1:
+	cmp.ltu		p6, p0 = r27, r8
+	sub		r14 = r27, r8
+	sub		r8 = r20, r31
+	;;
+	st8		[r10] = r14, 8
+	mov		ar.lc = r2
+   (p6)	add		r8 = 1, r8
+	br.ret.sptk.many b0
+.Ldone:	mov		ar.lc = r2
+	br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/lisp/gmpasm-mode.el b/third_party/gmp/mpn/lisp/gmpasm-mode.el
new file mode 100644
index 0000000..6f2fea0
--- /dev/null
+++ b/third_party/gmp/mpn/lisp/gmpasm-mode.el
@@ -0,0 +1,391 @@
+;;; gmpasm-mode.el -- GNU MP asm and m4 editing mode.
+
+
+;; Copyright 1999-2002 Free Software Foundation, Inc.
+
+;;   This file is part of the GNU MP Library.
+;;
+;;   The GNU MP Library is free software; you can redistribute it and/or modify
+;;   it under the terms of either:
+;;
+;;     * the GNU Lesser General Public License as published by the Free
+;;       Software Foundation; either version 3 of the License, or (at your
+;;       option) any later version.
+;;
+;;   or
+;;
+;;     * the GNU General Public License as published by the Free Software
+;;       Foundation; either version 2 of the License, or (at your option) any
+;;       later version.
+;;
+;;   or both in parallel, as here.
+;;
+;;   The GNU MP Library is distributed in the hope that it will be useful, but
+;;   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;;   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;;   for more details.
+;;
+;;   You should have received copies of the GNU General Public License and the
+;;   GNU Lesser General Public License along with the GNU MP Library.  If not,
+;;   see https://www.gnu.org/licenses/.
+
+
+;;; Commentary:
+;;
+;; gmpasm-mode is a major mode for editing m4 processed assembler code and
+;; m4 macro files in GMP.  It's similar to m4-mode, but has a number of
+;; settings better suited to GMP.
+;;
+;;
+;; Install
+;; -------
+;;
+;; To make M-x gmpasm-mode available, put gmpasm-mode.el somewhere in your
+;; load-path and the following in your .emacs
+;;
+;;	(autoload 'gmpasm-mode "gmpasm-mode" nil t)
+;;
+;; To use gmpasm-mode automatically on all .asm and .m4 files, put the
+;; following in your .emacs
+;;
+;;	(add-to-list 'auto-mode-alist '("\\.asm\\'" . gmpasm-mode))
+;;	(add-to-list 'auto-mode-alist '("\\.m4\\'" . gmpasm-mode))
+;;
+;; To have gmpasm-mode only on gmp files, try instead something like the
+;; following, which uses it only in a directory starting with "gmp", or a
+;; sub-directory of such.
+;;
+;;	(add-to-list 'auto-mode-alist
+;;	             '("/gmp.*/.*\\.\\(asm\\|m4\\)\\'" . gmpasm-mode))
+;;
+;; Byte compiling will slightly speed up loading.  If you want a docstring
+;; in the autoload you can use M-x update-file-autoloads if you set it up
+;; right.
+;;
+;;
+;; Emacsen
+;; -------
+;;
+;; GNU Emacs 20.x, 21.x and XEmacs 20.x all work well.  GNU Emacs 19.x
+;; should work if replacements for the various 20.x-isms are available,
+;; though comment-region with "C" doesn't do the right thing.
+
+
+;;; Code:
+
+(defgroup gmpasm nil
+  "GNU MP m4 and asm editing."
+  :prefix "gmpasm-"
+  :group 'languages)
+
+(defcustom gmpasm-mode-hook nil
+  "*Hook called by `gmpasm-mode'."
+  :type 'hook
+  :group 'gmpasm)
+
+(defcustom gmpasm-comment-start-regexp "\\([#;!@*|C]\\|//\\)"
+  "*Regexp matching possible comment styles.
+See `gmpasm-mode' docstring for how this is used.
+
+Commenting styles within GMP include
+  #   - alpha, i386, i960, vax, traditional unix
+  ;   - a29k, clipper, hppa, m88k, ppc
+  !   - sh, sparc, z8000
+  |   - m68k
+  @   - arm
+  *   - cray
+  C   - GMP m4, see mpn/asm-defs.m4
+  //  - ia64"
+  :type 'regexp
+  :group 'gmpasm)
+
+
+(defun gmpasm-add-to-list-second (list-var element)
+  "(gmpasm-add-to-list-second LIST-VAR ELEMENT)
+
+Add ELEMENT to LIST-VAR as the second element in the list, if it isn't
+already in the list.  If LIST-VAR is nil, then ELEMENT is just added as the
+sole element in the list.
+
+This is like `add-to-list', but it puts the new value second in the list.
+
+The first cons cell is copied rather than changed in-place, so references to
+the list elsewhere won't be affected."
+
+  (if (member element (symbol-value list-var))
+      (symbol-value list-var)
+    (set list-var
+	 (if (symbol-value list-var)
+	     (cons (car (symbol-value list-var))
+		   (cons element
+			 (cdr (symbol-value list-var))))
+	   (list element)))))
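+
+;; For example, with an illustrative list (not one the mode itself uses),
+;; `gmpasm-add-to-list-second' behaves like so:
+;;
+;;	(setq demo-list '(a b c))
+;;	(gmpasm-add-to-list-second 'demo-list 'x)   ; demo-list => (a x b c)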
+
+
+(defun gmpasm-remove-from-list (list-var element)
+  "(gmpasm-remove-from-list LIST-VAR ELEMENT)
+
+Remove ELEMENT from LIST-VAR, using `copy-sequence' and `delete'.
+This is vaguely like `add-to-list', but the element is removed from the list.
+The list is copied rather than changed in-place, so references to it elsewhere
+aren't affected."
+
+;; Only the portion of the list up to the removed element needs to be
+;; copied, but there's no need to bother arranging that, since this function
+;; is only used for a couple of initializations.
+
+  (set list-var (delete element (copy-sequence (symbol-value list-var)))))
+
+
+(defvar gmpasm-mode-map
+  (let ((map (make-sparse-keymap)))
+
+    ;; assembler and dnl commenting
+    (define-key map "\C-c\C-c" 'comment-region)
+    (define-key map "\C-c\C-d" 'gmpasm-comment-region-dnl)
+
+    ;; kill an M-x compile, since it's not hard to put m4 into an infinite
+    ;; loop
+    (define-key map "\C-c\C-k" 'kill-compilation)
+
+    map)
+  "Keymap for `gmpasm-mode'.")
+
+
+(defvar gmpasm-mode-syntax-table
+  (let ((table (make-syntax-table)))
+    ;; underscore left as a symbol char, like C mode
+
+    ;; m4 quotes
+    (modify-syntax-entry ?`  "('"  table)
+    (modify-syntax-entry ?'  ")`"  table)
+
+    table)
+  "Syntax table used in `gmpasm-mode'.
+
+'#' and '\n' aren't set as comment syntax.  In m4 these are a comment
+outside quotes, but not inside.  Omitting a syntax entry ensures that when
+inside quotes emacs treats parentheses and apostrophes the same way that m4
+does.  When outside quotes this is not quite right, but having it right when
+nesting expressions is more important.
+
+'*', '!' or '|' aren't set up as comment syntax either, on CPUs which use
+these for comments.  The GMP macro setups don't set them in m4 changecom(),
+since that prevents them being used in eval() expressions, and on that basis
+they don't change the way quotes and parentheses are treated by m4 and
+should be treated by emacs.")
+
+
+(defvar gmpasm-font-lock-keywords
+  (eval-when-compile
+    (list
+     (cons
+      (concat
+       "\\b"
+       (regexp-opt
+	'("deflit" "defreg" "defframe" "defframe_pushl"
+	  "define_not_for_expansion"
+	  "m4_error" "m4_warning"
+	  "ASM_START" "ASM_END"
+	  "PROLOGUE" "PROLOGUE_GP" "MULFUNC_PROLOGUE" "EPILOGUE"
+	  "DATASTART" "DATAEND"
+	  "forloop"
+	  "TEXT" "DATA" "ALIGN" "W32" "FLOAT64"
+	  "builtin" "changecom" "changequote" "changeword" "debugfile"
+	  "debugmode" "decr" "define" "defn" "divert" "divnum" "dumpdef"
+	  "errprint" "esyscmd" "eval" "__file__" "format" "gnu" "ifdef"
+	  "ifelse" "include" "incr" "index" "indir" "len" "__line__"
+	  "m4exit" "m4wrap" "maketemp" "patsubst" "popdef" "pushdef"
+	  "regexp" "shift" "sinclude" "substr" "syscmd" "sysval"
+	  "traceoff" "traceon" "translit" "undefine" "undivert" "unix")
+	t)
+       "\\b") 'font-lock-keyword-face)))
+
+  "`font-lock-keywords' for `gmpasm-mode'.
+
+The keywords are m4 builtins and some of the GMP macros used in asm files.
+L doesn't look good fontified, so it's omitted.
+
+The right assembler comment regexp is added dynamically buffer-local (with
+dnl too).")
+
+
+;; Initialized if gmpasm-mode finds filladapt loaded.
+(defvar gmpasm-filladapt-token-table nil
+  "Filladapt token table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-match-table nil
+  "Filladapt token match table used in `gmpasm-mode'.")
+(defvar gmpasm-filladapt-token-conversion-table nil
+  "Filladapt token conversion table used in `gmpasm-mode'.")
+
+
+;;;###autoload
+(defun gmpasm-mode ()
+  "A major mode for editing GNU MP asm and m4 files.
+
+\\{gmpasm-mode-map}
+`comment-start' and `comment-end' are set buffer-local to assembler
+commenting appropriate for the CPU by looking for something matching
+`gmpasm-comment-start-regexp' at the start of a line, or \"#\" is used if
+there's no match (if \"#\" isn't what you want, type in a desired comment
+and do \\[gmpasm-mode] to reinitialize).
+
+`adaptive-fill-regexp' is set buffer-local to the standard regexp with
+`comment-start' and dnl added.  If filladapt.el has been loaded it similarly
+gets `comment-start' and dnl added as buffer-local fill prefixes.
+
+Font locking has the m4 builtins, some of the GMP macros, m4 dnl commenting,
+and assembler commenting (based on the `comment-start' determined).
+
+Note that `gmpasm-comment-start-regexp' is only matched as a whole word,
+so for instance its `C' won't match something that merely happens to
+start with `C'.  Also it's only the particular `comment-start' determined
+that's added for filling etc, not the whole `gmpasm-comment-start-regexp'.
+
+`gmpasm-mode-hook' is run after initializations are complete."
+
+  (interactive)
+  (kill-all-local-variables)
+  (setq major-mode 'gmpasm-mode
+        mode-name  "gmpasm")
+  (use-local-map gmpasm-mode-map)
+  (set-syntax-table gmpasm-mode-syntax-table)
+  (setq fill-column 76)
+
+  ;; Short instructions might fit with 32, but anything with labels or
+  ;; expressions soon needs the comments pushed out to column 40.
+  (setq comment-column 40)
+
+  ;; Don't want to find out the hard way which dumb assemblers don't like a
+  ;; missing final newline.
+  (set (make-local-variable 'require-final-newline) t)
+
+  ;; The first match of gmpasm-comment-start-regexp at the start of a line
+  ;; determines comment-start, or "#" if no match.
+  (set (make-local-variable 'comment-start)
+       (save-excursion
+	 (goto-char (point-min))
+	 (if (re-search-forward
+	      (concat "^\\(" gmpasm-comment-start-regexp "\\)\\(\\s-\\|$\\)")
+	      nil t)
+	     (match-string 1)
+	   "#")))
+  (set (make-local-variable 'comment-end) "")
+
+  ;; If comment-start ends in an alphanumeric then \b is used to match it
+  ;; only as a separate word.  The test is for an alphanumeric rather than
+  ;; \w since we might try # or ! as \w characters but without wanting \b on
+  ;; them.
+  (let ((comment-regexp
+	 (concat (regexp-quote comment-start)
+		 (if (string-match "[a-zA-Z0-9]\\'" comment-start) "\\b"))))
+
+    ;; Whitespace is required before a comment-start so m4 $# doesn't match
+    ;; when comment-start is "#".
+    (set (make-local-variable 'comment-start-skip)
+	 (concat "\\(^\\|\\s-\\)\\(\\<dnl\\>\\|" comment-regexp "\\)[ \t]*"))
+
+    ;; Comment fontification based on comment-start, and always with dnl.
+    ;; Same treatment of a space before "#" as in comment-start-skip, but
+    ;; don't fontify that space.
+    (add-to-list (make-local-variable 'gmpasm-font-lock-keywords)
+		 (list (concat "\\(^\\|\\s-\\)\\(\\(\\<dnl\\>\\|"
+			       comment-regexp
+			       "\\).*$\\)")
+		       2 'font-lock-comment-face))
+
+    (set (make-local-variable 'font-lock-defaults)
+	 '(gmpasm-font-lock-keywords
+	   t	         ; no syntactic fontification (of strings etc)
+	   nil           ; no case-fold
+	   ((?_ . "w"))  ; _ part of a word while fontifying
+	   ))
+
+    ;; Paragraphs are separated by blank lines, or lines with only dnl or
+    ;; comment-start.
+    (set (make-local-variable 'paragraph-separate)
+	 (concat "[ \t\f]*\\(\\(" comment-regexp "\\|dnl\\)[ \t]*\\)*$"))
+    (set (make-local-variable 'paragraph-start)
+	 (concat "\f\\|" paragraph-separate))
+
+    ;; Some sort of "def...(" m4 define, possibly with ` for quoting.
+    ;; Could do something with PROLOGUE here, but in GMP the filename is
+    ;; enough; it's not normally necessary to say the function name.
+    (set (make-local-variable 'add-log-current-defun-header-regexp)
+	 "^def[a-z0-9_]+(`?\\([a-zA-Z0-9_]+\\)")
+
+    ;; Adaptive fill gets dnl and comment-start as comment style prefixes on
+    ;; top of the standard regexp (which has # and ; already actually).
+    (set (make-local-variable 'adaptive-fill-regexp)
+	 (concat "[ \t]*\\(\\("
+		 comment-regexp
+		 "\\|dnl\\|[-|#;>*]+\\|(?[0-9]+[.)]\\)[ \t]*\\)*"))
+    (set (make-local-variable 'adaptive-fill-first-line-regexp)
+	 "\\`\\([ \t]*dnl\\)?[ \t]*\\'")
+
+    (when (fboundp 'filladapt-mode)
+      (unless gmpasm-filladapt-token-table
+	(setq gmpasm-filladapt-token-table
+	      filladapt-token-table)
+	(setq gmpasm-filladapt-token-match-table
+	      filladapt-token-match-table)
+	(setq gmpasm-filladapt-token-conversion-table
+	      filladapt-token-conversion-table)
+
+	;; Numbered bullet points like "2.1" get matched at the start of a
+	;; line when it's really something like "2.1 cycles/limb", so remove
+	;; this from the list.  The regexp for "1.", "2." etc is left
+	;; though.
+	(gmpasm-remove-from-list 'gmpasm-filladapt-token-table
+				 '("[0-9]+\\(\\.[0-9]+\\)+[ \t]"
+				   bullet))
+
+	;; "%" as a comment prefix interferes with register names on some
+	;; CPUs, like %eax on x86, so remove this.
+	(gmpasm-remove-from-list 'gmpasm-filladapt-token-table
+				 '("%+" postscript-comment))
+
+	(add-to-list 'gmpasm-filladapt-token-match-table
+		     '(gmpasm-comment gmpasm-comment))
+	(add-to-list 'gmpasm-filladapt-token-conversion-table
+		     '(gmpasm-comment . exact)))
+
+      (set (make-local-variable 'filladapt-token-table)
+	   gmpasm-filladapt-token-table)
+      (set (make-local-variable 'filladapt-token-match-table)
+	   gmpasm-filladapt-token-match-table)
+      (set (make-local-variable 'filladapt-token-conversion-table)
+	   gmpasm-filladapt-token-conversion-table)
+
+      ;; Add dnl and comment-start as fill prefixes.
+      ;; Comments in filladapt.el say filladapt-token-table must begin
+      ;; with ("^" beginning-of-line), so put our addition second.
+      (gmpasm-add-to-list-second 'filladapt-token-table
+				 (list (concat "dnl[ \t]\\|" comment-regexp)
+				       'gmpasm-comment))))
+
+  (run-hooks 'gmpasm-mode-hook))
+
+
+(defun gmpasm-comment-region-dnl (beg end &optional arg)
+  "(gmpasm-comment-region-dnl BEG END &optional ARG)
+
+Comment or uncomment each line in the region using `dnl'.
+With \\[universal-argument] prefix arg, uncomment each line in region.
+This is `comment-region', but using \"dnl\"."
+
+  (interactive "r\nP")
+  (let ((comment-start "dnl")
+	(comment-end ""))
+    (comment-region beg end arg)))
+
+
+(provide 'gmpasm-mode)
+
+;;; gmpasm-mode.el ends here
diff --git a/third_party/gmp/mpn/m4-ccas b/third_party/gmp/mpn/m4-ccas
new file mode 100755
index 0000000..16d80c6
--- /dev/null
+++ b/third_party/gmp/mpn/m4-ccas
@@ -0,0 +1,115 @@
+#!/bin/sh
+#
+# A helper script for Makeasm.am .asm.lo rule.
+
+# Copyright 2001 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: m4-ccas --m4=M4 CC ... file.asm ...
+#
+# Process file.asm with the given M4 plus any -D arguments, then
+# assemble with the given CC plus all arguments.
+#
+# The M4 command must be in a single --m4= argument, and will be split
+# on whitespace.  When CC is invoked file.asm is replaced with a
+# temporary .s file which is the M4 output.
+#
+# To allow parallel builds, the temp file name is based on the .asm
+# file name, which will be the output object filename for all uses we
+# put this script to.
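+#
+# For example (purely illustrative, not a command taken from the GMP
+# build system):
+#
+#   m4-ccas --m4="m4 -DHAVE_CONFIG_H" gcc -c -DPIC add_n.asm -o add_n.o
+#
+# runs "m4 -DHAVE_CONFIG_H -DPIC add_n.asm >tmp-add_n.s" and then
+# "gcc -c -DPIC tmp-add_n.s -o add_n.o".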
+
+M4=
+CC=
+DEFS=
+ASM=
+SEEN_O=no
+
+for i in "$@"; do
+  case $i in
+    --m4=*)
+      M4=`echo "$i" | sed 's/^--m4=//'`
+      ;;
+    -D*)
+      DEFS="$DEFS $i"
+      CC="$CC $i"
+      ;;
+    *.asm)
+      if test -n "$ASM"; then
+        echo "Only one .asm file permitted"
+        exit 1
+      fi
+      BASENAME=`echo "$i" | sed -e 's/\.asm$//' -e 's/^.*[\\/:]//'`
+      TMP=tmp-$BASENAME.s
+      ASM=$i
+      CC="$CC $TMP"
+      ;;
+    -o)
+      SEEN_O=yes
+      CC="$CC $i"
+      ;;
+    *)
+      CC="$CC $i"
+      ;;
+  esac
+done
+
+if test -z "$M4"; then
+  echo "No --m4 specified"
+  exit 1
+fi
+
+if test -z "$ASM"; then
+  echo "No .asm specified"
+  exit 1
+fi
+
+# Libtool adds its own -o when sending output to .libs/foo.o, but not
+# when just wanting foo.o in the current directory.  We need an
+# explicit -o in both cases since we're assembling tmp-foo.s.
+#
+if test $SEEN_O = no; then
+  CC="$CC -o $BASENAME.o"
+fi
+
+echo "$M4 $DEFS $ASM >$TMP"
+$M4 $DEFS $ASM >$TMP || exit
+
+echo "$CC"
+$CC || exit
+
+# Comment this out to preserve .s intermediates
+rm -f $TMP
diff --git a/third_party/gmp/mpn/m68k/README b/third_party/gmp/mpn/m68k/README
new file mode 100644
index 0000000..5261564
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/README
@@ -0,0 +1,145 @@
+Copyright 2001, 2003, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                      M68K MPN SUBROUTINES
+
+
+This directory contains mpn functions for various m68k family chips.
+
+
+CODE ORGANIZATION
+
+	m68k             m68000, m68010, m68060
+	m68k/mc68020     m68020, m68030, m68040, and CPU32
+
+
+The m5200 "coldfire", which is m68000 less a few instructions, currently has
+no assembler code support.
+
+
+STATUS
+
+The code herein is old and poorly maintained.  If somebody really cared, it
+could be optimized substantially.  For example,
+
+* mpn_add_n and mpn_sub_n could, with more unrolling, be improved from 6 to
+  close to 4 c/l (on m68040).
+
+* The multiplication loops could be sped up by using the FPU.
+
+* mpn_lshift by 31 should use the special-case mpn_rshift by 1 code, and
+  vice versa mpn_rshift by 31 use the special lshift by 1, when operand
+  overlap permits.
+
+* On 68000, mpn_mul_1, mpn_addmul_1 and mpn_submul_1 could check for a
+  16-bit multiplier and use two multiplies per limb, not four.
+
+  Similarly various other _1 operations like mpn_mod_1, mpn_divrem_1,
+  mpn_divexact_1, mpn_modexact_1c_odd.
+
+* On 68000, mpn_lshift and mpn_rshift could use a roll and mask instead of
+  lsrl and lsll.  This promises to be a speedup, effectively trading a 6+2*n
+  shift for one or two 4 cycle masks.  Suggested by Jean-Charles Meyrignac.
+
+* config.guess detects 68000, 68010, CPU32 and 68020 by running some code,
+  but relies on system information for 030, 040 and 060.  Can they be
+  identified by running some code?  Currently this only makes a difference
+  to the compiler options selected, since we have no specific asm code for
+  those chips.
+
+One novel idea for 68000 would be to use a 16-bit limb instead of 32-bits.
+This would suit the native 16x16 multiply, but might make it difficult to
+get full value from the native 32x32 add/sub/etc.  This would be an ABI
+option, and would select "__GMP_SHORT_LIMB" in gmp.h.
+
+Naturally an entirely new set of asm subroutines would be needed for a
+16-bit limb.  Also there are various places in the C code assuming limb>=long,
+which would need to be updated, eg. mpz_set_ui.  Some of the nails changes
+may have helped cover some of this.
+
+
+ASM FILES
+
+The .asm files are put through m4 for macro processing, and with the help of
+configure give either MIT or Motorola syntax.  The generic mpn/asm-defs.m4
+is used, together with mpn/m68k/m68k-defs.m4.  See comments in those files.
+
+Not all possible syntax variations are covered.  GCC config/m68k for
+instance has things like $ for immediates on CRDS or reversed cmp order for
+AT&T SGS.  These could probably be handled if anyone really needs it.
+
+
+CALLING CONVENTIONS
+
+The SVR4 standard has an int of 32 bits, and all parameters 32-bit aligned
+on the stack.
+
+PalmOS, and perhaps various embedded systems intended for 68000, however use
+an int of 16 bits and parameters only 16-bit aligned on the stack.  This is
+generated by "gcc -mshort" (and is the default for the PalmOS gcc port, we
+believe).
+
+The asm files adapt to these two ABIs by checking sizeof(unsigned), coming
+through config.m4 as SIZEOF_UNSIGNED.  Only mpn_lshift and mpn_rshift are
+affected; all other routines take longs and pointers, which are 32 bits in
+both cases.
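+
+For example, mpn/m68k/lshift.asm loads its count argument with an m4
+conditional on that value:
+
+	ifelse(SIZEOF_UNSIGNED,2,
+	`	movew	M(sp,40), cnt',
+	`	movel	M(sp,40), cnt')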
+
+Strictly speaking the size of an int doesn't determine the stack padding
+convention.  But if int is 16 bits then we can definitely say the host
+system is not SVR4, and therefore may as well assume we're in 16-bit stack
+alignment.
+
+
+REFERENCES
+
+"Motorola M68000 Family Programmer's Reference Manual", available online,
+
+	http://e-www.motorola.com/brdata/PDFDB/docs/M68000PM.pdf
+
+"System V Application Binary Interface: Motorola 68000 Processor Family
+Supplement", AT&T, 1990, ISBN 0-13-877553-6.  Has details of calling
+conventions and ELF style PIC coding.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/m68k/aors_n.asm b/third_party/gmp/mpn/m68k/aors_n.asm
new file mode 100644
index 0000000..f7d379e
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/aors_n.asm
@@ -0,0 +1,99 @@
+dnl  mc68020 mpn_add_n, mpn_sub_n -- add or subtract limb vectors
+
+dnl  Copyright 1992, 1994, 1996, 1999-2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C 68040:      6
+
+ifdef(`OPERATION_add_n',`
+  define(M4_inst,       addxl)
+  define(M4_function_n, mpn_add_n)
+',`ifdef(`OPERATION_sub_n',`
+  define(M4_inst,       subxl)
+  define(M4_function_n, mpn_sub_n)
+',
+`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+
+C INPUT PARAMETERS
+C res_ptr	(sp + 4)
+C s1_ptr	(sp + 8)
+C s2_ptr	(sp + 12)
+C size		(sp + 16)
+
+
+PROLOGUE(M4_function_n)
+
+C Save used registers on the stack.
+	movel	d2, M(-,sp)
+	movel	a2, M(-,sp)
+
+C Copy the arguments to registers.  Better use movem?
+	movel	M(sp,12), a2
+	movel	M(sp,16), a0
+	movel	M(sp,20), a1
+	movel	M(sp,24), d2
+
+	eorw	#1, d2
+	lsrl	#1, d2
+	bcc	L(L1)
+	subql	#1, d2	C clears cy as side effect
+
+L(Loop):
+	movel	M(a0,+), d0
+	movel	M(a1,+), d1
+	M4_inst	d1, d0
+	movel	d0, M(a2,+)
+L(L1):	movel	M(a0,+), d0
+	movel	M(a1,+), d1
+	M4_inst	d1, d0
+	movel	d0, M(a2,+)
+
+	dbf	d2, L(Loop)		C loop until 16 lsb of d2 == -1
+	subxl	d0, d0			C d0 <= -cy; save cy as 0 or -1 in d0
+	subl	#0x10000, d2
+	bcs	L(L2)
+	addl	d0, d0			C restore cy
+	bra	L(Loop)
+
+L(L2):
+	negl	d0
+
+C Restore used registers from stack frame.
+	movel	M(sp,+), a2
+	movel	M(sp,+), d2
+
+	rts
+
+EPILOGUE(M4_function_n)
diff --git a/third_party/gmp/mpn/m68k/gmp-mparam.h b/third_party/gmp/mpn/m68k/gmp-mparam.h
new file mode 100644
index 0000000..9ac7b41
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/gmp-mparam.h
@@ -0,0 +1,76 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2000-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 25MHz 68040 */
+
+/* Generated by tuneup.c, 2004-02-05, gcc 3.2 */
+
+#define MUL_TOOM22_THRESHOLD             14
+#define MUL_TOOM33_THRESHOLD             90
+
+#define SQR_BASECASE_THRESHOLD            5
+#define SQR_TOOM2_THRESHOLD              28
+#define SQR_TOOM3_THRESHOLD              98
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 55
+#define POWM_THRESHOLD                   65
+
+#define HGCD_THRESHOLD                  116
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                590
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIVREM_1_UNNORM_THRESHOLD     MP_SIZE_T_MAX  /* never */
+#define MOD_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define MOD_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  0
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define MODEXACT_1_ODD_THRESHOLD      MP_SIZE_T_MAX  /* never */
+
+#define GET_STR_DC_THRESHOLD             18
+#define GET_STR_PRECOMPUTE_THRESHOLD     43
+#define SET_STR_THRESHOLD               937
+
+#define MUL_FFT_TABLE  { 336, 672, 1408, 3584, 10240, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD          296
+#define MUL_FFT_THRESHOLD              1728
+
+#define SQR_FFT_TABLE  { 336, 736, 1408, 3584, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD          296
+#define SQR_FFT_THRESHOLD              2304
diff --git a/third_party/gmp/mpn/m68k/lshift.asm b/third_party/gmp/mpn/m68k/lshift.asm
new file mode 100644
index 0000000..f202abf
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/lshift.asm
@@ -0,0 +1,175 @@
+dnl  mc68020 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1996, 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C           cycles/limb
+C        shift==1  shift>1
+C 68040:    5         12
+
+
+C mp_limb_t mpn_lshift (mp_ptr res_ptr, mp_srcptr s_ptr, mp_size_t s_size,
+C                       unsigned cnt);
+C
+C The "cnt" parameter is either 16 bits or 32 bits depending on
+C SIZEOF_UNSIGNED (see ABI notes in mpn/m68k/README).  The value is of
+C course only 1 to 31.  When loaded as 16 bits there's garbage in the upper
+C half, hence the use of cmpw.  The shift instructions take the their count
+C modulo 64, so the upper part doesn't matter to them either.
+C
+
+C INPUT PARAMETERS
+C res_ptr	(sp + 4)
+C s_ptr		(sp + 8)
+C s_size	(sp + 12)
+C cnt		(sp + 16)
+
+define(res_ptr, `a1')
+define(s_ptr,   `a0')
+define(s_size,  `d6')
+define(cnt,     `d4')
+
+ifdef(`SIZEOF_UNSIGNED',,
+`m4_error(`SIZEOF_UNSIGNED not defined, should be in config.m4
+')')
+
+PROLOGUE(mpn_lshift)
+C Save used registers on the stack.
+	moveml	d2-d6/a2, M(-,sp)
+
+C Copy the arguments to registers.
+	movel	M(sp,28), res_ptr
+	movel	M(sp,32), s_ptr
+	movel	M(sp,36), s_size
+ifelse(SIZEOF_UNSIGNED,2,
+`	movew	M(sp,40), cnt',
+`	movel	M(sp,40), cnt')
+
+	moveql	#1, d5
+	cmpw	d5, cnt
+	bne	L(Lnormal)
+	cmpl	s_ptr, res_ptr
+	bls	L(Lspecial)		C jump if s_ptr >= res_ptr
+
+ifelse(scale_available_p,1,`
+	lea	M(s_ptr,s_size,l,4), a2
+',`
+	movel	s_size, d0
+	asll	#2, d0
+	lea	M(s_ptr,d0,l), a2
+')
+	cmpl	res_ptr, a2
+	bls	L(Lspecial)		C jump if res_ptr >= s_ptr + s_size
+
+L(Lnormal):
+	moveql	#32, d5
+	subl	cnt, d5
+
+ifelse(scale_available_p,1,`
+	lea	M(s_ptr,s_size,l,4), s_ptr
+	lea	M(res_ptr,s_size,l,4), res_ptr
+',`
+	movel	s_size, d0
+	asll	#2, d0
+	addl	d0, s_ptr
+	addl	d0, res_ptr
+')
+	movel	M(-,s_ptr), d2
+	movel	d2, d0
+	lsrl	d5, d0		C compute carry limb
+
+	lsll	cnt, d2
+	movel	d2, d1
+	subql	#1, s_size
+	beq	L(Lend)
+	lsrl	#1, s_size
+	bcs	L(L1)
+	subql	#1, s_size
+
+L(Loop):
+	movel	M(-,s_ptr), d2
+	movel	d2, d3
+	lsrl	d5, d3
+	orl	d3, d1
+	movel	d1, M(-,res_ptr)
+	lsll	cnt, d2
+L(L1):
+	movel	M(-,s_ptr), d1
+	movel	d1, d3
+	lsrl	d5, d3
+	orl	d3, d2
+	movel	d2, M(-,res_ptr)
+	lsll	cnt, d1
+
+	dbf	s_size, L(Loop)
+	subl	#0x10000, s_size
+	bcc	L(Loop)
+
+L(Lend):
+	movel	d1, M(-,res_ptr)	C store least significant limb
+
+C Restore used registers from stack frame.
+	moveml	M(sp,+), d2-d6/a2
+	rts
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(Lspecial):
+	clrl	d0			C initialize carry
+	eorw	#1, s_size
+	lsrl	#1, s_size
+	bcc	L(LL1)
+	subql	#1, s_size
+
+L(LLoop):
+	movel	M(s_ptr,+), d2
+	addxl	d2, d2
+	movel	d2, M(res_ptr,+)
+L(LL1):
+	movel	M(s_ptr,+), d2
+	addxl	d2, d2
+	movel	d2, M(res_ptr,+)
+
+	dbf	s_size, L(LLoop)
+	addxl	d0, d0		C save cy in lsb
+	subl	#0x10000, s_size
+	bcs	L(LLend)
+	lsrl	#1, d0		C restore cy
+	bra	L(LLoop)
+
+L(LLend):
+C Restore used registers from stack frame.
+	moveml	M(sp,+), d2-d6/a2
+	rts
+
+EPILOGUE(mpn_lshift)
diff --git a/third_party/gmp/mpn/m68k/m68k-defs.m4 b/third_party/gmp/mpn/m68k/m68k-defs.m4
new file mode 100644
index 0000000..15289f6
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/m68k-defs.m4
@@ -0,0 +1,251 @@
+divert(-1)
+
+dnl  m4 macros for 68k assembler.
+
+dnl  Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The default m4 `#' commenting interferes with the assembler syntax for
+dnl  immediates.  `|' would be correct, but it interferes with "||" in
+dnl  eval().  Would like to disable commenting, but that's not possible (see
+dnl  mpn/asm-defs.m4), so use `;' which should be harmless.
+
+changecom(;)
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  Same as the standard PROLOGUE, but align to 2 bytes not 4.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+`	TEXT
+	ALIGN(2)
+	GLOBL	`$1' GLOBL_ATTR
+	TYPE(`$1',`function')
+`$1'LABEL_SUFFIX')
+
+
+dnl  Usage: d0, etc
+dnl
+dnl  Expand to d0 or %d0 according to the assembler's requirements.
+dnl
+dnl  Actually d0 expands to `d0' or %`d0', the quotes protecting against
+dnl  further expansion.  Definitions are made even if d0 is to be just `d0',
+dnl  so that any m4 quoting problems will show up everywhere, not just on a
+dnl  %d0 system.
+dnl
+dnl  Care must be taken with quoting when using these in a definition.  For
+dnl  instance the quotes in the following are essential or two %'s will be
+dnl  produced when `counter' is used.
+dnl
+dnl         define(counter, `d7')
+dnl
+
+dnl  Called: m68k_reg(r)
+define(m68k_reg,
+m4_assert_numargs(1)
+m4_assert_defined(`WANT_REGISTER_PERCENT')
+`ifelse(WANT_REGISTER_PERCENT,yes,%)`$1'')
+
+dnl  Usage: m68k_defreg(r)
+define(m68k_defreg,
+m4_assert_numargs(1)
+`deflit($1,`m68k_reg(`$1')')')
+
+m68k_defreg(d0)
+m68k_defreg(d1)
+m68k_defreg(d2)
+m68k_defreg(d3)
+m68k_defreg(d4)
+m68k_defreg(d5)
+m68k_defreg(d6)
+m68k_defreg(d7)
+
+m68k_defreg(a0)
+m68k_defreg(a1)
+m68k_defreg(a2)
+m68k_defreg(a3)
+m68k_defreg(a4)
+m68k_defreg(a5)
+m68k_defreg(a6)
+m68k_defreg(a7)
+
+m68k_defreg(sp)
+m68k_defreg(pc)
+
+
+dnl  Usage: M(base)
+dnl         M(base,displacement)
+dnl         M(base,index,size)
+dnl         M(base,index,size,scale)
+dnl         M(base,+)
+dnl         M(-,base)
+dnl
+dnl  `base' is an address register, `index' is a data register, `size' is w
+dnl  or l, and scale is 1, 2, 4 or 8.
+dnl
+dnl  M(-,base) has its arguments that way around to emphasise it's a
+dnl  pre-decrement, as opposed to M(base,+), a post-increment.
+dnl
+dnl  Enhancement: Add the memory indirect modes, if/when they're needed.
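+dnl
+dnl  For illustration, the expansions (read directly off the definition
+dnl  below) are
+dnl
+dnl	M(sp,4)       ->  sp@(4)        or  4(sp)
+dnl	M(a0,+)       ->  a0@+          or  (a0)+
+dnl	M(-,sp)       ->  sp@-          or  -(sp)
+dnl	M(a0,d0,l,4)  ->  a0@(d0:l:4)   or  (a0,d0.l*4)
+dnl
+dnl  for WANT_ADDRESSING mit and motorola respectively.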
+
+define(M,
+m4_assert_numargs_range(1,4)
+m4_assert_defined(`WANT_ADDRESSING')
+`ifelse(WANT_ADDRESSING,mit,
+`ifelse($#,1, ``$1'@')dnl
+ifelse($#,2,
+`ifelse($2,+, ``$1'@+',
+`ifelse($1,-, ``$2'@-',
+              ``$1'@($2)')')')dnl
+ifelse($#,3,  ``$1'@(`$2':`$3')')dnl
+ifelse($#,4,  ``$1'@(`$2':`$3':$4)')',
+
+dnl  WANT_ADDRESSING `motorola'
+`ifelse($#,1, `(`$1')')dnl
+ifelse($#,2,
+`ifelse($2,+, `(`$1')+',
+`ifelse($1,-, `-(`$2')',
+              `$2(`$1')')')')dnl
+ifelse($#,3,  `(`$1',`$2'.$3)')dnl
+ifelse($#,4,  `(`$1',`$2'.$3*$4)')')')
+
+
+dnl  Usage: addl etc
+dnl
+dnl  m68k instructions with special handling for the suffix, with for
+dnl  instance addl expanding to addl or add.l as necessary.
+dnl
+dnl  See also t-m68k-defs.pl which verifies all mnemonics used in the asm
+dnl  files have entries here.
+
+dnl  Called: m68k_insn(mnemonic,suffix)
+define(m68k_insn,
+m4_assert_numargs(2)
+m4_assert_defined(`WANT_DOT_SIZE')
+`ifelse(WANT_DOT_SIZE,yes, ``$1'.``$2''',
+                           ``$1$2'')')
+
+dnl  Usage: m68k_definsn(mnemonic,suffix)
+define(m68k_definsn,
+m4_assert_numargs(2)
+`deflit($1`'$2,`m68k_insn(`$1',`$2')')')
+
+m68k_definsn(add,  l)
+m68k_definsn(addx, l)
+m68k_definsn(addq, l)
+m68k_definsn(asl,  l)
+m68k_definsn(cmp,  l)
+m68k_definsn(cmp,  w)
+m68k_definsn(clr,  l)
+m68k_definsn(divu, l)
+m68k_definsn(eor,  w)
+m68k_definsn(lsl,  l)
+m68k_definsn(lsr,  l)
+m68k_definsn(move, l)
+m68k_definsn(move, w)
+m68k_definsn(movem,l)
+m68k_definsn(moveq,l)
+m68k_definsn(mulu, l)
+m68k_definsn(neg,  l)
+m68k_definsn(or,   l)
+m68k_definsn(roxl, l)
+m68k_definsn(roxr, l)
+m68k_definsn(sub,  l)
+m68k_definsn(subx, l)
+m68k_definsn(subq, l)
+
+
+dnl  Usage: bra etc
+dnl
+dnl  Expand to `bra', `jra' or `jbra' according to what the assembler will
+dnl  accept.  The latter two give variable-sized branches in gas.
+dnl
+dnl  See also t-m68k-defs.pl which verifies all the bXX branches used in the
+dnl  asm files have entries here.
+
+dnl  Called: m68k_branch(cond)
+define(m68k_branch,
+m4_assert_numargs(1)
+m4_assert_defined(`WANT_BRANCHES')
+`ifelse(WANT_BRANCHES,jra, `j$1',
+`ifelse(WANT_BRANCHES,jbra,`jb$1',
+                           ``b$1'')')')
+
+dnl  Called: m68k_defbranch(cond)
+define(m68k_defbranch,
+m4_assert_numargs(1)
+`deflit(b$1,`m68k_branch(`$1')')')
+
+m68k_defbranch(ra)
+m68k_defbranch(cc)
+m68k_defbranch(cs)
+m68k_defbranch(ls)
+m68k_defbranch(eq)
+m68k_defbranch(ne)
+
+
+dnl  Usage: scale_available_p
+dnl
+dnl  Expand to 1 if a scale factor can be used in addressing modes, or 0 if
+dnl  not.  M(a0,d0,l,4), meaning a0+d0*4, is not available in 68000 or
+dnl  68010, but is in CPU32 and in 68020 and up.
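+dnl
+dnl  For instance mpn/m68k/lshift.asm selects between the scaled and
+dnl  unscaled forms with
+dnl
+dnl	ifelse(scale_available_p,1,`
+dnl		lea	M(s_ptr,s_size,l,4), a2
+dnl	',`
+dnl		movel	s_size, d0
+dnl		asll	#2, d0
+dnl		lea	M(s_ptr,d0,l), a2
+dnl	')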
+
+define(scale_available_p,
+`m4_ifdef_anyof_p(
+`HAVE_HOST_CPU_m68360'
+`HAVE_HOST_CPU_m68020'
+`HAVE_HOST_CPU_m68030'
+`HAVE_HOST_CPU_m68040'
+`HAVE_HOST_CPU_m68060')')
+
+
+divert
diff --git a/third_party/gmp/mpn/m68k/mc68020/aorsmul_1.asm b/third_party/gmp/mpn/m68k/mc68020/aorsmul_1.asm
new file mode 100644
index 0000000..4ee30ad
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/mc68020/aorsmul_1.asm
@@ -0,0 +1,101 @@
+dnl  mc68020 mpn_addmul_1, mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1992, 1994, 1996, 1999-2002, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C 68040:     25
+
+ifdef(`OPERATION_addmul_1',`
+  define(M4_inst,       addl)
+  define(M4_function_1, mpn_addmul_1)
+',`ifdef(`OPERATION_submul_1',`
+  define(M4_inst,       subl)
+  define(M4_function_1, mpn_submul_1)
+',
+`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+
+C INPUT PARAMETERS
+C res_ptr	(sp + 4)
+C s1_ptr	(sp + 8)
+C s1_size	(sp + 12)
+C s2_limb	(sp + 16)
+
+define(res_ptr, `a0')
+define(s1_ptr,  `a1')
+define(s1_size, `d2')
+define(s2_limb, `d4')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+PROLOGUE(M4_function_1)
+
+C Save used registers on the stack.
+	moveml	d2-d5, M(-,sp)
+
+C Copy the arguments to registers.  Better use movem?
+	movel	M(sp,20), res_ptr
+	movel	M(sp,24), s1_ptr
+	movel	M(sp,28), s1_size
+	movel	M(sp,32), s2_limb
+
+	eorw	#1, s1_size
+	clrl	d1
+	clrl	d5
+	lsrl	#1, s1_size
+	bcc	L(L1)
+	subql	#1, s1_size
+	subl	d0, d0		C (d0,cy) <= (0,0)
+
+L(Loop):
+	movel	M(s1_ptr,+), d3
+	mulul	s2_limb, d1:d3
+	addxl	d0, d3
+	addxl	d5, d1
+	M4_inst	d3, M(res_ptr,+)
+L(L1):	movel	M(s1_ptr,+), d3
+	mulul	s2_limb, d0:d3
+	addxl	d1, d3
+	addxl	d5, d0
+	M4_inst	d3, M(res_ptr,+)
+
+	dbf	s1_size, L(Loop)
+	addxl	d5, d0
+	subl	#0x10000, s1_size
+	bcc	L(Loop)
+
+C Restore used registers from stack frame.
+	moveml	M(sp,+), d2-d5
+
+	rts
+
+EPILOGUE(M4_function_1)
diff --git a/third_party/gmp/mpn/m68k/mc68020/mul_1.asm b/third_party/gmp/mpn/m68k/mc68020/mul_1.asm
new file mode 100644
index 0000000..f5fbb30
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/mc68020/mul_1.asm
@@ -0,0 +1,96 @@
+dnl  mc68020 mpn_mul_1 -- mpn by limb multiply
+
+dnl  Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C 68040:     24
+
+C INPUT PARAMETERS
+C res_ptr	(sp + 4)
+C s1_ptr	(sp + 8)
+C s1_size	(sp + 12)
+C s2_limb	(sp + 16)
+
+
+define(res_ptr, `a0')
+define(s1_ptr,  `a1')
+define(s1_size, `d2')
+define(s2_limb, `d4')
+
+
+PROLOGUE(mpn_mul_1)
+
+C Save used registers on the stack.
+	moveml	d2-d4, M(-,sp)
+
+C	movel	d2, M(-,sp)
+C	movel	d3, M(-,sp)
+C	movel	d4, M(-,sp)
+
+C Copy the arguments to registers.  Better use movem?
+	movel	M(sp,16), res_ptr
+	movel	M(sp,20), s1_ptr
+	movel	M(sp,24), s1_size
+	movel	M(sp,28), s2_limb
+
+	eorw	#1, s1_size
+	clrl	d1
+	lsrl	#1, s1_size
+	bcc	L(L1)
+	subql	#1, s1_size
+	subl	d0, d0		C (d0,cy) <= (0,0)
+
+L(Loop):
+	movel	M(s1_ptr,+), d3
+	mulul	s2_limb, d1:d3
+	addxl	d0, d3
+	movel	d3, M(res_ptr,+)
+L(L1):	movel	M(s1_ptr,+), d3
+	mulul	s2_limb, d0:d3
+	addxl	d1, d3
+	movel	d3, M(res_ptr,+)
+
+	dbf	s1_size, L(Loop)
+	clrl	d3
+	addxl	d3, d0
+	subl	#0x10000, s1_size
+	bcc	L(Loop)
+
+C Restore used registers from stack frame.
+	moveml	M(sp,+), d2-d4
+
+C	movel	M(sp,+),d4
+C	movel	M(sp,+),d3
+C	movel	M(sp,+),d2
+
+	rts
+
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/m68k/mc68020/udiv.asm b/third_party/gmp/mpn/m68k/mc68020/udiv.asm
new file mode 100644
index 0000000..aadeab9
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/mc68020/udiv.asm
@@ -0,0 +1,45 @@
+dnl  mc68020 mpn_udiv_qrnnd -- 2x1 limb division
+
+dnl  Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *rp,
+C                           mp_limb_t nh, mp_limb_t nl, mp_limb_t d);
+C
+
+PROLOGUE(mpn_udiv_qrnnd)
+	movel	M(sp,4), a0	C rp
+	movel	M(sp,8), d1	C nh
+	movel	M(sp,12), d0	C nl
+	divul	M(sp,16), d1:d0
+	movel	d1, M(a0)	C r
+	rts
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/third_party/gmp/mpn/m68k/mc68020/umul.asm b/third_party/gmp/mpn/m68k/mc68020/umul.asm
new file mode 100644
index 0000000..f19314e
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/mc68020/umul.asm
@@ -0,0 +1,44 @@
+dnl  mc68020 mpn_umul_ppmm -- limb by limb multiplication
+
+dnl  Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lp, mp_limb_t x, mp_limb_t y);
+C
+
+PROLOGUE(mpn_umul_ppmm)
+	movel	M(sp,4), a0	C lp
+	movel	M(sp,8), d1	C x
+	movel	M(sp,12), d0	C y
+	mulul	d0, d0:d1
+	movel	d1, M(a0)	C low
+	rts
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/m68k/rshift.asm b/third_party/gmp/mpn/m68k/rshift.asm
new file mode 100644
index 0000000..21b5f89
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/rshift.asm
@@ -0,0 +1,175 @@
+dnl  mc68020 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1996, 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C           cycles/limb
+C        shift==1  shift>1
+C 68040:    9         12
+
+
+C mp_limb_t mpn_rshift (mp_ptr res_ptr, mp_srcptr s_ptr, mp_size_t s_size,
+C                       unsigned cnt);
+C
+C The "cnt" parameter is either 16 bits or 32 bits depending on
+C SIZEOF_UNSIGNED (see ABI notes in mpn/m68k/README).  The value is of
+C course only 1 to 31.  When loaded as 16 bits there's garbage in the upper
+C half, hence the use of cmpw.  The shift instructions take their count
+C modulo 64, so the upper part doesn't matter to them either.
+C
+
+C INPUT PARAMETERS
+C res_ptr	(sp + 4)
+C s_ptr		(sp + 8)
+C s_size	(sp + 12)
+C cnt		(sp + 16)
+
+define(res_ptr, `a1')
+define(s_ptr,   `a0')
+define(s_size,  `d6')
+define(cnt,     `d4')
+
+ifdef(`SIZEOF_UNSIGNED',,
+`m4_error(`SIZEOF_UNSIGNED not defined, should be in config.m4
+')')
+
+PROLOGUE(mpn_rshift)
+C Save used registers on the stack.
+	moveml	d2-d6/a2, M(-,sp)
+
+C Copy the arguments to registers.
+	movel	M(sp,28), res_ptr
+	movel	M(sp,32), s_ptr
+	movel	M(sp,36), s_size
+ifelse(SIZEOF_UNSIGNED,2,
+`	movew	M(sp,40), cnt',
+`	movel	M(sp,40), cnt')
+
+	moveql	#1, d5
+	cmpw	d5, cnt
+	bne	L(Lnormal)
+	cmpl	res_ptr, s_ptr
+	bls	L(Lspecial)		C jump if res_ptr >= s_ptr
+
+ifelse(scale_available_p,1,`
+	lea	M(res_ptr,s_size,l,4), a2
+',`
+	movel	s_size, d0
+	asll	#2, d0
+	lea	M(res_ptr,d0,l), a2
+')
+	cmpl	s_ptr, a2
+	bls	L(Lspecial)		C jump if s_ptr >= res_ptr + s_size
+
+L(Lnormal):
+	moveql	#32, d5
+	subl	cnt, d5
+	movel	M(s_ptr,+), d2
+	movel	d2, d0
+	lsll	d5, d0		C compute carry limb
+
+	lsrl	cnt, d2
+	movel	d2, d1
+	subql	#1, s_size
+	beq	L(Lend)
+	lsrl	#1, s_size
+	bcs	L(L1)
+	subql	#1, s_size
+
+L(Loop):
+	movel	M(s_ptr,+), d2
+	movel	d2, d3
+	lsll	d5, d3
+	orl	d3, d1
+	movel	d1, M(res_ptr,+)
+	lsrl	cnt, d2
+L(L1):
+	movel	M(s_ptr,+), d1
+	movel	d1, d3
+	lsll	d5, d3
+	orl	d3, d2
+	movel	d2, M(res_ptr,+)
+	lsrl	cnt, d1
+
+	dbf	s_size, L(Loop)
+	subl	#0x10000, s_size
+	bcc	L(Loop)
+
+L(Lend):
+	movel	d1, M(res_ptr)	C store most significant limb
+
+C Restore used registers from stack frame.
+	moveml	M(sp,+), d2-d6/a2
+	rts
+
+C We loop from most significant end of the arrays, which is only permissible
+C if the source and destination don't overlap, since the function is
+C documented to work for overlapping source and destination.
+
+L(Lspecial):
+ifelse(scale_available_p,1,`
+	lea	M(s_ptr,s_size,l,4), s_ptr
+	lea	M(res_ptr,s_size,l,4), res_ptr
+',`
+	movel	s_size, d0
+	asll	#2, d0
+	addl	d0, s_ptr
+	addl	d0, res_ptr
+')
+
+	clrl	d0			C initialize carry
+	eorw	#1, s_size
+	lsrl	#1, s_size
+	bcc	L(LL1)
+	subql	#1, s_size
+
+L(LLoop):
+	movel	M(-,s_ptr), d2
+	roxrl	#1, d2
+	movel	d2, M(-,res_ptr)
+L(LL1):
+	movel	M(-,s_ptr), d2
+	roxrl	#1, d2
+	movel	d2, M(-,res_ptr)
+
+	dbf	s_size, L(LLoop)
+	roxrl	#1, d0		C save cy in msb
+	subl	#0x10000, s_size
+	bcs	L(LLend)
+	addl	d0, d0		C restore cy
+	bra	L(LLoop)
+
+L(LLend):
+C Restore used registers from stack frame.
+	moveml	M(sp,+), d2-d6/a2
+	rts
+
+EPILOGUE(mpn_rshift)
diff --git a/third_party/gmp/mpn/m68k/t-m68k-defs.pl b/third_party/gmp/mpn/m68k/t-m68k-defs.pl
new file mode 100644
index 0000000..91c21fa
--- /dev/null
+++ b/third_party/gmp/mpn/m68k/t-m68k-defs.pl
@@ -0,0 +1,96 @@
+#! /usr/bin/perl -w
+
+# Copyright 2001, 2003 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage:  perl t-m68k-defs.pl [-t]
+#
+# Run this in the mpn/m68k source directory to check that m68k-defs.m4 has
+# m68k_defbranch()s or m68k_definsn()s for each instruction used in *.asm
+# and */*.asm.  It prints nothing if everything is ok.  The -t option prints
+# some diagnostic traces.
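+#
+# A complaint, if one were produced, looks like this (hypothetical file
+# and opcode, with the format coming from the print statements below):
+#
+#   foo.asm: 12: missing m68k_definsn: addw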
+
+use strict;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+my %branch;
+my %insn;
+
+open(FD, "<m68k-defs.m4")
+    or die "Cannot open m68k-defs.m4: $!\nIs this the mpn/m68k source directory?\n";
+my ($srcdir, $top_srcdir);
+while (<FD>) {
+    if (/^m68k_defbranch\(\s*(.*)\)/) { $branch{"b".$1} = 1; }
+    if (/^m68k_definsn\(\s*(.*),\s*(.*)\)/) { $insn{$1.$2} = 1; }
+}
+close(FD);
+
+print "branches: ", join(" ",keys(%branch)), "\n" if $opt{'t'};
+print "insns: ", join(" ",keys(%insn)), "\n" if $opt{'t'};
+
+
+foreach my $file (glob("*.asm"), glob("*/*.asm")) {
+    print "file $file\n" if $opt{'t'};
+
+    open(FD, "<$file") or die "Cannot open $file: $!";
+    while (<FD>) {
+	if (/^[ \t]*C/) { next; };
+	if (/^\t([a-z0-9]+)/) {
+	    my $opcode = $1;
+	    print "opcode $1\n" if $opt{'t'};
+
+	    # instructions with an l, w or b suffix should have a definsn
+	    # (unless they're already a defbranch)
+	    if ($opcode =~ /[lwb]$/
+		&& ! defined $insn{$opcode}
+		&& ! defined $branch{$opcode})
+	    {
+		print "$file: $.: missing m68k_definsn: $opcode\n";
+	    }
+
+	    # instructions bXX should have a defbranch (unless they're
+	    # already a definsn)
+	    if ($opcode =~ /^b/
+		&& ! defined $insn{$opcode}
+		&& ! defined $branch{$opcode})
+	    {
+		print "$file: $.: missing m68k_defbranch: $opcode\n";
+	    }
+	}
+    }
+    close(FD);
+}
diff --git a/third_party/gmp/mpn/m88k/README b/third_party/gmp/mpn/m88k/README
new file mode 100644
index 0000000..1b51e83
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/README
@@ -0,0 +1,61 @@
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                      M88K MPN SUBROUTINES
+
+This directory contains mpn functions for various m88k family chips.
+
+CODE ORGANIZATION
+
+	m88k             m88000, m88100
+	m88k/mc88110     m88110
+
+STATUS
+
+The code herein is old and poorly maintained.
+
+* The .s files assume the system uses a "_" underscore prefix, which
+  should be controlled by configure.
+
+* The mc88110/*.S files are using the defunct "sysdep.h" configuration
+  scheme and won't compile.
+
+Conversion to the current m4 .asm style wouldn't be difficult.
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/m88k/add_n.s b/third_party/gmp/mpn/m88k/add_n.s
new file mode 100644
index 0000000..dbdb22f
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/add_n.s
@@ -0,0 +1,113 @@
+; mc88100 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; s2_ptr	r4
+; size		r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+	text
+	align	 16
+	global	 ___gmpn_add_n
+___gmpn_add_n:
+	ld	r6,r3,0			; read first limb from s1_ptr
+	extu	r10,r5,3
+	ld	r7,r4,0			; read first limb from s2_ptr
+
+	subu.co	r5,r0,r5		; (clear carry as side effect)
+	mak	r5,r5,3<4>
+	bcnd	eq0,r5,Lzero
+
+	or	r12,r0,lo16(Lbase)
+	or.u	r12,r12,hi16(Lbase)
+	addu	r12,r12,r5		; r12 is the address for entering the loop
+
+	extu	r5,r5,2			; divide by 4
+	subu	r2,r2,r5		; adjust res_ptr
+	subu	r3,r3,r5		; adjust s1_ptr
+	subu	r4,r4,r5		; adjust s2_ptr
+
+	or	r8,r6,r0
+
+	jmp.n	r12
+	 or	r9,r7,r0
+
+Loop:	addu	r3,r3,32
+	st	r8,r2,28
+	addu	r4,r4,32
+	ld	r6,r3,0
+	addu	r2,r2,32
+	ld	r7,r4,0
+Lzero:	subu	r10,r10,1		; add 0 + 8r limbs (adj loop cnt)
+Lbase:	ld	r8,r3,4
+	addu.cio r6,r6,r7
+	ld	r9,r4,4
+	st	r6,r2,0
+	ld	r6,r3,8			; add 7 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,8
+	st	r8,r2,4
+	ld	r8,r3,12		; add 6 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,12
+	st	r6,r2,8
+	ld	r6,r3,16		; add 5 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,16
+	st	r8,r2,12
+	ld	r8,r3,20		; add 4 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,20
+	st	r6,r2,16
+	ld	r6,r3,24		; add 3 + 8r limbs
+	addu.cio r8,r8,r9
+	ld	r7,r4,24
+	st	r8,r2,20
+	ld	r8,r3,28		; add 2 + 8r limbs
+	addu.cio r6,r6,r7
+	ld	r9,r4,28
+	st	r6,r2,24
+	bcnd.n	ne0,r10,Loop		; add 1 + 8r limbs
+	 addu.cio r8,r8,r9
+
+	st	r8,r2,28		; store most significant limb
+
+	jmp.n	 r1
+	 addu.ci r2,r0,r0		; return carry-out from most sign. limb
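
For reference, a portable C sketch of what ___gmpn_add_n computes follows (an illustration assuming 32-bit limbs, not GMP's actual generic code). The assembly above additionally enters the 8-way unrolled loop at a computed offset, via the mak/jmp.n sequence, so the size mod 8 leftover limbs need no separate cleanup loop.

    #include <stdint.h>

    /* Reference semantics of mpn_add_n (sketch, 32-bit limbs assumed):
       rp[] = s1[] + s2[], returning the carry out of the top limb.  */
    uint32_t ref_add_n(uint32_t *rp, const uint32_t *s1,
                       const uint32_t *s2, long n)
    {
        uint32_t cy = 0;                /* models the 88100 carry bit */
        for (long i = 0; i < n; i++) {
            uint32_t s = s1[i] + s2[i];
            uint32_t c = s < s1[i];     /* carry from the limb add */
            s += cy;
            c |= s < cy;                /* carry from adding cy in */
            rp[i] = s;                  /* one addu.cio plus a st */
            cy = c;
        }
        return cy;
    }
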
diff --git a/third_party/gmp/mpn/m88k/mc88110/add_n.S b/third_party/gmp/mpn/m88k/mc88110/add_n.S
new file mode 100644
index 0000000..c3b12b3
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/mc88110/add_n.S
@@ -0,0 +1,209 @@
+; mc88110 __gmpn_add_n -- Add two limb vectors of the same length > 0 and store
+; sum in a third limb vector.
+
+; Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+#define res_ptr	r2
+#define s1_ptr	r3
+#define s2_ptr	r4
+#define size	r5
+
+#include "sysdep.h"
+
+	text
+	align	16
+	global	C_SYMBOL_NAME(__gmpn_add_n)
+C_SYMBOL_NAME(__gmpn_add_n):
+	addu.co	 r0,r0,r0		; clear cy flag
+	xor	 r12,s2_ptr,res_ptr
+	bb1	 2,r12,L1
+; **  V1a  **
+L0:	bb0	 2,res_ptr,L_v1		; branch if res_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	addu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+L_v1:	cmp	 r12,size,2
+	bb1	 lt,r12,Lend2
+
+	ld	 r10,s1_ptr,0
+	ld	 r12,s1_ptr,4
+	ld.d	 r8,s2_ptr,0
+	subu	 size,size,10
+	bcnd	 lt0,size,Lfin1
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop1:	subu	 size,size,8
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,16
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,20
+	ld.d	 r8,s2_ptr,16
+	st.d	 r6,res_ptr,8
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,24
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,28
+	ld.d	 r8,s2_ptr,24
+	st.d	 r6,res_ptr,16
+	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,32
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,36
+	addu	 s1_ptr,s1_ptr,32
+	ld.d	 r8,s2_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	st.d	 r6,res_ptr,24
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop1
+
+Lfin1:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend1
+/* Add blocks of 2 limbs until less than 2 limbs remain */
+Loope1:	addu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	addu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope1
+Lend1:	addu.cio r6,r10,r8
+	addu.cio r7,r12,r9
+	st.d	 r6,res_ptr,0
+
+	bb0	 0,size,Lret1
+/* Add last limb */
+	ld	 r10,s1_ptr,8
+	ld	 r8,s2_ptr,8
+	addu.cio r6,r10,r8
+	st	 r6,res_ptr,8
+
+Lret1:	jmp.n	 r1
+	addu.ci	 r2,r0,r0		; return carry-out from most sign. limb
+
+L1:	xor	 r12,s1_ptr,res_ptr
+	bb1	 2,r12,L2
+; **  V1b  **
+	or	 r12,r0,s2_ptr
+	or	 s2_ptr,r0,s1_ptr
+	or	 s1_ptr,r0,r12
+	br	 L0
+
+; **  V2  **
+/* If we come here, s1_ptr and res_ptr differ in alignment, and so do
+   s2_ptr and res_ptr.  Since there are only two ways things can be
+   aligned (that we care about), we now know that s1_ptr and s2_ptr
+   have the same alignment.  */
+
+L2:	cmp	 r12,size,1
+	bb1	 eq,r12,Ljone
+	bb0	 2,s1_ptr,L_v2		; branch if s1_ptr is aligned
+/* Add least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	addu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+
+L_v2:	subu	 size,size,8
+	bcnd	 lt0,size,Lfin2
+/* Add blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop2:	subu	 size,size,8
+	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	ld.d	 r8,s1_ptr,8
+	ld.d	 r6,s2_ptr,8
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,8
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,12
+	ld.d	 r8,s1_ptr,16
+	ld.d	 r6,s2_ptr,16
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,16
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,20
+	ld.d	 r8,s1_ptr,24
+	ld.d	 r6,s2_ptr,24
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,24
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,28
+	addu	 s1_ptr,s1_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop2
+
+Lfin2:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend2
+Loope2:	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	addu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	addu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope2
+Lend2:	bb0	 0,size,Lret2
+/* Add last limb */
+Ljone:	ld	 r10,s1_ptr,0
+	ld	 r8,s2_ptr,0
+	addu.cio r6,r10,r8
+	st	 r6,res_ptr,0
+
+Lret2:	jmp.n	 r1
+	addu.ci	 r2,r0,r0		; return carry-out from most sign. limb
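
The V1a/V1b/V2 split above exists because ld.d and st.d want 8-byte-aligned addresses while limbs are 4 bytes. A hedged C sketch of the dispatch (the uintptr_t casts and the function itself are illustrative, not part of GMP):

    #include <stdint.h>

    /* Bit 2 of a 4-byte-aligned pointer says whether it is also 8-byte
       aligned, so XORing two pointers tells whether they can be
       co-aligned for double-word accesses.  */
    int alignment_case(const uint32_t *res_ptr, const uint32_t *s1_ptr,
                       const uint32_t *s2_ptr)
    {
        if ((((uintptr_t)s2_ptr ^ (uintptr_t)res_ptr) & 4) == 0)
            return 1;  /* V1a: co-align s2_ptr/res_ptr, at most one odd limb */
        if ((((uintptr_t)s1_ptr ^ (uintptr_t)res_ptr) & 4) == 0)
            return 2;  /* V1b: swap s1_ptr and s2_ptr, then run V1a */
        return 3;      /* V2: s1_ptr and s2_ptr share alignment instead */
    }
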
diff --git a/third_party/gmp/mpn/m88k/mc88110/addmul_1.s b/third_party/gmp/mpn/m88k/mc88110/addmul_1.s
new file mode 100644
index 0000000..321221f
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/mc88110/addmul_1.s
@@ -0,0 +1,70 @@
+; mc88110 __gmpn_addmul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright 1996, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+	text
+	align	16
+	global	___gmpn_addmul_1
+___gmpn_addmul_1:
+	lda	 r3,r3[r4]
+	lda	 r8,r2[r4]		; RES_PTR in r8 since r2 is retval
+	subu	 r4,r0,r4
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+
+	ld	 r6,r3[r4]
+	addu	 r4,r4,1
+	subu	 r8,r8,4
+	bcnd.n	 eq0,r4,Lend
+	 mulu.d	 r10,r6,r5
+
+Loop:	ld	 r7,r8[r4]
+	ld	 r6,r3[r4]
+	addu.cio r9,r11,r2
+	addu.ci	 r2,r10,r0
+	addu.co	 r9,r9,r7
+	st	 r9,r8[r4]
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5
+	bcnd	 ne0,r4,Loop
+
+Lend:	ld	 r7,r8,0
+	addu.cio r9,r11,r2
+	addu.ci	 r2,r10,r0
+	addu.co	 r9,r9,r7
+	st	 r9,r8,0
+	jmp.n	 r1
+	 addu.ci r2,r2,r0
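
Here mulu.d yields the full 64-bit product in the register pair r10:r11, which the loop folds into the destination limb and a running carry. A reference C sketch of the semantics (assuming 32-bit limbs; uint64_t stands in for the register pair):

    #include <stdint.h>

    /* Reference semantics of mpn_addmul_1 (sketch): rp[] += up[] * v,
       returning the final carry limb.  The 64-bit sum cannot overflow:
       (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1.  */
    uint32_t ref_addmul_1(uint32_t *rp, const uint32_t *up,
                          long n, uint32_t v)
    {
        uint32_t cy = 0;
        for (long i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v + rp[i] + cy;
            rp[i] = (uint32_t)p;       /* low half, like r11 */
            cy = (uint32_t)(p >> 32);  /* high half, like r10 */
        }
        return cy;
    }
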
diff --git a/third_party/gmp/mpn/m88k/mc88110/mul_1.s b/third_party/gmp/mpn/m88k/mc88110/mul_1.s
new file mode 100644
index 0000000..28fd14b
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/mc88110/mul_1.s
@@ -0,0 +1,68 @@
+; mc88110 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+	text
+	align	16
+	global	___gmpn_mul_1
+___gmpn_mul_1:
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda	 r3,r3[r4]
+	lda	 r8,r2[r4]		; RES_PTR in r8 since r2 is retval
+	subu	 r4,r0,r4
+
+	addu.co	 r2,r0,r0		; r2 = cy = 0
+
+	ld	 r6,r3[r4]
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5
+	bcnd.n	 eq0,r4,Lend
+	 subu	 r8,r8,8
+
+Loop:	ld	 r6,r3[r4]
+	addu.cio r9,r11,r2
+	or	 r2,r10,r0		; could be avoided if unrolled
+	addu	 r4,r4,1
+	mulu.d	 r10,r6,r5
+	bcnd.n	 ne0,r4,Loop
+	 st	 r9,r8[r4]
+
+Lend:	addu.cio r9,r11,r2
+	st	 r9,r8,4
+	jmp.n	 r1
+	 addu.ci r2,r10,r0
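
mul_1 is the same recurrence as addmul_1 above, minus the read of the destination; a matching sketch under the same assumptions:

    #include <stdint.h>

    /* Reference semantics of mpn_mul_1 (sketch): rp[] = up[] * v,
       returning the final carry limb.  */
    uint32_t ref_mul_1(uint32_t *rp, const uint32_t *up,
                       long n, uint32_t v)
    {
        uint32_t cy = 0;
        for (long i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v + cy;
            rp[i] = (uint32_t)p;
            cy = (uint32_t)(p >> 32);
        }
        return cy;
    }
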
diff --git a/third_party/gmp/mpn/m88k/mc88110/sub_n.S b/third_party/gmp/mpn/m88k/mc88110/sub_n.S
new file mode 100644
index 0000000..f0a8ecb
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/mc88110/sub_n.S
@@ -0,0 +1,285 @@
+; mc88110 __gmpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+#define res_ptr	r2
+#define s1_ptr	r3
+#define s2_ptr	r4
+#define size	r5
+
+#include "sysdep.h"
+
+	text
+	align	16
+	global	C_SYMBOL_NAME(__gmpn_sub_n)
+C_SYMBOL_NAME(__gmpn_sub_n):
+	subu.co	 r0,r0,r0		; set cy flag
+	xor	 r12,s2_ptr,res_ptr
+	bb1	 2,r12,L1
+; **  V1a  **
+L0:	bb0	 2,res_ptr,L_v1		; branch if res_ptr is aligned
+/* Subtract least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	subu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+L_v1:	cmp	 r12,size,2
+	bb1	 lt,r12,Lend2
+
+	ld	 r10,s1_ptr,0
+	ld	 r12,s1_ptr,4
+	ld.d	 r8,s2_ptr,0
+	subu	 size,size,10
+	bcnd	 lt0,size,Lfin1
+/* Subtract blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop1:	subu	 size,size,8
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,16
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,20
+	ld.d	 r8,s2_ptr,16
+	st.d	 r6,res_ptr,8
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,24
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,28
+	ld.d	 r8,s2_ptr,24
+	st.d	 r6,res_ptr,16
+	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,32
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,36
+	addu	 s1_ptr,s1_ptr,32
+	ld.d	 r8,s2_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	st.d	 r6,res_ptr,24
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop1
+
+Lfin1:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend1
+/* Subtract blocks of 2 limbs until less than 2 limbs remain */
+Loope1:	subu.cio r6,r10,r8
+	ld	 r10,s1_ptr,8
+	subu.cio r7,r12,r9
+	ld	 r12,s1_ptr,12
+	ld.d	 r8,s2_ptr,8
+	st.d	 r6,res_ptr,0
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope1
+Lend1:	subu.cio r6,r10,r8
+	subu.cio r7,r12,r9
+	st.d	 r6,res_ptr,0
+
+	bb0	 0,size,Lret1
+/* Subtract last limb */
+	ld	 r10,s1_ptr,8
+	ld	 r8,s2_ptr,8
+	subu.cio r6,r10,r8
+	st	 r6,res_ptr,8
+
+Lret1:	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
+
+L1:	xor	 r12,s1_ptr,res_ptr
+	bb1	 2,r12,L2
+; **  V1b  **
+	bb0	 2,res_ptr,L_v1b	; branch if res_ptr is aligned
+/* Subtract least significant limb separately to align res_ptr and s1_ptr */
+	ld	 r10,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	ld	 r8,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	subu	 size,size,1
+	subu.co	 r6,r8,r10
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+L_v1b:	cmp	 r12,size,2
+	bb1	 lt,r12,Lend2
+
+	ld	 r10,s2_ptr,0
+	ld	 r12,s2_ptr,4
+	ld.d	 r8,s1_ptr,0
+	subu	 size,size,10
+	bcnd	 lt0,size,Lfin1b
+/* Subtract blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop1b:	subu	 size,size,8
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,8
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,12
+	ld.d	 r8,s1_ptr,8
+	st.d	 r6,res_ptr,0
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,16
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,20
+	ld.d	 r8,s1_ptr,16
+	st.d	 r6,res_ptr,8
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,24
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,28
+	ld.d	 r8,s1_ptr,24
+	st.d	 r6,res_ptr,16
+	subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,32
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,36
+	addu	 s2_ptr,s2_ptr,32
+	ld.d	 r8,s1_ptr,32
+	addu	 s1_ptr,s1_ptr,32
+	st.d	 r6,res_ptr,24
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop1b
+
+Lfin1b:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend1b
+/* Subtract blocks of 2 limbs until less than 2 limbs remain */
+Loope1b:subu.cio r6,r8,r10
+	ld	 r10,s2_ptr,8
+	subu.cio r7,r9,r12
+	ld	 r12,s2_ptr,12
+	ld.d	 r8,s1_ptr,8
+	st.d	 r6,res_ptr,0
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope1b
+Lend1b:	subu.cio r6,r8,r10
+	subu.cio r7,r9,r12
+	st.d	 r6,res_ptr,0
+
+	bb0	 0,size,Lret1b
+/* Subtract last limb */
+	ld	 r10,s2_ptr,8
+	ld	 r8,s1_ptr,8
+	subu.cio r6,r8,r10
+	st	 r6,res_ptr,8
+
+Lret1b:	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
+
+; **  V2  **
+/* If we come here, s1_ptr and res_ptr differ in alignment, and so do
+   s2_ptr and res_ptr.  Since there are only two ways things can be
+   aligned (that we care about), we now know that s1_ptr and s2_ptr
+   have the same alignment.  */
+
+L2:	cmp	 r12,size,1
+	bb1	 eq,r12,Ljone
+	bb0	 2,s1_ptr,L_v2		; branch if s1_ptr is aligned
+/* Subtract least significant limb separately to align res_ptr and s2_ptr */
+	ld	 r10,s1_ptr,0
+	addu	 s1_ptr,s1_ptr,4
+	ld	 r8,s2_ptr,0
+	addu	 s2_ptr,s2_ptr,4
+	subu	 size,size,1
+	subu.co	 r6,r10,r8
+	st	 r6,res_ptr,0
+	addu	 res_ptr,res_ptr,4
+
+L_v2:	subu	 size,size,8
+	bcnd	 lt0,size,Lfin2
+/* Subtract blocks of 8 limbs until less than 8 limbs remain */
+	align	 8
+Loop2:	subu	 size,size,8
+	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	ld.d	 r8,s1_ptr,8
+	ld.d	 r6,s2_ptr,8
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,8
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,12
+	ld.d	 r8,s1_ptr,16
+	ld.d	 r6,s2_ptr,16
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,16
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,20
+	ld.d	 r8,s1_ptr,24
+	ld.d	 r6,s2_ptr,24
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,24
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,28
+	addu	 s1_ptr,s1_ptr,32
+	addu	 s2_ptr,s2_ptr,32
+	addu	 res_ptr,res_ptr,32
+	bcnd	 ge0,size,Loop2
+
+Lfin2:	addu	 size,size,8-2
+	bcnd	 lt0,size,Lend2
+Loope2:	ld.d	 r8,s1_ptr,0
+	ld.d	 r6,s2_ptr,0
+	subu.cio r8,r8,r6
+	st	 r8,res_ptr,0
+	subu.cio r9,r9,r7
+	st	 r9,res_ptr,4
+	subu	 size,size,2
+	addu	 s1_ptr,s1_ptr,8
+	addu	 s2_ptr,s2_ptr,8
+	addu	 res_ptr,res_ptr,8
+	bcnd	 ge0,size,Loope2
+Lend2:	bb0	 0,size,Lret2
+/* Subtract last limb */
+Ljone:	ld	 r10,s1_ptr,0
+	ld	 r8,s2_ptr,0
+	subu.cio r6,r10,r8
+	st	 r6,res_ptr,0
+
+Lret2:	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
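
Note the xor r2,r2,1 in each return path: after subu.co/subu.cio the 88k carry bit encodes "no borrow", so it is complemented to produce mpn_sub_n's borrow-out. Reference semantics in C (sketch, 32-bit limbs assumed):

    #include <stdint.h>

    /* Reference semantics of mpn_sub_n (sketch): rp[] = s1[] - s2[],
       returning the borrow out of the top limb (0 or 1).  */
    uint32_t ref_sub_n(uint32_t *rp, const uint32_t *s1,
                       const uint32_t *s2, long n)
    {
        uint32_t bw = 0;                /* borrow, i.e. !carry here */
        for (long i = 0; i < n; i++) {
            uint32_t d = s1[i] - s2[i];
            uint32_t b = s1[i] < s2[i]; /* borrow from the limb subtract */
            rp[i] = d - bw;
            b |= d < bw;                /* borrow from subtracting bw */
            bw = b;
        }
        return bw;
    }
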
diff --git a/third_party/gmp/mpn/m88k/mul_1.s b/third_party/gmp/mpn/m88k/mul_1.s
new file mode 100644
index 0000000..c8abdc0
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/mul_1.s
@@ -0,0 +1,136 @@
+; mc88100 __gmpn_mul_1 -- Multiply a limb vector with a single limb and
+; store the product in a second limb vector.
+
+; Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; size		r4
+; s2_limb	r5
+
+; Common overhead is about 11 cycles/invocation.
+
+; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention.)
+
+; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb.  (The
+; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.)
+
+; To enhance speed:
+; 1. Unroll main loop 4-8 times.
+; 2. Schedule code to avoid WB contention.  It might be tempting to move the
+;    ld instruction in the loops down to save 2 cycles (less WB contention),
+;    but that loses, because the final value would then be read from outside
+;    the allocated space.  If we instead handle the final multiplication in
+;    the tail, we can do this.
+; 3. Do the multiplication with fewer instructions.  I think the code for
+;    (S2_LIMB >= 0x10000) is not minimal.
+; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or
+; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11
+; cycles/limb.  (Assuming infinite unrolling.)
+
+	text
+	align	 16
+	global	 ___gmpn_mul_1
+___gmpn_mul_1:
+
+	; Make S1_PTR and RES_PTR point at the end of their blocks
+	; and negate SIZE.
+	lda	 r3,r3[r4]
+	lda	 r6,r2[r4]	; RES_PTR in r6 since r2 is retval
+	subu	 r4,r0,r4
+
+	addu.co	 r2,r0,r0	; r2 = cy = 0
+	ld	 r9,r3[r4]
+	mask	 r7,r5,0xffff	; r7 = lo(S2_LIMB)
+	extu	 r8,r5,16	; r8 = hi(S2_LIMB)
+	bcnd.n	 eq0,r8,Lsmall	; jump if (hi(S2_LIMB) == 0)
+	 subu	 r6,r6,4
+
+; General code for any value of S2_LIMB.
+
+	; Make a stack frame and save r25 and r26
+	subu	 r31,r31,16
+	st.d	 r25,r31,8
+
+	; Enter the loop in the middle
+	br.n	L1
+	addu	 r4,r4,1
+
+Loop:	ld	 r9,r3[r4]
+	st	 r26,r6[r4]
+; bcnd	ne0,r0,0		; bubble
+	addu	 r4,r4,1
+L1:	mul	 r26,r9,r5	; low word of product	mul_1	WB ld
+	mask	 r12,r9,0xffff	; r12 = lo(s1_limb)	mask_1
+	mul	 r11,r12,r7	; r11 =  prod_0		mul_2	WB mask_1
+	mul	 r10,r12,r8	; r10 = prod_1a		mul_3
+	extu	 r13,r9,16	; r13 = hi(s1_limb)	extu_1	WB mul_1
+	mul	 r12,r13,r7	; r12 = prod_1b		mul_4	WB extu_1
+	mul	 r25,r13,r8	; r25  = prod_2		mul_5	WB mul_2
+	extu	 r11,r11,16	; r11 = hi(prod_0)	extu_2	WB mul_3
+	addu	 r10,r10,r11	;			addu_1	WB extu_2
+; bcnd	ne0,r0,0		; bubble			WB addu_1
+	addu.co	 r10,r10,r12	;				WB mul_4
+	mask.u	 r10,r10,0xffff	; move the 16 most significant bits...
+	addu.ci	 r10,r10,r0	; ...to the low half of the word...
+	rot	 r10,r10,16	; ...and put carry in pos 16.
+	addu.co	 r26,r26,r2	; add old carry limb
+	bcnd.n	 ne0,r4,Loop
+	 addu.ci r2,r25,r10	; compute new carry limb
+
+	st	 r26,r6[r4]
+	ld.d	 r25,r31,8
+	jmp.n	 r1
+	 addu	 r31,r31,16
+
+; Fast code for S2_LIMB < 0x10000
+Lsmall:
+	; Enter the loop in the middle
+	br.n	SL1
+	addu	 r4,r4,1
+
+SLoop:	ld	 r9,r3[r4]	;
+	st	 r8,r6[r4]	;
+	addu	 r4,r4,1	;
+SL1:	mul	 r8,r9,r5	; low word of product
+	mask	 r12,r9,0xffff	; r12 = lo(s1_limb)
+	extu	 r13,r9,16	; r13 = hi(s1_limb)
+	mul	 r11,r12,r7	; r11 =  prod_0
+	mul	 r12,r13,r7	; r12 = prod_1b
+	addu.cio r8,r8,r2	; add old carry limb
+	extu	 r10,r11,16	; r11 = hi(prod_0)
+	addu	 r10,r10,r12	;
+	bcnd.n	 ne0,r4,SLoop
+	extu	 r2,r10,16	; r2 = new carry limb
+
+	jmp.n	 r1
+	st	 r8,r6[r4]
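
Since the mc88100's mul delivers only the low 32 bits of a product, the general path above rebuilds a 32x32 -> 64-bit multiply from the 16-bit pieces annotated as prod_0, prod_1a, prod_1b and prod_2. A hedged C sketch of that decomposition (the asm folds the middle-sum carry with its mask.u/addu.ci/rot trick instead of an explicit flag):

    #include <stdint.h>

    /* 32x32 -> 64-bit multiply built from four 16x16 -> 32-bit products
       (sketch mirroring the prod_* terms above).  */
    void umul32(uint32_t a, uint32_t b, uint32_t *hi, uint32_t *lo)
    {
        uint32_t al = a & 0xffff, ah = a >> 16;
        uint32_t bl = b & 0xffff, bh = b >> 16;
        uint32_t p0  = al * bl;                /* prod_0  */
        uint32_t p1a = al * bh;                /* prod_1a */
        uint32_t p1b = ah * bl;                /* prod_1b */
        uint32_t p2  = ah * bh;                /* prod_2  */
        uint32_t mid = (p0 >> 16) + p1a;       /* cannot overflow 32 bits */
        mid += p1b;
        uint32_t c = mid < p1b;                /* carry into bit 32 */
        *lo = (p0 & 0xffff) | (mid << 16);
        *hi = p2 + (mid >> 16) + (c << 16);
    }
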
diff --git a/third_party/gmp/mpn/m88k/sub_n.s b/third_party/gmp/mpn/m88k/sub_n.s
new file mode 100644
index 0000000..2bd8f09
--- /dev/null
+++ b/third_party/gmp/mpn/m88k/sub_n.s
@@ -0,0 +1,115 @@
+; mc88100 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+; store difference in a third limb vector.
+
+; Copyright 1992, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+;  This file is part of the GNU MP Library.
+;
+;  The GNU MP Library is free software; you can redistribute it and/or modify
+;  it under the terms of either:
+;
+;    * the GNU Lesser General Public License as published by the Free
+;      Software Foundation; either version 3 of the License, or (at your
+;      option) any later version.
+;
+;  or
+;
+;    * the GNU General Public License as published by the Free Software
+;      Foundation; either version 2 of the License, or (at your option) any
+;      later version.
+;
+;  or both in parallel, as here.
+;
+;  The GNU MP Library is distributed in the hope that it will be useful, but
+;  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+;  for more details.
+;
+;  You should have received copies of the GNU General Public License and the
+;  GNU Lesser General Public License along with the GNU MP Library.  If not,
+;  see https://www.gnu.org/licenses/.
+
+
+; INPUT PARAMETERS
+; res_ptr	r2
+; s1_ptr	r3
+; s2_ptr	r4
+; size		r5
+
+; This code has been optimized to run one instruction per clock, avoiding
+; load stalls and writeback contention.  As a result, the instruction
+; order is not always natural.
+
+; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100,
+; but on the 88110, it seems to run much slower, 6.6 clocks/limb.
+
+	text
+	align	 16
+	global	 ___gmpn_sub_n
+___gmpn_sub_n:
+	ld	r6,r3,0			; read first limb from s1_ptr
+	extu	r10,r5,3
+	ld	r7,r4,0			; read first limb from s2_ptr
+
+	subu	r5,r0,r5
+	mak	r5,r5,3<4>
+	bcnd.n	eq0,r5,Lzero
+	subu.co	r0,r0,r0		; initialize carry
+
+	or	r12,r0,lo16(Lbase)
+	or.u	r12,r12,hi16(Lbase)
+	addu	r12,r12,r5		; r12 is the address for entering the loop
+
+	extu	r5,r5,2			; divide by 4
+	subu	r2,r2,r5		; adjust res_ptr
+	subu	r3,r3,r5		; adjust s1_ptr
+	subu	r4,r4,r5		; adjust s2_ptr
+
+	or	r8,r6,r0
+
+	jmp.n	r12
+	 or	r9,r7,r0
+
+Loop:	addu	r3,r3,32
+	st	r8,r2,28
+	addu	r4,r4,32
+	ld	r6,r3,0
+	addu	r2,r2,32
+	ld	r7,r4,0
+Lzero:	subu	r10,r10,1		; subtract 0 + 8r limbs (adj loop cnt)
+Lbase:	ld	r8,r3,4
+	subu.cio r6,r6,r7
+	ld	r9,r4,4
+	st	r6,r2,0
+	ld	r6,r3,8			; subtract 7 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,8
+	st	r8,r2,4
+	ld	r8,r3,12		; subtract 6 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,12
+	st	r6,r2,8
+	ld	r6,r3,16		; subtract 5 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,16
+	st	r8,r2,12
+	ld	r8,r3,20		; subtract 4 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,20
+	st	r6,r2,16
+	ld	r6,r3,24		; subtract 3 + 8r limbs
+	subu.cio r8,r8,r9
+	ld	r7,r4,24
+	st	r8,r2,20
+	ld	r8,r3,28		; subtract 2 + 8r limbs
+	subu.cio r6,r6,r7
+	ld	r9,r4,28
+	st	r6,r2,24
+	bcnd.n	ne0,r10,Loop		; subtract 1 + 8r limbs
+	 subu.cio r8,r8,r9
+
+	st	r8,r2,28		; store most significant limb
+
+	addu.ci r2,r0,r0		; return carry-out from most sign. limb
+	jmp.n	 r1
+	 xor	r2,r2,1
diff --git a/third_party/gmp/mpn/minithres/gmp-mparam.h b/third_party/gmp/mpn/minithres/gmp-mparam.h
new file mode 100644
index 0000000..35fcb77
--- /dev/null
+++ b/third_party/gmp/mpn/minithres/gmp-mparam.h
@@ -0,0 +1,113 @@
+/* Minimal values gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000, 2006, 2008-2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* The values in this file are not currently minimal.
+   Trimming them further would be good.  */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         3
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         4
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      1
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD            3
+
+#define MUL_TOOM22_THRESHOLD                 8
+#define MUL_TOOM33_THRESHOLD                20
+#define MUL_TOOM44_THRESHOLD                24
+#define MUL_TOOM6H_THRESHOLD                70 /* FIXME */
+#define MUL_TOOM8H_THRESHOLD                86
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      50 /* FIXME */
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      50 /* FIXME */
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      50 /* FIXME */
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      50 /* FIXME */
+
+#define SQR_BASECASE_THRESHOLD               0
+#define SQR_TOOM2_THRESHOLD                  8
+#define SQR_TOOM3_THRESHOLD                 20
+#define SQR_TOOM4_THRESHOLD                 24
+#define SQR_TOOM6H_THRESHOLD                70 /* FIXME */
+#define SQR_TOOM8H_THRESHOLD                86
+
+#define MULMOD_BNM1_THRESHOLD               10
+#define SQRMOD_BNM1_THRESHOLD               10
+
+#define MUL_FFT_TABLE  {64, 256, 1024, 4096, 8192, 65536, 0}
+#define MUL_FFT_MODF_THRESHOLD  65
+#define MUL_FFT_THRESHOLD      200
+
+#define SQR_FFT_TABLE  {64, 256, 1024, 4096, 8192, 65536, 0}
+#define SQR_FFT_MODF_THRESHOLD  65
+#define SQR_FFT_THRESHOLD      200
+
+#define MULLO_BASECASE_THRESHOLD             0
+#define MULLO_DC_THRESHOLD                   2
+#define MULLO_MUL_N_THRESHOLD                4
+#define SQRLO_BASECASE_THRESHOLD             0
+#define SQRLO_DC_THRESHOLD                   2
+#define SQRLO_SQR_THRESHOLD                  4
+
+
+#define DC_DIV_QR_THRESHOLD                  6
+#define DC_DIVAPPR_Q_THRESHOLD               6
+#define DC_BDIV_QR_THRESHOLD                 4
+#define DC_BDIV_Q_THRESHOLD                  4
+
+#define INV_MULMOD_BNM1_THRESHOLD            2
+#define INV_NEWTON_THRESHOLD                 6
+#define INV_APPR_THRESHOLD                   4
+
+#define BINV_NEWTON_THRESHOLD                6
+#define REDC_1_TO_REDC_N_THRESHOLD           9
+
+#define MU_DIV_QR_THRESHOLD                  8
+#define MU_DIVAPPR_Q_THRESHOLD               8
+#define MUPI_DIV_QR_THRESHOLD                8
+#define MU_BDIV_QR_THRESHOLD                 8
+#define MU_BDIV_Q_THRESHOLD                  8
+
+#define MATRIX22_STRASSEN_THRESHOLD          2
+#define HGCD_THRESHOLD                      10
+#define GCD_DC_THRESHOLD                    20
+#define GCDEXT_SCHOENHAGE_THRESHOLD         20
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                 4
+#define GET_STR_PRECOMPUTE_THRESHOLD        10
+#define SET_STR_THRESHOLD                   64
+#define SET_STR_PRECOMPUTE_THRESHOLD       100
+
+#define FAC_ODD_THRESHOLD                    0  /* always */
+#define FAC_DSC_THRESHOLD                   70
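
Thresholds such as these are consumed by dispatch code of roughly the following shape. This is a hedged sketch: the typedefs mirror GMP's usual ones, but mul_basecase/mul_toom22/mul_toom33 are hypothetical stand-ins, not the exact mpn entry points, and the real selection logic is more involved.

    typedef unsigned long mp_limb_t;
    typedef mp_limb_t *mp_ptr;
    typedef const mp_limb_t *mp_srcptr;
    typedef long mp_size_t;

    void mul_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
    void mul_toom22(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
    void mul_toom33(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);

    /* Pick a multiplication algorithm for an n-limb by n-limb product
       based on the tuned crossover points.  */
    void mul_n(mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
    {
        if (n < MUL_TOOM22_THRESHOLD)
            mul_basecase(rp, ap, bp, n);   /* schoolbook, O(n^2) */
        else if (n < MUL_TOOM33_THRESHOLD)
            mul_toom22(rp, ap, bp, n);     /* Karatsuba */
        else
            mul_toom33(rp, ap, bp, n);     /* Toom-3, and so on upward */
    }
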
diff --git a/third_party/gmp/mpn/mips32/add_n.asm b/third_party/gmp/mpn/mips32/add_n.asm
new file mode 100644
index 0000000..e7d4c48
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/add_n.asm
@@ -0,0 +1,124 @@
+dnl  MIPS32 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C s2_ptr	$6
+C size		$7
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+
+	lw	$10,0($5)
+	lw	$11,0($6)
+
+	addiu	$7,$7,-1
+	and	$9,$7,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	subu	$7,$7,$9
+
+.Loop0:	addiu	$9,$9,-1
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,4
+	addiu	$6,$6,4
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 addiu	$4,$4,4
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	addiu	$7,$7,-4
+
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	lw	$10,8($5)
+	addu	$13,$13,$2
+	lw	$11,8($6)
+	sltu	$8,$13,$2
+	addu	$13,$12,$13
+	sltu	$2,$13,$12
+	sw	$13,4($4)
+	or	$2,$2,$8
+
+	lw	$12,12($5)
+	addu	$11,$11,$2
+	lw	$13,12($6)
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,8($4)
+	or	$2,$2,$8
+
+	lw	$10,16($5)
+	addu	$13,$13,$2
+	lw	$11,16($6)
+	sltu	$8,$13,$2
+	addu	$13,$12,$13
+	sltu	$2,$13,$12
+	sw	$13,12($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,16
+	addiu	$6,$6,16
+
+	bne	$7,$0,.Loop
+	 addiu	$4,$4,16
+
+.Lend:	addu	$11,$11,$2
+	sltu	$8,$11,$2
+	addu	$11,$10,$11
+	sltu	$2,$11,$10
+	sw	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+EPILOGUE(mpn_add_n)
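
Unlike the m88k routines earlier in this patch, which chain addu.cio through a hardware carry bit, MIPS has no flags register, so each limb addition rebuilds the carry with a pair of sltu compares (mpn_sub_n and mpn_submul_1 below recover the borrow the same way). One step of the unrolled loop corresponds to this C sketch:

    #include <stdint.h>

    /* One limb step: t = a + b + cy, with the carry reconstructed from
       unsigned compares, matching the sltu pairs in the loop above.  */
    static uint32_t add_step(uint32_t a, uint32_t b, uint32_t *cy)
    {
        uint32_t t = b + *cy;
        uint32_t c = t < *cy;   /* sltu $8,$11,$2 */
        t = a + t;
        c |= t < a;             /* sltu $2,$11,$10 */
        *cy = c;
        return t;
    }
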
diff --git a/third_party/gmp/mpn/mips32/addmul_1.asm b/third_party/gmp/mpn/mips32/addmul_1.asm
new file mode 100644
index 0000000..9aa9e16
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/addmul_1.asm
@@ -0,0 +1,101 @@
+dnl  MIPS32 mpn_addmul_1 -- Multiply a limb vector with a single limb and add
+dnl  the product to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C size		$6
+C s2_limb	$7
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+
+C feed-in phase 0
+	lw	$8,0($5)
+
+C feed-in phase 1
+	addiu	$5,$5,4
+	multu	$8,$7
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		C zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	lw	$8,0($5)	C load new s1 limb as early as possible
+
+Loop:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addiu	$5,$5,4
+	addu	$3,$3,$2	C add old carry limb to low product limb
+	multu	$8,$7
+	lw	$8,0($5)	C load new s1 limb as early as possible
+	addiu	$6,$6,-1	C decrement loop counter
+	sltu	$2,$3,$2	C carry from previous addition -> $2
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop
+	 addu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 1
+$LC1:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	multu	$8,$7
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 0
+$LC0:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	addu	$3,$10,$3
+	sltu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	j	$31
+	addu	$2,$9,$2	C add high product limb and carry from addition
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/mips32/gmp-mparam.h b/third_party/gmp/mpn/mips32/gmp-mparam.h
new file mode 100644
index 0000000..986135d
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* Generated by tuneup.c, 2002-02-20, gcc 2.95 (R3000) */
+
+#define MUL_TOOM22_THRESHOLD             20
+#define MUL_TOOM33_THRESHOLD             50
+
+#define SQR_BASECASE_THRESHOLD            7
+#define SQR_TOOM2_THRESHOLD              57
+#define SQR_TOOM3_THRESHOLD              78
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 57
+#define POWM_THRESHOLD                   78
+
+#define GCD_ACCEL_THRESHOLD               3
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD         0  /* always */
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             19
+#define GET_STR_PRECOMPUTE_THRESHOLD     25
+#define SET_STR_THRESHOLD               309
+
+#define MUL_FFT_TABLE  { 496, 1056, 2176, 5632, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          624
+#define MUL_FFT_THRESHOLD              5888
+
+#define SQR_FFT_TABLE  { 496, 1184, 2176, 5632, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          560
+#define SQR_FFT_THRESHOLD              5376
diff --git a/third_party/gmp/mpn/mips32/lshift.asm b/third_party/gmp/mpn/mips32/lshift.asm
new file mode 100644
index 0000000..6a58bb4
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/lshift.asm
@@ -0,0 +1,99 @@
+dnl  MIPS32 mpn_lshift -- Left shift.
+
+dnl  Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C src_ptr	$5
+C size		$6
+C cnt		$7
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	sll	$2,$6,2
+	addu	$5,$5,$2	C make r5 point at end of src
+	lw	$10,-4($5)	C load first limb
+	subu	$13,$0,$7
+	addu	$4,$4,$2	C make r4 point at end of res
+	addiu	$6,$6,-1
+	and	$9,$6,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 srl	$2,$10,$13	C compute function result
+
+	subu	$6,$6,$9
+
+.Loop0:	lw	$3,-8($5)
+	addiu	$4,$4,-4
+	addiu	$5,$5,-4
+	addiu	$9,$9,-1
+	sll	$11,$10,$7
+	srl	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sw	$8,0($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	lw	$3,-8($5)
+	addiu	$4,$4,-16
+	addiu	$6,$6,-4
+	sll	$11,$10,$7
+	srl	$12,$3,$13
+
+	lw	$10,-12($5)
+	sll	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,12($4)
+	srl	$9,$10,$13
+
+	lw	$3,-16($5)
+	sll	$11,$10,$7
+	or	$8,$14,$9
+	sw	$8,8($4)
+	srl	$12,$3,$13
+
+	lw	$10,-20($5)
+	sll	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,4($4)
+	srl	$9,$10,$13
+
+	addiu	$5,$5,-16
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sw	$8,0($4)
+
+.Lend:	sll	$8,$10,$7
+	j	$31
+	sw	$8,-4($4)
+EPILOGUE(mpn_lshift)
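
mpn_lshift walks from the most significant limb downward, which both produces the return value (the bits shifted out the top) and allows the destination to overlap the source when a number is shifted up in memory; mpn_rshift below is the mirror image, walking up from the low end. Reference semantics in C (sketch; assumes 0 < cnt < 32, as the asm's register shift counts do):

    #include <stdint.h>

    /* Reference semantics of mpn_lshift (sketch): shift the n-limb value
       at sp left by cnt bits into rp, returning the shifted-out bits.  */
    uint32_t ref_lshift(uint32_t *rp, const uint32_t *sp,
                        long n, unsigned cnt)
    {
        unsigned tnc = 32 - cnt;     /* the asm's $13 = -cnt mod 32 */
        uint32_t high = sp[n - 1];
        uint32_t ret = high >> tnc;  /* function result */
        for (long i = n - 1; i > 0; i--) {
            uint32_t low = sp[i - 1];
            rp[i] = (high << cnt) | (low >> tnc);
            high = low;
        }
        rp[0] = high << cnt;         /* the .Lend store */
        return ret;
    }
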
diff --git a/third_party/gmp/mpn/mips32/mips-defs.m4 b/third_party/gmp/mpn/mips32/mips-defs.m4
new file mode 100644
index 0000000..5fa89ec
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/mips-defs.m4
@@ -0,0 +1,80 @@
+divert(-1)
+
+dnl  m4 macros for MIPS assembly code (both 32-bit and 64-bit).
+
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Usage: ASM_START()
+define(`ASM_START',
+m4_assert_numargs(0)
+`	.set noreorder
+	.set nomacro')
+
+dnl  Usage: X(value)
+define(`X',
+m4_assert_numargs(1)
+`0x$1')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+`	.text
+	.align	4
+	.globl	$1
+	.ent	$1
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.end	$1')
+
+
+dnl  Usage: r0 ... r31
+dnl         f0 ... f31
+dnl
+dnl  Map register names r0 to $0, and f0 to $f0, etc.
+dnl
+dnl  defreg() is used to protect the $ in $0 (otherwise it would represent a
+dnl  macro argument).  Double quoting is used to protect the f0 in $f0
+dnl  (otherwise it would be an infinite recursion).
+
+forloop(i,0,31,`defreg(`r'i,$i)')
+forloop(i,0,31,`deflit(`f'i,``$f''i)')
+
+
+dnl  Usage: ASM_END()
+define(`ASM_END',
+m4_assert_numargs(0)
+)
+
+divert
diff --git a/third_party/gmp/mpn/mips32/mips.m4 b/third_party/gmp/mpn/mips32/mips.m4
new file mode 100644
index 0000000..8b49e57
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/mips.m4
@@ -0,0 +1,80 @@
+divert(-1)
+
+dnl  m4 macros for MIPS assembly code.
+
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Usage: ASM_START()
+define(`ASM_START',
+m4_assert_numargs(0)
+`	.set noreorder
+	.set nomacro')
+
+dnl  Usage: X(value)
+define(`X',
+m4_assert_numargs(1)
+`0x$1')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+`	.text
+	.align	4
+	.globl	$1
+	.ent	$1
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.end	$1')
+
+
+dnl  Usage: r0 ... r31
+dnl         f0 ... f31
+dnl
+dnl  Map register names r0 to $0, and f0 to $f0, etc.
+dnl
+dnl  defreg() is used to protect the $ in $0 (otherwise it would represent a
+dnl  macro argument).  Double quoting is used to protect the f0 in $f0
+dnl  (otherwise it would be an infinite recursion).
+
+forloop(i,0,31,`defreg(`r'i,$i)')
+forloop(i,0,31,`deflit(`f'i,``$f''i)')
+
+
+dnl  Usage: ASM_END()
+define(`ASM_END',
+m4_assert_numargs(0)
+)
+
+divert
diff --git a/third_party/gmp/mpn/mips32/mul_1.asm b/third_party/gmp/mpn/mips32/mul_1.asm
new file mode 100644
index 0000000..4337bc2
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/mul_1.asm
@@ -0,0 +1,89 @@
+dnl  MIPS32 mpn_mul_1 -- Multiply a limb vector with a single limb and store
+dnl  the product in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C size		$6
+C s2_limb	$7
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+
+C feed-in phase 0
+	lw	$8,0($5)
+
+C feed-in phase 1
+	addiu	$5,$5,4
+	multu	$8,$7
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		C zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	lw	$8,0($5)	C load new s1 limb as early as possible
+
+Loop:	mflo	$10
+	mfhi	$9
+	addiu	$5,$5,4
+	addu	$10,$10,$2	C add old carry limb to low product limb
+	multu	$8,$7
+	lw	$8,0($5)	C load new s1 limb as early as possible
+	addiu	$6,$6,-1	C decrement loop counter
+	sltu	$2,$10,$2	C carry from previous addition -> $2
+	sw	$10,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop
+	 addu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 1
+$LC1:	mflo	$10
+	mfhi	$9
+	addu	$10,$10,$2
+	sltu	$2,$10,$2
+	multu	$8,$7
+	sw	$10,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 0
+$LC0:	mflo	$10
+	mfhi	$9
+	addu	$10,$10,$2
+	sltu	$2,$10,$2
+	sw	$10,0($4)
+	j	$31
+	addu	$2,$9,$2	C add high product limb and carry from addition
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/mips32/rshift.asm b/third_party/gmp/mpn/mips32/rshift.asm
new file mode 100644
index 0000000..4b54510
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/rshift.asm
@@ -0,0 +1,96 @@
+dnl  MIPS32 mpn_rshift -- Right shift.
+
+dnl  Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C src_ptr	$5
+C size		$6
+C cnt		$7
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	lw	$10,0($5)	C load first limb
+	subu	$13,$0,$7
+	addiu	$6,$6,-1
+	and	$9,$6,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 sll	$2,$10,$13	C compute function result
+
+	subu	$6,$6,$9
+
+.Loop0:	lw	$3,4($5)
+	addiu	$4,$4,4
+	addiu	$5,$5,4
+	addiu	$9,$9,-1
+	srl	$11,$10,$7
+	sll	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sw	$8,-4($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	lw	$3,4($5)
+	addiu	$4,$4,16
+	addiu	$6,$6,-4
+	srl	$11,$10,$7
+	sll	$12,$3,$13
+
+	lw	$10,8($5)
+	srl	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,-16($4)
+	sll	$9,$10,$13
+
+	lw	$3,12($5)
+	srl	$11,$10,$7
+	or	$8,$14,$9
+	sw	$8,-12($4)
+	sll	$12,$3,$13
+
+	lw	$10,16($5)
+	srl	$14,$3,$7
+	or	$8,$11,$12
+	sw	$8,-8($4)
+	sll	$9,$10,$13
+
+	addiu	$5,$5,16
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sw	$8,-4($4)
+
+.Lend:	srl	$8,$10,$7
+	j	$31
+	sw	$8,0($4)
+EPILOGUE(mpn_rshift)
diff --git a/third_party/gmp/mpn/mips32/sub_n.asm b/third_party/gmp/mpn/mips32/sub_n.asm
new file mode 100644
index 0000000..a962ce1
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/sub_n.asm
@@ -0,0 +1,123 @@
+dnl  MIPS32 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 1995, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C s2_ptr	$6
+C size		$7
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	lw	$10,0($5)
+	lw	$11,0($6)
+
+	addiu	$7,$7,-1
+	and	$9,$7,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	subu	$7,$7,$9
+
+.Loop0:	addiu	$9,$9,-1
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,4
+	addiu	$6,$6,4
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 addiu	$4,$4,4
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	addiu	$7,$7,-4
+
+	lw	$12,4($5)
+	addu	$11,$11,$2
+	lw	$13,4($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,0($4)
+	or	$2,$2,$8
+
+	lw	$10,8($5)
+	addu	$13,$13,$2
+	lw	$11,8($6)
+	sltu	$8,$13,$2
+	subu	$13,$12,$13
+	sltu	$2,$12,$13
+	sw	$13,4($4)
+	or	$2,$2,$8
+
+	lw	$12,12($5)
+	addu	$11,$11,$2
+	lw	$13,12($6)
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,8($4)
+	or	$2,$2,$8
+
+	lw	$10,16($5)
+	addu	$13,$13,$2
+	lw	$11,16($6)
+	sltu	$8,$13,$2
+	subu	$13,$12,$13
+	sltu	$2,$12,$13
+	sw	$13,12($4)
+	or	$2,$2,$8
+
+	addiu	$5,$5,16
+	addiu	$6,$6,16
+
+	bne	$7,$0,.Loop
+	 addiu	$4,$4,16
+
+.Lend:	addu	$11,$11,$2
+	sltu	$8,$11,$2
+	subu	$11,$10,$11
+	sltu	$2,$10,$11
+	sw	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+EPILOGUE(mpn_sub_n)
diff --git a/third_party/gmp/mpn/mips32/submul_1.asm b/third_party/gmp/mpn/mips32/submul_1.asm
new file mode 100644
index 0000000..335722b
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/submul_1.asm
@@ -0,0 +1,101 @@
+dnl  MIPS32 mpn_submul_1 -- Multiply a limb vector with a single limb and
+dnl  subtract the product from a second limb vector.
+
+dnl  Copyright 1992, 1994, 1996, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C size		$6
+C s2_limb	$7
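+C
+C The multiply unit runs in parallel with the integer pipeline: each loop
+C iteration reads the previous product with mflo/mfhi, starts the next
+C multu right away, and does the subtract/borrow work while that multiply
+C is in flight.  The feed-in and wind-down phases fill and drain this
+C one-stage software pipeline.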
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+
+C feed-in phase 0
+	lw	$8,0($5)
+
+C feed-in phase 1
+	addiu	$5,$5,4
+	multu	$8,$7
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		C zero cy2
+
+	addiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	lw	$8,0($5)	C load new s1 limb as early as possible
+
+Loop:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addiu	$5,$5,4
+	addu	$3,$3,$2	C add old carry limb to low product limb
+	multu	$8,$7
+	lw	$8,0($5)	C load new s1 limb as early as possible
+	addiu	$6,$6,-1	C decrement loop counter
+	sltu	$2,$3,$2	C carry from previous addition -> $2
+	subu	$3,$10,$3
+	sgtu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	bne	$6,$0,Loop
+	 addu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 1
+$LC1:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	multu	$8,$7
+	subu	$3,$10,$3
+	sgtu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	addiu	$4,$4,4
+	addu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 0
+$LC0:	lw	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	addu	$3,$3,$2
+	sltu	$2,$3,$2
+	subu	$3,$10,$3
+	sgtu	$10,$3,$10
+	addu	$2,$2,$10
+	sw	$3,0($4)
+	j	$31
+	addu	$2,$9,$2	C add high product limb and carry from addition
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/mips32/umul.asm b/third_party/gmp/mpn/mips32/umul.asm
new file mode 100644
index 0000000..1ced0eb
--- /dev/null
+++ b/third_party/gmp/mpn/mips32/umul.asm
@@ -0,0 +1,45 @@
+dnl  MIPS32 umul_ppmm -- longlong.h support.
+
+dnl  Copyright 1999, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C plp   $4
+C u     $5
+C v     $6
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	multu	$5,$6
+	mflo	$3
+	mfhi	$2
+	j	$31
+	sw	$3,0($4)
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/mips64/README b/third_party/gmp/mpn/mips64/README
new file mode 100644
index 0000000..7ddd0e5
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/README
@@ -0,0 +1,60 @@
+Copyright 1996 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions optimized for MIPS3.  Examples of
+processors that implement MIPS3 are the R4000, R4400, R4600, R4700, and R8000.
+
+RELEVANT OPTIMIZATION ISSUES
+
+1. On the R4000 and R4400, branches, both the plain and the "likely" ones,
+   take 3 cycles to execute.  (The fastest possible loop will take 4 cycles,
+   because of the delay insn.)
+
+   On the R4600, branches take a single cycle.
+
+   On the R8000, branches often take no noticeable cycles, as they are
+   executed in a separate functional unit.
+
+2. The R4000 and R4400 have a load latency of 4 cycles.
+
+3. On the R4000 and R4400, multiplies take a data-dependent number of
+   cycles, contrary to the SGI documentation.  There seem to be 3 or 4
+   possible latencies.
+
+4. The R1x000 processors can issue one floating-point operation, two integer
+   operations, and one memory operation per cycle.  The FPU has very short
+   latencies, while the integer multiply unit is non-pipelined.  We should
+   therefore write FP-based mpn_Xmul_1 routines.
+
+STATUS
+
+Good...
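+
+The add/sub loops in this directory recover the carry with sltu after
+each daddu/dsubu.  As a minimal C sketch (illustrative only, not part of
+the library; the typedef stands in for GMP's limb type), one step of the
+mpn_add_n loop computes:
+
+	typedef unsigned long mp_limb_t;	/* assumes 64-bit long */
+
+	static mp_limb_t
+	add_step (mp_limb_t a, mp_limb_t b, mp_limb_t cy, mp_limb_t *rp)
+	{
+	  mp_limb_t s = a + b;		/* daddu */
+	  mp_limb_t c1 = s < a;		/* sltu: s wrapped iff s < a */
+	  mp_limb_t r = s + cy;		/* fold in the incoming carry */
+	  mp_limb_t c2 = r < cy;	/* sltu again for the second wrap */
+	  *rp = r;			/* sd */
+	  return c1 | c2;		/* at most one of c1, c2 is set */
+	}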
diff --git a/third_party/gmp/mpn/mips64/add_n.asm b/third_party/gmp/mpn/mips64/add_n.asm
new file mode 100644
index 0000000..6856407
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/add_n.asm
@@ -0,0 +1,134 @@
+dnl  MIPS64 mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 1995, 2000-2002, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C s2_ptr	$6
+C size		$7
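+C
+C mpn_add_nc is the carry-in entry point: it takes the incoming carry in
+C $8 (the fifth integer argument register) and branches into the shared
+C code; mpn_add_n starts with a zero carry.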
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+	ld	$10,0($5)
+	ld	$11,0($6)
+
+	daddiu	$7,$7,-1
+	and	$9,$7,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 move	$2,$8
+	b	.Loop0
+	 dsubu	$7,$7,$9
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	ld	$10,0($5)
+	ld	$11,0($6)
+
+	daddiu	$7,$7,-1
+	and	$9,$7,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	dsubu	$7,$7,$9
+
+.Loop0:	daddiu	$9,$9,-1
+	ld	$12,8($5)
+	daddu	$11,$11,$2
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,8
+	daddiu	$6,$6,8
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 daddiu	$4,$4,8
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	daddiu	$7,$7,-4
+
+	ld	$12,8($5)
+	daddu	$11,$11,$10
+	ld	$13,8($6)
+	sltu	$8,$11,$10
+	daddu	$11,$11,$2
+	sltu	$2,$11,$2
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	ld	$10,16($5)
+	daddu	$13,$13,$12
+	ld	$11,16($6)
+	sltu	$8,$13,$12
+	daddu	$13,$13,$2
+	sltu	$2,$13,$2
+	sd	$13,8($4)
+	or	$2,$2,$8
+
+	ld	$12,24($5)
+	daddu	$11,$11,$10
+	ld	$13,24($6)
+	sltu	$8,$11,$10
+	daddu	$11,$11,$2
+	sltu	$2,$11,$2
+	sd	$11,16($4)
+	or	$2,$2,$8
+
+	ld	$10,32($5)
+	daddu	$13,$13,$12
+	ld	$11,32($6)
+	sltu	$8,$13,$12
+	daddu	$13,$13,$2
+	sltu	$2,$13,$2
+	sd	$13,24($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,32
+	daddiu	$6,$6,32
+
+	bne	$7,$0,.Loop
+	 daddiu	$4,$4,32
+
+.Lend:	daddu	$11,$11,$2
+	sltu	$8,$11,$2
+	daddu	$11,$10,$11
+	sltu	$2,$11,$10
+	sd	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+EPILOGUE()
diff --git a/third_party/gmp/mpn/mips64/gmp-mparam.h b/third_party/gmp/mpn/mips64/gmp-mparam.h
new file mode 100644
index 0000000..b7fcf24
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+/* Generated by tuneup.c, 2004-02-10, gcc 3.2 & MIPSpro C 7.2.1 (R1x000) */
+
+#define MUL_TOOM22_THRESHOLD             16
+#define MUL_TOOM33_THRESHOLD             89
+
+#define SQR_BASECASE_THRESHOLD            6
+#define SQR_TOOM2_THRESHOLD              32
+#define SQR_TOOM3_THRESHOLD              98
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 53
+#define POWM_THRESHOLD                   61
+
+#define HGCD_THRESHOLD                  116
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                492
+#define JACOBI_BASE_METHOD                2
+
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             21
+#define GET_STR_PRECOMPUTE_THRESHOLD     26
+#define SET_STR_THRESHOLD              3962
+
+#define MUL_FFT_TABLE  { 368, 736, 1600, 3328, 7168, 20480, 49152, 0 }
+#define MUL_FFT_MODF_THRESHOLD          264
+#define MUL_FFT_THRESHOLD              1920
+
+#define SQR_FFT_TABLE  { 368, 736, 1856, 3328, 7168, 20480, 49152, 0 }
+#define SQR_FFT_MODF_THRESHOLD          280
+#define SQR_FFT_THRESHOLD              1920
diff --git a/third_party/gmp/mpn/mips64/hilo/addmul_1.asm b/third_party/gmp/mpn/mips64/hilo/addmul_1.asm
new file mode 100644
index 0000000..8ff0976
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/hilo/addmul_1.asm
@@ -0,0 +1,101 @@
+dnl  MIPS64 mpn_addmul_1 -- Multiply a limb vector with a single limb and add
+dnl  the product to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C size		$6
+C s2_limb	$7
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+
+C feed-in phase 0
+	ld	$8,0($5)
+
+C feed-in phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		C zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	ld	$8,0($5)	C load new s1 limb as early as possible
+
+Loop:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddiu	$5,$5,8
+	daddu	$3,$3,$2	C add old carry limb to low product limb
+	dmultu	$8,$7
+	ld	$8,0($5)	C load new s1 limb as early as possible
+	daddiu	$6,$6,-1	C decrement loop counter
+	sltu	$2,$3,$2	C carry from previous addition -> $2
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop
+	 daddu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 1
+$LC1:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dmultu	$8,$7
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 0
+$LC0:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	daddu	$3,$10,$3
+	sltu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	j	$31
+	daddu	$2,$9,$2	C add high product limb and carry from addition
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/mips64/hilo/mul_1.asm b/third_party/gmp/mpn/mips64/hilo/mul_1.asm
new file mode 100644
index 0000000..77acf0a
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/hilo/mul_1.asm
@@ -0,0 +1,92 @@
+dnl  MIPS64 mpn_mul_1 -- Multiply a limb vector with a single limb and store
+dnl  the product in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C size		$6
+C s2_limb	$7
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+
+C feed-in phase 0
+	ld	$8,0($5)
+
+C feed-in phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		C zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	ld	$8,0($5)	C load new s1 limb as early as possible
+
+Loop:	nop
+	mflo	$10
+	mfhi	$9
+	daddiu	$5,$5,8
+	daddu	$10,$10,$2	C add old carry limb to low product limb
+	dmultu	$8,$7
+	ld	$8,0($5)	C load new s1 limb as early as possible
+	daddiu	$6,$6,-1	C decrement loop counter
+	sltu	$2,$10,$2	C carry from previous addition -> $2
+	nop
+	nop
+	sd	$10,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop
+	 daddu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 1
+$LC1:	mflo	$10
+	mfhi	$9
+	daddu	$10,$10,$2
+	sltu	$2,$10,$2
+	dmultu	$8,$7
+	sd	$10,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 0
+$LC0:	mflo	$10
+	mfhi	$9
+	daddu	$10,$10,$2
+	sltu	$2,$10,$2
+	sd	$10,0($4)
+	j	$31
+	daddu	$2,$9,$2	C add high product limb and carry from addition
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/mips64/hilo/sqr_diagonal.asm b/third_party/gmp/mpn/mips64/hilo/sqr_diagonal.asm
new file mode 100644
index 0000000..dcb87dc
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/hilo/sqr_diagonal.asm
@@ -0,0 +1,77 @@
+dnl  MIPS64 mpn_sqr_diagonal.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  rp		$4
+dnl  up		$5
+dnl  n		$6
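+dnl
+dnl  Writes rp[2i] and rp[2i+1] <- up[i]^2 for each of the n limbs,
+dnl  keeping one dmultu in flight while the previous product is stored.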
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+	ld	r8,0(r5)
+	daddiu	r6,r6,-2
+	dmultu	r8,r8
+	bltz	r6,$Lend1
+	nop
+	ld	r8,8(r5)
+	beq	r6,r0,$Lend2
+	nop
+
+$Loop:	mflo	r10
+	mfhi	r9
+	daddiu	r6,r6,-1
+	sd	r10,0(r4)
+	sd	r9,8(r4)
+	dmultu	r8,r8
+	ld	r8,16(r5)
+	daddiu	r5,r5,8
+	bne	r6,r0,$Loop
+	daddiu	r4,r4,16
+
+$Lend2: mflo	r10
+	mfhi	r9
+	sd	r10,0(r4)
+	sd	r9,8(r4)
+	dmultu	r8,r8
+	mflo	r10
+	mfhi	r9
+	sd	r10,16(r4)
+	j	r31
+	sd	r9,24(r4)
+
+$Lend1: mflo	r10
+	mfhi	r9
+	sd	r10,0(r4)
+	j	r31
+	sd	r9,8(r4)
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/third_party/gmp/mpn/mips64/hilo/submul_1.asm b/third_party/gmp/mpn/mips64/hilo/submul_1.asm
new file mode 100644
index 0000000..089589c
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/hilo/submul_1.asm
@@ -0,0 +1,101 @@
+dnl  MIPS64 mpn_submul_1 -- Multiply a limb vector with a single limb and
+dnl  subtract the product from a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C size		$6
+C s2_limb	$7
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+
+C feed-in phase 0
+	ld	$8,0($5)
+
+C feed-in phase 1
+	daddiu	$5,$5,8
+	dmultu	$8,$7
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC0
+	 move	$2,$0		C zero cy2
+
+	daddiu	$6,$6,-1
+	beq	$6,$0,$LC1
+	ld	$8,0($5)	C load new s1 limb as early as possible
+
+Loop:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddiu	$5,$5,8
+	daddu	$3,$3,$2	C add old carry limb to low product limb
+	dmultu	$8,$7
+	ld	$8,0($5)	C load new s1 limb as early as possible
+	daddiu	$6,$6,-1	C decrement loop counter
+	sltu	$2,$3,$2	C carry from previous addition -> $2
+	dsubu	$3,$10,$3
+	sgtu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	bne	$6,$0,Loop
+	 daddu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 1
+$LC1:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dmultu	$8,$7
+	dsubu	$3,$10,$3
+	sgtu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	daddiu	$4,$4,8
+	daddu	$2,$9,$2	C add high product limb and carry from addition
+
+C wind-down phase 0
+$LC0:	ld	$10,0($4)
+	mflo	$3
+	mfhi	$9
+	daddu	$3,$3,$2
+	sltu	$2,$3,$2
+	dsubu	$3,$10,$3
+	sgtu	$10,$3,$10
+	daddu	$2,$2,$10
+	sd	$3,0($4)
+	j	$31
+	daddu	$2,$9,$2	C add high product limb and carry from addition
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/mips64/hilo/umul.asm b/third_party/gmp/mpn/mips64/hilo/umul.asm
new file mode 100644
index 0000000..b9aac57
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/hilo/umul.asm
@@ -0,0 +1,45 @@
+dnl  MIPS64 umul_ppmm -- longlong.h support.
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C plp   $4
+C u     $5
+C v     $6
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	dmultu	$5,$6
+	mflo	$3
+	mfhi	$2
+	j	$31
+	sd	$3,0($4)
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/mips64/lshift.asm b/third_party/gmp/mpn/mips64/lshift.asm
new file mode 100644
index 0000000..3440eaf
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/lshift.asm
@@ -0,0 +1,99 @@
+dnl  MIPS64 mpn_lshift -- Left shift.
+
+dnl  Copyright 1995, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C src_ptr	$5
+C size		$6
+C cnt		$7
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	dsll	$2,$6,3
+	daddu	$5,$5,$2	C make r5 point at end of src
+	ld	$10,-8($5)	C load first limb
+	dsubu	$13,$0,$7
+	daddu	$4,$4,$2	C make r4 point at end of res
+	daddiu	$6,$6,-1
+	and	$9,$6,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 dsrl	$2,$10,$13	C compute function result
+
+	dsubu	$6,$6,$9
+
+.Loop0:	ld	$3,-16($5)
+	daddiu	$4,$4,-8
+	daddiu	$5,$5,-8
+	daddiu	$9,$9,-1
+	dsll	$11,$10,$7
+	dsrl	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sd	$8,0($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	ld	$3,-16($5)
+	daddiu	$4,$4,-32
+	daddiu	$6,$6,-4
+	dsll	$11,$10,$7
+	dsrl	$12,$3,$13
+
+	ld	$10,-24($5)
+	dsll	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,24($4)
+	dsrl	$9,$10,$13
+
+	ld	$3,-32($5)
+	dsll	$11,$10,$7
+	or	$8,$14,$9
+	sd	$8,16($4)
+	dsrl	$12,$3,$13
+
+	ld	$10,-40($5)
+	dsll	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,8($4)
+	dsrl	$9,$10,$13
+
+	daddiu	$5,$5,-32
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sd	$8,0($4)
+
+.Lend:	dsll	$8,$10,$7
+	j	$31
+	sd	$8,-8($4)
+EPILOGUE(mpn_lshift)
diff --git a/third_party/gmp/mpn/mips64/rshift.asm b/third_party/gmp/mpn/mips64/rshift.asm
new file mode 100644
index 0000000..9253cb5
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/rshift.asm
@@ -0,0 +1,96 @@
+dnl  MIPS64 mpn_rshift -- Right shift.
+
+dnl  Copyright 1995, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C src_ptr	$5
+C size		$6
+C cnt		$7
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	ld	$10,0($5)	C load first limb
+	dsubu	$13,$0,$7
+	daddiu	$6,$6,-1
+	and	$9,$6,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 dsll	$2,$10,$13	C compute function result
+
+	dsubu	$6,$6,$9
+
+.Loop0:	ld	$3,8($5)
+	daddiu	$4,$4,8
+	daddiu	$5,$5,8
+	daddiu	$9,$9,-1
+	dsrl	$11,$10,$7
+	dsll	$12,$3,$13
+	move	$10,$3
+	or	$8,$11,$12
+	bne	$9,$0,.Loop0
+	 sd	$8,-8($4)
+
+.L0:	beq	$6,$0,.Lend
+	 nop
+
+.Loop:	ld	$3,8($5)
+	daddiu	$4,$4,32
+	daddiu	$6,$6,-4
+	dsrl	$11,$10,$7
+	dsll	$12,$3,$13
+
+	ld	$10,16($5)
+	dsrl	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,-32($4)
+	dsll	$9,$10,$13
+
+	ld	$3,24($5)
+	dsrl	$11,$10,$7
+	or	$8,$14,$9
+	sd	$8,-24($4)
+	dsll	$12,$3,$13
+
+	ld	$10,32($5)
+	dsrl	$14,$3,$7
+	or	$8,$11,$12
+	sd	$8,-16($4)
+	dsll	$9,$10,$13
+
+	daddiu	$5,$5,32
+	or	$8,$14,$9
+	bgtz	$6,.Loop
+	 sd	$8,-8($4)
+
+.Lend:	dsrl	$8,$10,$7
+	j	$31
+	sd	$8,0($4)
+EPILOGUE(mpn_rshift)
diff --git a/third_party/gmp/mpn/mips64/sub_n.asm b/third_party/gmp/mpn/mips64/sub_n.asm
new file mode 100644
index 0000000..6a69897
--- /dev/null
+++ b/third_party/gmp/mpn/mips64/sub_n.asm
@@ -0,0 +1,134 @@
+dnl  MIPS64 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 1995, 2000-2002, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	$4
+C s1_ptr	$5
+C s2_ptr	$6
+C size		$7
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+	ld	$10,0($5)
+	ld	$11,0($6)
+
+	daddiu	$7,$7,-1
+	and	$9,$7,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 move	$2,$8
+	b	.Loop0
+	 dsubu	$7,$7,$9
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	ld	$10,0($5)
+	ld	$11,0($6)
+
+	daddiu	$7,$7,-1
+	and	$9,$7,4-1	C number of limbs in first loop
+	beq	$9,$0,.L0	C if multiple of 4 limbs, skip first loop
+	 move	$2,$0
+
+	dsubu	$7,$7,$9
+
+.Loop0:	daddiu	$9,$9,-1
+	ld	$12,8($5)
+	daddu	$11,$11,$2
+	ld	$13,8($6)
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,0($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,8
+	daddiu	$6,$6,8
+	move	$10,$12
+	move	$11,$13
+	bne	$9,$0,.Loop0
+	 daddiu	$4,$4,8
+
+.L0:	beq	$7,$0,.Lend
+	 nop
+
+.Loop:	daddiu	$7,$7,-4
+
+	ld	$12,8($5)
+	dsubu	$11,$10,$11
+	ld	$13,8($6)
+	sltu	$8,$10,$11
+	dsubu	$14,$11,$2
+	sltu	$2,$11,$14
+	sd	$14,0($4)
+	or	$2,$2,$8
+
+	ld	$10,16($5)
+	dsubu	$13,$12,$13
+	ld	$11,16($6)
+	sltu	$8,$12,$13
+	dsubu	$14,$13,$2
+	sltu	$2,$13,$14
+	sd	$14,8($4)
+	or	$2,$2,$8
+
+	ld	$12,24($5)
+	dsubu	$11,$10,$11
+	ld	$13,24($6)
+	sltu	$8,$10,$11
+	dsubu	$14,$11,$2
+	sltu	$2,$11,$14
+	sd	$14,16($4)
+	or	$2,$2,$8
+
+	ld	$10,32($5)
+	dsubu	$13,$12,$13
+	ld	$11,32($6)
+	sltu	$8,$12,$13
+	dsubu	$14,$13,$2
+	sltu	$2,$13,$14
+	sd	$14,24($4)
+	or	$2,$2,$8
+
+	daddiu	$5,$5,32
+	daddiu	$6,$6,32
+
+	bne	$7,$0,.Loop
+	 daddiu	$4,$4,32
+
+.Lend:	daddu	$11,$11,$2
+	sltu	$8,$11,$2
+	dsubu	$11,$10,$11
+	sltu	$2,$10,$11
+	sd	$11,0($4)
+	j	$31
+	or	$2,$2,$8
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/README b/third_party/gmp/mpn/pa32/README
new file mode 100644
index 0000000..4323390
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/README
@@ -0,0 +1,162 @@
+Copyright 1996, 1999, 2001, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+
+This directory contains mpn functions for various HP PA-RISC chips.  Code
+that runs faster on the PA7100 and later implementations, is in the pa7100
+directory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+  Load and Store timing
+
+On the PA7000 no memory instructions can issue in the two cycles after a store.
+For the PA7100, this is reduced to one cycle.
+
+The PA7100 has a lockup-free cache, so it helps to schedule loads and the
+dependent instruction really far from each other.
+
+STATUS
+
+1. mpn_mul_1 could be improved to 6.5 cycles/limb on the PA7100, using the
+   instructions below (but some sw pipelining is needed to avoid the
+   xmpyu-fstds delay):
+
+	fldds	s1_ptr
+
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+
+	addc
+	stws	res_ptr
+	addc
+	stws	res_ptr
+
+	addib	Loop
+
+2. mpn_addmul_1 could be improved from the current 10 to 7.5 cycles/limb
+   (asymptotically) on the PA7100, using the instructions below.  With proper
+   sw pipelining and the unrolling level below, the speed becomes 8
+   cycles/limb.
+
+	fldds	s1_ptr
+	fldds	s1_ptr
+
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+	xmpyu
+	fstds	N(%r30)
+
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	ldws	N(%r30)
+	addc
+	addc
+	addc
+	addc
+	addc	%r0,%r0,cy-limb
+
+	ldws	res_ptr
+	ldws	res_ptr
+	ldws	res_ptr
+	ldws	res_ptr
+	add
+	stws	res_ptr
+	addc
+	stws	res_ptr
+	addc
+	stws	res_ptr
+	addc
+	stws	res_ptr
+
+	addib
+
+3. For the PA8000 we have to stick to using 32-bit limbs until compiler
+   support emerges.  But we want to use 64-bit operations whenever possible,
+   in particular for loads and stores.  It is possible to handle mpn_add_n
+   efficiently by rotating when s1/s2 are aligned, and by masking + bit-field
+   inserting when they are not.  The speed should double compared to the
+   code used today.
+
+
+
+
+LABEL SYNTAX
+
+The HP-UX assembler takes labels starting in column 0 with no colon,
+
+	L$loop  ldws,mb -4(0,%r25),%r22
+
+Gas on hppa GNU/Linux however requires a colon,
+
+	L$loop: ldws,mb -4(0,%r25),%r22
+
+This is covered by using LDEF() from asm-defs.m4.  An alternative would be
+to use ".label" which is accepted by both,
+
+		.label  L$loop
+		ldws,mb -4(0,%r25),%r22
+
+but that's not as nice to look at, not if you're used to assembler code
+having labels in column 0.
+
+
+
+
+REFERENCES
+
+Hewlett Packard, "HP Assembler Reference Manual", 9th edition, June 1998,
+part number 92432-90012.
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/pa32/add_n.asm b/third_party/gmp/mpn/pa32/add_n.asm
new file mode 100644
index 0000000..46f3937
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/add_n.asm
@@ -0,0 +1,63 @@
+dnl  HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	gr26
+C s1_ptr	gr25
+C s2_ptr	gr24
+C size		gr23
+
+C One might want to unroll this as for other processors, but it turns out that
+C the data cache contention after a store makes such unrolling useless.  We
+C can't come under 5 cycles/limb anyway.
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,=		-1,%r23,L(end)	C check for (SIZE == 1)
+	 add		%r20,%r19,%r28	C add first limbs ignoring cy
+
+LDEF(loop)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,<>	-1,%r23,L(loop)
+	 addc		%r20,%r19,%r28
+
+LDEF(end)
+	stws		%r28,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r0,%r28
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/gmp-mparam.h b/third_party/gmp/mpn/pa32/gmp-mparam.h
new file mode 100644
index 0000000..377efcb
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/gmp-mparam.h
@@ -0,0 +1,61 @@
+/* HP-PA 1.0 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* These values are for the PA7100 using GCC.  */
+/* Generated by tuneup.c, 2000-10-27. */
+
+#ifndef MUL_TOOM22_THRESHOLD
+#define MUL_TOOM22_THRESHOLD      30
+#endif
+#ifndef MUL_TOOM33_THRESHOLD
+#define MUL_TOOM33_THRESHOLD     141
+#endif
+
+#ifndef SQR_TOOM2_THRESHOLD
+#define SQR_TOOM2_THRESHOLD       59
+#endif
+#ifndef SQR_TOOM3_THRESHOLD
+#define SQR_TOOM3_THRESHOLD      177
+#endif
+
+#ifndef DIV_DC_THRESHOLD
+#define DIV_DC_THRESHOLD         108
+#endif
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD            18
+#endif
+
+#ifndef GCDEXT_THRESHOLD
+#define GCDEXT_THRESHOLD          33
+#endif
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/addmul_1.asm b/third_party/gmp/mpn/pa32/hppa1_1/addmul_1.asm
new file mode 100644
index 0000000..ec2f219
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/addmul_1.asm
@@ -0,0 +1,106 @@
+dnl  HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s1_ptr	r25
+C size		r24
+C s2_limb	r23
+
+C This runs at 11 cycles/limb on a PA7000.  With the instructions used, it
+C cannot run faster due to data cache contention after a store.  On the PA7100
+C it runs at 10 cycles/limb.
+
+C There are some ideas described in mul_1.asm that apply to this code too.
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+C	.callinfo	frame=64,no_calls
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		C move s2_limb ...
+	addib,=		-1,%r24,L(just_one_limb)
+	 fldws		-16(%r30),%fr4		C ... into fr4
+	add		%r0,%r0,%r0		C clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		C least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L(end)
+	 ldw		-12(%r30),%r1
+
+C Main loop
+LDEF(loop)
+	ldws		0(%r26),%r29
+	fldws,ma	4(%r25),%fr5
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addc		%r0,%r28,%r28
+	addib,<>	-1,%r24,L(loop)
+	 ldw		-12(%r30),%r1
+
+LDEF(end)
+	ldw		0(%r26),%r29
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	add		%r29,%r19,%r19
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+LDEF(just_one_limb)
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1
+	ldw		-16(%r30),%r28
+	add		%r29,%r1,%r19
+	stw		%r19,0(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/gmp-mparam.h b/third_party/gmp/mpn/pa32/hppa1_1/gmp-mparam.h
new file mode 100644
index 0000000..1261b24
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* HP-PA 1.1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2004-02-07, gcc 2.8 (pa7100/100MHz) */
+
+#define MUL_TOOM22_THRESHOLD             30
+#define MUL_TOOM33_THRESHOLD             89
+
+#define SQR_BASECASE_THRESHOLD            4
+#define SQR_TOOM2_THRESHOLD              55
+#define SQR_TOOM3_THRESHOLD             101
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 84
+#define POWM_THRESHOLD                  166
+
+#define HGCD_THRESHOLD                  231
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                823
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           5
+#define DIVREM_1_UNNORM_THRESHOLD        11
+#define MOD_1_NORM_THRESHOLD              5
+#define MOD_1_UNNORM_THRESHOLD           10
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             13
+#define GET_STR_PRECOMPUTE_THRESHOLD     23
+#define SET_STR_THRESHOLD              6589
+
+#define MUL_FFT_TABLE  { 464, 928, 1920, 4608, 14336, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          480
+#define MUL_FFT_THRESHOLD              3328
+
+#define SQR_FFT_TABLE  { 528, 1184, 2176, 5632, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          520
+#define SQR_FFT_THRESHOLD              3328
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/mul_1.asm b/third_party/gmp/mpn/pa32/hppa1_1/mul_1.asm
new file mode 100644
index 0000000..6e60c2f
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/mul_1.asm
@@ -0,0 +1,102 @@
+dnl  HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl  result in a second limb vector.
+
+dnl  Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s1_ptr	r25
+C size		r24
+C s2_limb	r23
+
+C This runs at 9 cycles/limb on a PA7000.  With the instructions used, it
+C cannot run faster due to data cache contention after a store.  On the PA7100
+C it runs at 7 cycles/limb.
+
+C We could use fldds to read two limbs at a time from the S1 array, and that
+C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
+C PA7100, respectively.  We don't do that since it does not seem worth the
+C (alignment) troubles...
+
+C At least the PA7100 is rumored to be able to deal with cache-misses without
+C stalling instruction issue.  If this is true, and the cache is actually also
+C lockup-free, we should use a deeper software pipeline, and load from S1 very
+C early!  (The loads and stores to -12(sp) will surely be in the cache.)
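+
+C PA-RISC 1.1 has no direct FPU-to-integer register moves, so each xmpyu
+C product is staged through the frame with fstds and fetched back with ldw.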
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+C	.callinfo	frame=64,no_calls
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		C move s2_limb ...
+	addib,=		-1,%r24,L(just_one_limb)
+	 fldws		-16(%r30),%fr4		C ... into fr4
+	add		%r0,%r0,%r0		C clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		C least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L(end)
+	 ldw		-12(%r30),%r1
+
+C Main loop
+LDEF(loop)
+	fldws,ma	4(%r25),%fr5
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addib,<>	-1,%r24,L(loop)
+	 ldw		-12(%r30),%r1
+
+LDEF(end)
+	stws,ma		%r19,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	stws,ma		%r19,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+LDEF(just_one_limb)
+	xmpyu		%fr4,%fr5,%fr6
+	fstds		%fr6,-16(%r30)
+	ldw		-16(%r30),%r28
+	ldo		-64(%r30),%r30
+	bv		0(%r2)
+	 fstws		%fr6R,0(%r26)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm
new file mode 100644
index 0000000..b96d403
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/add_n.asm
@@ -0,0 +1,83 @@
+dnl  HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.  Optimized for the PA7100, where it runs at
+dnl  4.25 cycles/limb.
+
+dnl  Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s1_ptr	r25
+C s2_ptr	r24
+C size		r23
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,<=	-5,%r23,L(rest)
+	 add		%r20,%r19,%r28	C add first limbs ignoring cy
+
+LDEF(loop)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addc		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addc		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addc		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-4,%r23,L(loop)
+	addc		%r20,%r19,%r28
+
+LDEF(rest)
+	addib,=		4,%r23,L(end)
+	nop
+
+LDEF(eloop)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-1,%r23,L(eloop)
+	addc		%r20,%r19,%r28
+
+LDEF(end)
+	stws		%r28,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r0,%r28
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
new file mode 100644
index 0000000..fb16100
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
@@ -0,0 +1,201 @@
+dnl  HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl  add the result to a second limb vector.
+
+dnl  Copyright 1995, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`res_ptr',`%r26')
+define(`s1_ptr',`%r25')
+define(`size_param',`%r24')
+define(`s2_limb',`%r23')
+
+define(`cylimb',`%r28')
+define(`s0',`%r19')
+define(`s1',`%r20')
+define(`s2',`%r3')
+define(`s3',`%r4')
+define(`lo0',`%r21')
+define(`lo1',`%r5')
+define(`lo2',`%r6')
+define(`lo3',`%r7')
+define(`hi0',`%r22')
+define(`hi1',`%r23')				C safe to reuse
+define(`hi2',`%r29')
+define(`hi3',`%r1')
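+
+C s2_limb is staged once into %fr31R; each fldds,ma then fetches two s1
+C limbs and xmpyu multiplies each 32-bit half separately, four limbs per
+C iteration, with the 64-bit products parked in the frame before the
+C integer carry chain picks them up.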
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+C	.callinfo	frame=128,no_calls
+
+	ldo	128(%r30),%r30
+	stws	s2_limb,-16(%r30)
+	add	 %r0,%r0,cylimb			C clear cy and cylimb
+	addib,<	-4,size_param,L(few_limbs)
+	fldws	-16(%r30),%fr31R
+
+	ldo	-112(%r30),%r31
+	stw	%r3,-96(%r30)
+	stw	%r4,-92(%r30)
+	stw	%r5,-88(%r30)
+	stw	%r6,-84(%r30)
+	stw	%r7,-80(%r30)
+
+	bb,>=,n	 s1_ptr,29,L(0)
+
+	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r31)
+	ldws	-16(%r31),cylimb
+	ldws	-12(%r31),lo0
+	add	 s0,lo0,s0
+	addib,< -1,size_param,L(few_limbs)
+	stws,ma	 s0,4(res_ptr)
+
+C start software pipeline ----------------------------------------------------
+LDEF(0)
+	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	xmpyu	 %fr4L,%fr31R,%fr5
+	xmpyu	 %fr4R,%fr31R,%fr6
+	xmpyu	 %fr8L,%fr31R,%fr9
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	fstds	 %fr6,-8(%r31)
+	fstds	 %fr9,0(%r31)
+	fstds	 %fr10,8(%r31)
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	addc	 lo1,hi0,lo1
+	addc	 lo2,hi1,lo2
+	addc	 lo3,hi2,lo3
+
+	addib,<	 -4,size_param,L(end)
+	addc	 %r0,hi3,cylimb			C propagate carry into cylimb
+C main loop ------------------------------------------------------------------
+LDEF(loop)
+	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4L,%fr31R,%fr5
+	ldws	 4(res_ptr),s1
+	xmpyu	 %fr4R,%fr31R,%fr6
+	ldws	 8(res_ptr),s2
+	xmpyu	 %fr8L,%fr31R,%fr9
+	ldws	12(res_ptr),s3
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	add	 s0,lo0,s0
+	fstds	 %fr6,-8(%r31)
+	addc	 s1,lo1,s1
+	fstds	 %fr9,0(%r31)
+	addc	 s2,lo2,s2
+	fstds	 %fr10,8(%r31)
+	addc	 s3,lo3,s3
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	stws,ma	 s0,4(res_ptr)
+	addc	 lo1,hi0,lo1
+	stws,ma	 s1,4(res_ptr)
+	addc	 lo2,hi1,lo2
+	stws,ma	 s2,4(res_ptr)
+	addc	 lo3,hi2,lo3
+	stws,ma	 s3,4(res_ptr)
+
+	addib,>= -4,size_param,L(loop)
+	addc	 %r0,hi3,cylimb			C propagate carry into cylimb
+C finish software pipeline ---------------------------------------------------
+LDEF(end)
+	ldws	 0(res_ptr),s0
+	ldws	 4(res_ptr),s1
+	ldws	 8(res_ptr),s2
+	ldws	12(res_ptr),s3
+
+	add	 s0,lo0,s0
+	stws,ma	 s0,4(res_ptr)
+	addc	 s1,lo1,s1
+	stws,ma	 s1,4(res_ptr)
+	addc	 s2,lo2,s2
+	stws,ma	 s2,4(res_ptr)
+	addc	 s3,lo3,s3
+	stws,ma	 s3,4(res_ptr)
+
+C restore callee-saves registers ---------------------------------------------
+	ldw	-96(%r30),%r3
+	ldw	-92(%r30),%r4
+	ldw	-88(%r30),%r5
+	ldw	-84(%r30),%r6
+	ldw	-80(%r30),%r7
+
+LDEF(few_limbs)
+	addib,=,n 4,size_param,L(ret)
+
+LDEF(loop2)
+	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r30)
+	ldws	-16(%r30),hi0
+	ldws	-12(%r30),lo0
+	addc	 lo0,cylimb,lo0
+	addc	 %r0,hi0,cylimb
+	add	 s0,lo0,s0
+	stws,ma	 s0,4(res_ptr)
+	addib,<> -1,size_param,L(loop2)
+	nop
+
+LDEF(ret)
+	addc	 %r0,cylimb,cylimb
+	bv	 0(%r2)
+	ldo	 -128(%r30),%r30
+EPILOGUE(mpn_addmul_1)
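
For reference, the operation this software-pipelined loop implements is GMP's
mpn_addmul_1: multiply each limb of one vector by a single limb and add the
double-limb products into the result vector, returning the final carry limb.
A minimal portable C sketch of those semantics (assuming 32-bit limbs and a
64-bit intermediate; the function name is ours, not GMP's):

    #include <stdint.h>

    /* rp[0..n-1] += up[0..n-1] * vl; returns the carry-out limb.  The
       64-bit sum cannot overflow: rp[i] + up[i]*vl + carry <= 2^64 - 1.  */
    uint32_t ref_addmul_1(uint32_t *rp, const uint32_t *up, long n, uint32_t vl)
    {
        uint32_t carry = 0;
        for (long i = 0; i < n; i++) {
            uint64_t t = (uint64_t)up[i] * vl + rp[i] + carry;
            rp[i] = (uint32_t)t;          /* low product limb, accumulated */
            carry = (uint32_t)(t >> 32);  /* high limb becomes next carry */
        }
        return carry;
    }

The assembly obtains the same 32x32->64 products from xmpyu in the FPU (PA-RISC
has no integer multiply instruction), staging them through the stack slots at
-16(%r31) and up, which is why loads and stores dominate the loop body.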
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm
new file mode 100644
index 0000000..d65db2a
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/lshift.asm
@@ -0,0 +1,95 @@
+dnl  HP-PA  mpn_lshift -- Shift a number left.
+dnl  Optimized for the PA7100, where it runs at 3.25 cycles/limb.
+
+dnl  Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s_ptr		r25
+C size		r24
+C cnt		r23
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	sh2add		%r24,%r25,%r25
+	sh2add		%r24,%r26,%r26
+	ldws,mb		-4(0,%r25),%r22
+	subi		32,%r23,%r1
+	mtsar		%r1
+	addib,=		-1,%r24,L(0004)
+	vshd		%r0,%r22,%r28		C compute carry out limb
+	ldws,mb		-4(0,%r25),%r29
+	addib,<=	-5,%r24,L(rest)
+	vshd		%r22,%r29,%r20
+
+LDEF(loop)
+	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r22,%r29,%r20
+	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,>		-4,%r24,L(loop)
+	vshd		%r22,%r29,%r20
+
+LDEF(rest)
+	addib,=		4,%r24,L(end1)
+	nop
+
+LDEF(eloop)
+	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	addib,<=	-1,%r24,L(end2)
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,>		-1,%r24,L(eloop)
+	vshd		%r22,%r29,%r20
+
+LDEF(end1)
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+
+LDEF(end2)
+	stws,mb		%r20,-4(0,%r26)
+
+LDEF(0004)
+	vshd		%r22,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+EPILOGUE()
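
The operation here is mpn_lshift: shift an n-limb number left by cnt bits
(0 < cnt < 32), working from the most significant end, and return the bits
shifted out of the top limb.  Each output limb is a funnel shift of two
adjacent input limbs, which is exactly what vshd computes once SAR is loaded
with 32-cnt.  A reference sketch in C (naming is ours; the mpn_rshift below
is the mirror image, scanning low-to-high):

    #include <stdint.h>

    /* rp[] = up[] << cnt, high-to-low; returns the carry-out limb.  */
    uint32_t ref_lshift(uint32_t *rp, const uint32_t *up, long n, unsigned cnt)
    {
        uint32_t retval = up[n - 1] >> (32 - cnt);  /* bits shifted out */
        for (long i = n - 1; i > 0; i--)            /* funnel shift pairs */
            rp[i] = (up[i] << cnt) | (up[i - 1] >> (32 - cnt));
        rp[0] = up[0] << cnt;
        return retval;
    }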
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm
new file mode 100644
index 0000000..f7896fc
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/rshift.asm
@@ -0,0 +1,92 @@
+dnl  HP-PA  mpn_rshift -- Shift a number right.
+dnl  Optimized for the PA7100, where it runs at 3.25 cycles/limb.
+
+dnl  Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s_ptr		r25
+C size		r24
+C cnt		r23
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	ldws,ma		4(0,%r25),%r22
+	mtsar		%r23
+	addib,=		-1,%r24,L(0004)
+	vshd		%r22,%r0,%r28		C compute carry out limb
+	ldws,ma		4(0,%r25),%r29
+	addib,<=	-5,%r24,L(rest)
+	vshd		%r29,%r22,%r20
+
+LDEF(loop)
+	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r29,%r22,%r20
+	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,>		-4,%r24,L(loop)
+	vshd		%r29,%r22,%r20
+
+LDEF(rest)
+	addib,=		4,%r24,L(end1)
+	nop
+
+LDEF(eloop)
+	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	addib,<=	-1,%r24,L(end2)
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,>		-1,%r24,L(eloop)
+	vshd		%r29,%r22,%r20
+
+LDEF(end1)
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r0,%r29,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+
+LDEF(end2)
+	stws,ma		%r20,4(0,%r26)
+
+LDEF(0004)
+	vshd		%r0,%r22,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm
new file mode 100644
index 0000000..df3f6e8
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/sub_n.asm
@@ -0,0 +1,84 @@
+dnl  HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.  Optimized for the PA7100, where
+dnl  it runs at 4.25 cycles/limb.
+
+dnl  Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s1_ptr	r25
+C s2_ptr	r24
+C size		r23
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,<=	-5,%r23,L(rest)
+	 sub		%r20,%r19,%r28	C subtract first limbs ignoring cy
+
+LDEF(loop)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	subb		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	subb		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	subb		%r20,%r19,%r28
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-4,%r23,L(loop)
+	subb		%r20,%r19,%r28
+
+LDEF(rest)
+	addib,=		4,%r23,L(end)
+	nop
+
+LDEF(eloop)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,>		-1,%r23,L(eloop)
+	subb		%r20,%r19,%r28
+
+LDEF(end)
+	stws		%r28,0(0,%r26)
+	addc		%r0,%r0,%r28
+	bv		0(%r2)
+	 subi		1,%r28,%r28
+EPILOGUE()
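
Semantically this is mpn_sub_n: limb-wise subtraction with borrow propagation,
returning the borrow out of the top limb.  Note the epilogue: PA-RISC's SUBB
consumes the carry flag as a "no borrow" bit, so the final addc/subi pair
converts the leftover carry into a conventional 0-or-1 borrow.  A reference
sketch in C (names ours):

    #include <stdint.h>

    /* rp[] = s1[] - s2[]; returns the borrow-out (0 or 1).  */
    uint32_t ref_sub_n(uint32_t *rp, const uint32_t *s1,
                       const uint32_t *s2, long n)
    {
        uint32_t borrow = 0;
        for (long i = 0; i < n; i++) {
            uint64_t d = (uint64_t)s1[i] - s2[i] - borrow;
            rp[i] = (uint32_t)d;
            borrow = (uint32_t)(d >> 32) & 1;  /* 1 iff the limb wrapped */
        }
        return borrow;
    }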
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm
new file mode 100644
index 0000000..5ea08cb
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/pa7100/submul_1.asm
@@ -0,0 +1,207 @@
+dnl  HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1995, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`res_ptr',`%r26')
+define(`s1_ptr',`%r25')
+define(`size_param',`%r24')
+define(`s2_limb',`%r23')
+
+define(`cylimb',`%r28')
+define(`s0',`%r19')
+define(`s1',`%r20')
+define(`s2',`%r3')
+define(`s3',`%r4')
+define(`lo0',`%r21')
+define(`lo1',`%r5')
+define(`lo2',`%r6')
+define(`lo3',`%r7')
+define(`hi0',`%r22')
+define(`hi1',`%r23')				C safe to reuse
+define(`hi2',`%r29')
+define(`hi3',`%r1')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C	.callinfo	frame=128,no_calls
+
+	ldo	128(%r30),%r30
+	stws	s2_limb,-16(%r30)
+	add	 %r0,%r0,cylimb			C clear cy and cylimb
+	addib,<	-4,size_param,L(few_limbs)
+	fldws	-16(%r30),%fr31R
+
+	ldo	-112(%r30),%r31
+	stw	%r3,-96(%r30)
+	stw	%r4,-92(%r30)
+	stw	%r5,-88(%r30)
+	stw	%r6,-84(%r30)
+	stw	%r7,-80(%r30)
+
+	bb,>=,n	 s1_ptr,29,L(0)
+
+	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r31)
+	ldws	-16(%r31),cylimb
+	ldws	-12(%r31),lo0
+	sub	 s0,lo0,s0
+	add	 s0,lo0,%r0			C invert cy
+	addib,< -1,size_param,L(few_limbs)
+	stws,ma	 s0,4(res_ptr)
+
+C start software pipeline ----------------------------------------------------
+LDEF(0)
+	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	xmpyu	 %fr4L,%fr31R,%fr5
+	xmpyu	 %fr4R,%fr31R,%fr6
+	xmpyu	 %fr8L,%fr31R,%fr9
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	fstds	 %fr6,-8(%r31)
+	fstds	 %fr9,0(%r31)
+	fstds	 %fr10,8(%r31)
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	addc	 lo1,hi0,lo1
+	addc	 lo2,hi1,lo2
+	addc	 lo3,hi2,lo3
+
+	addib,<	 -4,size_param,L(end)
+	addc	 %r0,hi3,cylimb			C propagate carry into cylimb
+C main loop ------------------------------------------------------------------
+LDEF(loop)
+	fldds,ma 8(s1_ptr),%fr4
+	fldds,ma 8(s1_ptr),%fr8
+
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4L,%fr31R,%fr5
+	ldws	 4(res_ptr),s1
+	xmpyu	 %fr4R,%fr31R,%fr6
+	ldws	 8(res_ptr),s2
+	xmpyu	 %fr8L,%fr31R,%fr9
+	ldws	12(res_ptr),s3
+	xmpyu	 %fr8R,%fr31R,%fr10
+
+	fstds	 %fr5,-16(%r31)
+	sub	 s0,lo0,s0
+	fstds	 %fr6,-8(%r31)
+	subb	 s1,lo1,s1
+	fstds	 %fr9,0(%r31)
+	subb	 s2,lo2,s2
+	fstds	 %fr10,8(%r31)
+	subb	 s3,lo3,s3
+	subb	 %r0,%r0,lo0			C these two insns ...
+	add	 lo0,lo0,%r0			C ... just invert cy
+
+	ldws   -16(%r31),hi0
+	ldws   -12(%r31),lo0
+	ldws	-8(%r31),hi1
+	ldws	-4(%r31),lo1
+	ldws	 0(%r31),hi2
+	ldws	 4(%r31),lo2
+	ldws	 8(%r31),hi3
+	ldws	12(%r31),lo3
+
+	addc	 lo0,cylimb,lo0
+	stws,ma	 s0,4(res_ptr)
+	addc	 lo1,hi0,lo1
+	stws,ma	 s1,4(res_ptr)
+	addc	 lo2,hi1,lo2
+	stws,ma	 s2,4(res_ptr)
+	addc	 lo3,hi2,lo3
+	stws,ma	 s3,4(res_ptr)
+
+	addib,>= -4,size_param,L(loop)
+	addc	 %r0,hi3,cylimb			C propagate carry into cylimb
+C finish software pipeline ---------------------------------------------------
+LDEF(end)
+	ldws	 0(res_ptr),s0
+	ldws	 4(res_ptr),s1
+	ldws	 8(res_ptr),s2
+	ldws	12(res_ptr),s3
+
+	sub	 s0,lo0,s0
+	stws,ma	 s0,4(res_ptr)
+	subb	 s1,lo1,s1
+	stws,ma	 s1,4(res_ptr)
+	subb	 s2,lo2,s2
+	stws,ma	 s2,4(res_ptr)
+	subb	 s3,lo3,s3
+	stws,ma	 s3,4(res_ptr)
+	subb	 %r0,%r0,lo0			C these two insns ...
+	add	 lo0,lo0,%r0			C ... invert cy
+
+C restore callee-saves registers ---------------------------------------------
+	ldw	-96(%r30),%r3
+	ldw	-92(%r30),%r4
+	ldw	-88(%r30),%r5
+	ldw	-84(%r30),%r6
+	ldw	-80(%r30),%r7
+
+LDEF(few_limbs)
+	addib,=,n 4,size_param,L(ret)
+
+LDEF(loop2)
+	fldws,ma 4(s1_ptr),%fr4
+	ldws	 0(res_ptr),s0
+	xmpyu	 %fr4,%fr31R,%fr5
+	fstds	 %fr5,-16(%r30)
+	ldws	-16(%r30),hi0
+	ldws	-12(%r30),lo0
+	addc	 lo0,cylimb,lo0
+	addc	 %r0,hi0,cylimb
+	sub	 s0,lo0,s0
+	add	 s0,lo0,%r0			C invert cy
+	stws,ma	 s0,4(res_ptr)
+	addib,<> -1,size_param,L(loop2)
+	nop
+
+LDEF(ret)
+	addc	 %r0,cylimb,cylimb
+	bv	 0(%r2)
+	ldo	 -128(%r30),%r30
+EPILOGUE(mpn_submul_1)
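
This is the subtractive twin of the addmul_1 above: rp[] -= up[] * limb, with
the borrow-out limb returned.  The otherwise puzzling "invert cy" instruction
pairs exist because PA-RISC records a subtraction outcome as a carry ("no
borrow") flag, so the flag has to be flipped before it feeds back into the
addc chain that accumulates the product high limbs.  A reference sketch of
the semantics in C (names ours):

    #include <stdint.h>

    /* rp[0..n-1] -= up[0..n-1] * vl; returns the borrow-out limb.  */
    uint32_t ref_submul_1(uint32_t *rp, const uint32_t *up, long n, uint32_t vl)
    {
        uint32_t cy = 0;                     /* combined borrow/carry limb */
        for (long i = 0; i < n; i++) {
            uint64_t prod = (uint64_t)up[i] * vl + cy;
            uint32_t lo = (uint32_t)prod;
            uint32_t r = rp[i];
            rp[i] = r - lo;
            cy = (uint32_t)(prod >> 32) + (r < lo);  /* borrow from limb */
        }
        return cy;
    }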
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm b/third_party/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm
new file mode 100644
index 0000000..1c7a18e
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/sqr_diagonal.asm
@@ -0,0 +1,60 @@
+dnl  HP-PA 1.1 32-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This code runs at 6 cycles/limb on the PA7100 and 2.5 cycles/limb on PA8x00.
+C 2-way unrolling wouldn't help the PA7100; it could, however, bring times down
+C to 2.0 cycles/limb for the PA8x00.
+
+C INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+	ldo		4(rp),rp
+	fldws,ma	4(up),%fr4r
+	addib,=		-1,n,L(exit)
+	xmpyu		%fr4r,%fr4r,%fr5
+
+LDEF(loop)
+	fldws,ma	4(up),%fr4r
+	fstws		%fr5r,-4(rp)
+	fstws,ma	%fr5l,8(rp)
+	addib,<>	-1,n,L(loop)
+	xmpyu		%fr4r,%fr4r,%fr5
+
+LDEF(exit)
+	fstws		%fr5r,-4(rp)
+	bv		0(%r2)
+	fstws		%fr5l,0(rp)
+EPILOGUE(mpn_sqr_diagonal)
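
mpn_sqr_diagonal writes the 2n-limb "diagonal" of a squaring -- the
double-limb square of each input limb -- and is used by GMP's squaring
basecase on CPUs that provide it.  Each xmpyu in the loop is one 32x32->64
square.  In C (names ours):

    #include <stdint.h>

    /* rp[2i], rp[2i+1] = low, high limbs of up[i]^2, for i in [0, n).  */
    void ref_sqr_diagonal(uint32_t *rp, const uint32_t *up, long n)
    {
        for (long i = 0; i < n; i++) {
            uint64_t sq = (uint64_t)up[i] * up[i];
            rp[2 * i]     = (uint32_t)sq;          /* low limb  */
            rp[2 * i + 1] = (uint32_t)(sq >> 32);  /* high limb */
        }
    }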
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/submul_1.asm b/third_party/gmp/mpn/pa32/hppa1_1/submul_1.asm
new file mode 100644
index 0000000..a9b11d2
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/submul_1.asm
@@ -0,0 +1,115 @@
+dnl  HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl  the result from a second limb vector.
+
+dnl  Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r26
+C s1_ptr	r25
+C size		r24
+C s2_limb	r23
+
+C This runs at 12 cycles/limb on a PA7000.  With the instructions used, it
+C cannot run faster, due to data cache contention after a store.  On the PA7100
+C it runs at 11 cycles/limb.
+
+C There are some ideas described in mul_1.asm that apply to this code too.
+
+C It seems possible to make this run as fast as mpn_addmul_1, if we use
+C	sub,>>=	%r29,%r19,%r22
+C	addi	1,%r28,%r28
+C but that requires reworking the hairy software pipeline...
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C	.callinfo	frame=64,no_calls
+
+	ldo		64(%r30),%r30
+	fldws,ma	4(%r25),%fr5
+	stw		%r23,-16(%r30)		C move s2_limb ...
+	addib,=		-1,%r24,L(just_one_limb)
+	 fldws		-16(%r30),%fr4		C ... into fr4
+	add		%r0,%r0,%r0		C clear carry
+	xmpyu		%fr4,%fr5,%fr6
+	fldws,ma	4(%r25),%fr7
+	fstds		%fr6,-16(%r30)
+	xmpyu		%fr4,%fr7,%fr8
+	ldw		-12(%r30),%r19		C least significant limb in product
+	ldw		-16(%r30),%r28
+
+	fstds		%fr8,-16(%r30)
+	addib,=		-1,%r24,L(end)
+	 ldw		-12(%r30),%r1
+
+C Main loop
+LDEF(loop)
+	ldws		0(%r26),%r29
+	fldws,ma	4(%r25),%fr5
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		-16(%r30),%r28
+	fstds		%fr6,-16(%r30)
+	addc		%r0,%r28,%r28
+	addib,<>	-1,%r24,L(loop)
+	 ldw		-12(%r30),%r1
+
+LDEF(end)
+	ldw		0(%r26),%r29
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r28,%r1,%r19
+	ldw		-16(%r30),%r28
+	ldws		0(%r26),%r29
+	addc		%r0,%r28,%r28
+	sub		%r29,%r19,%r22
+	add		%r22,%r19,%r0
+	stws,ma		%r22,4(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+
+LDEF(just_one_limb)
+	xmpyu		%fr4,%fr5,%fr6
+	ldw		0(%r26),%r29
+	fstds		%fr6,-16(%r30)
+	ldw		-12(%r30),%r1
+	ldw		-16(%r30),%r28
+	sub		%r29,%r1,%r22
+	add		%r22,%r1,%r0
+	stw		%r22,0(%r26)
+	addc		%r0,%r28,%r28
+	bv		0(%r2)
+	 ldo		-64(%r30),%r30
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/udiv.asm b/third_party/gmp/mpn/pa32/hppa1_1/udiv.asm
new file mode 100644
index 0000000..626ecd2
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/udiv.asm
@@ -0,0 +1,102 @@
+dnl  HP-PA  __udiv_qrnnd division support, used from longlong.h.
+dnl  This version runs fast on PA 7000 and later.
+
+dnl  Copyright 1993, 1994, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	gr26
+C n1		gr25
+C n0		gr24
+C d		gr23
+
+C This file has caused a lot of trouble, since it demands PIC reference to
+C static data, which triggers bugs in gas (at least version 2.7 through
+C 2.11.2).  When the bug is triggered, many bogus relocs are generated.  The
+C current solution is to stuff data right into the code, and refer to it using
+C absolute offsets.  Fragile to be sure, but nothing else seems to work.
+
+ASM_START()
+ifdef(`PIC',`',
+`	RODATA
+	INT64(0000, 0x43f00000, 0x0)	C 2^64
+')
+
+PROLOGUE(mpn_udiv_qrnnd)
+C	.callinfo	frame=64,no_calls
+
+	ldo		64(%r30),%r30
+
+	stws		%r25,-16(0,%r30)	C n_hi
+	stws		%r24,-12(0,%r30)	C n_lo
+
+ifdef(`PIC',
+`	bl		.+20,%r31
+	dep		%r0,31,2,%r31
+	.word	0x0				C padding for alignment
+	.word	0x43f00000, 0x0			C 2^64
+	ldo		4(%r31),%r31',
+`	ldil		`L'%L(0000),%r31
+	ldo		R%L(0000)(%r31),%r31')
+
+	fldds		-16(0,%r30),%fr5
+	stws		%r23,-12(0,%r30)
+	comib,<=	0,%r25,L(1)
+	fcnvxf,dbl,dbl	%fr5,%fr5
+	fldds		0(0,%r31),%fr4
+	fadd,dbl	%fr4,%fr5,%fr5
+
+LDEF(1)
+	fcpy,sgl	%fr0,%fr6L
+	fldws		-12(0,%r30),%fr6R
+	fcnvxf,dbl,dbl	%fr6,%fr4
+
+	fdiv,dbl	%fr5,%fr4,%fr5
+
+	fcnvfx,dbl,dbl	%fr5,%fr4
+	fstws		%fr4R,-16(%r30)
+	xmpyu		%fr4R,%fr6R,%fr6
+	ldws		-16(%r30),%r28
+	fstds		%fr6,-16(0,%r30)
+	ldws		-12(0,%r30),%r21
+	ldws		-16(0,%r30),%r20
+	sub		%r24,%r21,%r22
+	subb		%r25,%r20,%r20
+	comib,=		0,%r20,L(2)
+	ldo		-64(%r30),%r30
+
+	add		%r22,%r23,%r22
+	ldo		-1(%r28),%r28
+
+LDEF(2)
+	bv		0(%r2)
+	stws		%r22,0(0,%r26)
+
+EPILOGUE(mpn_udiv_qrnnd)
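
The contract implemented here (with the usual precondition n1 < d, so the
quotient fits in one limb) is: divide the two-limb value (n1,n0) by d, store
the remainder through rem_ptr, and return the quotient.  This version converts
(n1,n0) to double with fcnvxf (adding the embedded 2^64 constant when the top
bit of n1 would make the signed conversion negative), divides with fdiv, and
lets the trailing comclr/addl apply at most one correction step.  The contract
itself in C (names ours):

    #include <stdint.h>

    /* Divide (n1,n0) by d; requires n1 < d.  Remainder out, quotient back.  */
    uint32_t ref_udiv_qrnnd(uint32_t *rem_ptr, uint32_t n1,
                            uint32_t n0, uint32_t d)
    {
        uint64_t n = ((uint64_t)n1 << 32) | n0;
        *rem_ptr = (uint32_t)(n % d);
        return (uint32_t)(n / d);
    }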
diff --git a/third_party/gmp/mpn/pa32/hppa1_1/umul.asm b/third_party/gmp/mpn/pa32/hppa1_1/umul.asm
new file mode 100644
index 0000000..18b923c
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa1_1/umul.asm
@@ -0,0 +1,47 @@
+dnl  Copyright 1999, 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+C	.callinfo frame=64,no_calls
+
+	ldo	64(%r30),%r30
+	stw	%r25,-16(0,%r30)
+	fldws	-16(0,%r30),%fr22R
+	stw	%r24,-16(0,%r30)
+	fldws	-16(0,%r30),%fr22L
+	xmpyu	%fr22R,%fr22L,%fr22
+	fstds	%fr22,-16(0,%r30)
+	ldw	-16(0,%r30),%r28
+	ldw	-12(0,%r30),%r29
+	stw	%r29,0(0,%r26)
+	bv	0(%r2)
+	ldo	-64(%r30),%r30
+EPILOGUE()
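
This is mpn_umul_ppmm: one full 32x32->64 multiply done with a single xmpyu.
PA-RISC is big-endian, so after the fstds the word at -16(%r30) is the high
half of the product (returned in %r28) and the word at -12(%r30) is the low
half (stored through the result pointer in %r26).  In C (name ours):

    #include <stdint.h>

    /* Store the low product limb through lowptr, return the high limb.  */
    uint32_t ref_umul_ppmm(uint32_t *lowptr, uint32_t u, uint32_t v)
    {
        uint64_t p = (uint64_t)u * v;
        *lowptr = (uint32_t)p;
        return (uint32_t)(p >> 32);
    }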
diff --git a/third_party/gmp/mpn/pa32/hppa2_0/add_n.asm b/third_party/gmp/mpn/pa32/hppa2_0/add_n.asm
new file mode 100644
index 0000000..8d881b8
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa2_0/add_n.asm
@@ -0,0 +1,107 @@
+dnl  HP-PA 2.0 32-bit mpn_add_n -- Add two limb vectors of the same length > 0
+dnl  and store sum in a third limb vector.
+
+dnl  Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	gr26
+C s1_ptr	gr25
+C s2_ptr	gr24
+C size		gr23
+
+C This runs at 2 cycles/limb on PA8000.
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	sub		%r0,%r23,%r22
+	zdep		%r22,30,3,%r28		C r28 = 2 * (-n & 7)
+	zdep		%r22,29,3,%r22		C r22 = 4 * (-n & 7)
+	sub		%r25,%r22,%r25		C offset s1_ptr
+	sub		%r24,%r22,%r24		C offset s2_ptr
+	sub		%r26,%r22,%r26		C offset res_ptr
+	blr		%r28,%r0		C branch into loop
+	add		%r0,%r0,%r0		C reset carry
+
+LDEF(loop)
+	ldw		0(%r25),%r20
+	ldw		0(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,0(%r26)
+
+LDEF(7)
+	ldw		4(%r25),%r21
+	ldw		4(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,4(%r26)
+
+LDEF(6)
+	ldw		8(%r25),%r20
+	ldw		8(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,8(%r26)
+
+LDEF(5)
+	ldw		12(%r25),%r21
+	ldw		12(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,12(%r26)
+
+LDEF(4)
+	ldw		16(%r25),%r20
+	ldw		16(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,16(%r26)
+
+LDEF(3)
+	ldw		20(%r25),%r21
+	ldw		20(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,20(%r26)
+
+LDEF(2)
+	ldw		24(%r25),%r20
+	ldw		24(%r24),%r31
+	addc		%r20,%r31,%r20
+	stw		%r20,24(%r26)
+
+LDEF(1)
+	ldw		28(%r25),%r21
+	ldo		32(%r25),%r25
+	ldw		28(%r24),%r19
+	addc		%r21,%r19,%r21
+	stw		%r21,28(%r26)
+	ldo		32(%r24),%r24
+	addib,>		-8,%r23,L(loop)
+	ldo		32(%r26),%r26
+
+	bv		(%r2)
+	addc		%r0,%r0,%r28
+EPILOGUE()
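
The prologue here is a Duff's-device entry into the 8-way unrolled loop: with
m = (-size) & 7, all three pointers are rewound by m limbs (r22 = 4*m bytes),
and since blr indexes in two-instruction (8-byte) units, r28 = 2*m makes it
skip the first m four-instruction limb groups, so the first pass handles
exactly size mod 8 limbs.  A structured C equivalent of the whole function
(names ours):

    #include <stdint.h>

    /* rp[] = s1[] + s2[]; returns the carry-out (0 or 1).  */
    uint32_t ref_add_n(uint32_t *rp, const uint32_t *s1,
                       const uint32_t *s2, long n)
    {
        uint32_t cy = 0;
        long m = n & 7;              /* partial first pass, like the blr entry */
        long i = 0;
        for (; i < m; i++) {
            uint64_t s = (uint64_t)s1[i] + s2[i] + cy;
            rp[i] = (uint32_t)s;
            cy = (uint32_t)(s >> 32);
        }
        for (; i < n; i++) {         /* the body the asm unrolls 8-way */
            uint64_t s = (uint64_t)s1[i] + s2[i] + cy;
            rp[i] = (uint32_t)s;
            cy = (uint32_t)(s >> 32);
        }
        return cy;
    }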
diff --git a/third_party/gmp/mpn/pa32/hppa2_0/gmp-mparam.h b/third_party/gmp/mpn/pa32/hppa2_0/gmp-mparam.h
new file mode 100644
index 0000000..6016274
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa2_0/gmp-mparam.h
@@ -0,0 +1,167 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 552 MHz PA8600 (gcc61.fsffrance.org) */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            3
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     28
+#define USE_PREINV_DIVREM_1                  1
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           36
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                65
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               202
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     102
+
+#define SQR_BASECASE_THRESHOLD               7
+#define SQR_TOOM2_THRESHOLD                 55
+#define SQR_TOOM3_THRESHOLD                 93
+#define SQR_TOOM4_THRESHOLD                250
+#define SQR_TOOM6_THRESHOLD                306
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    244, 5}, {      8, 4}, {     17, 5}, {     13, 6}, \
+    {      7, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     24, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 8}, \
+    {     15, 7}, {     33, 8}, {     23, 9}, {     15, 8}, \
+    {     39, 9}, {     23,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47,10}, \
+    {     31, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135, 8}, {    271, 9}, {    143,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    175, 8}, \
+    {    351,10}, {     95, 9}, {    191, 8}, {    383, 9}, \
+    {    207,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159, 9}, {    319,10}, {    175, 9}, \
+    {    351,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207, 9}, {    415,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    351, 9}, {    703, 8}, {   1407,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223, 9}, \
+    {    895,10}, {    479,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    351,10}, {    703, 9}, {   1407,12}, \
+    {    191,11}, {    415,10}, {    831,11}, {    479,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 107
+#define MUL_FFT_THRESHOLD                 2112
+
+#define SQR_FFT_MODF_THRESHOLD             240  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    240, 5}, {      8, 4}, {     17, 5}, {     19, 6}, \
+    {     17, 7}, {      9, 6}, {     20, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 9}, {     15, 8}, \
+    {     39, 9}, {     23,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     47,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 7}, {    511, 9}, {    135, 8}, {    271, 9}, \
+    {    143,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    175, 8}, {    351, 7}, {    703,10}, {     95, 9}, \
+    {    191, 8}, {    383, 9}, {    207,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159, 9}, \
+    {    319,10}, {    175, 9}, {    351, 8}, {    703,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543, 8}, {   1087,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703, 8}, {   1407,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223, 8}, \
+    {   1791,10}, {    479, 9}, {    959,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703, 9}, \
+    {   1407,12}, {    191,11}, {    415,10}, {    831,11}, \
+    {    479,10}, {    959,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 109
+#define SQR_FFT_THRESHOLD                 1600
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 116
+#define MULLO_MUL_N_THRESHOLD             3574
+
+#define DC_DIV_QR_THRESHOLD                100
+#define DC_DIVAPPR_Q_THRESHOLD             348
+#define DC_BDIV_QR_THRESHOLD               109
+#define DC_BDIV_Q_THRESHOLD                254
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               276
+#define INV_APPR_THRESHOLD                 276
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          78
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD             263
+#define MUPI_DIV_QR_THRESHOLD              102
+#define MU_BDIV_QR_THRESHOLD               807
+#define MU_BDIV_Q_THRESHOLD               1187
+
+#define MATRIX22_STRASSEN_THRESHOLD         11
+#define HGCD_THRESHOLD                     100
+#define GCD_DC_THRESHOLD                   379
+#define GCDEXT_DC_THRESHOLD                249
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                 7
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD       782
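
These figures, tuned on the 552 MHz PA8600 named above, steer GMP's run-time
algorithm selection: an n-limb multiply uses the schoolbook basecase below 18
limbs, Karatsuba (toom22) from 18, Toom-3 from 65, and so on up to FFT
multiplication at 2112 limbs.  A deliberately simplified sketch of the
dispatch pattern, assuming the defines above are in scope (the real logic
lives in GMP's mpn/generic sources and also handles unbalanced operands, FFT
sizing, etc.; the stub names are ours):

    #include <stdint.h>

    /* Hypothetical stand-ins for GMP's internal multiply routines.  */
    void mul_basecase(uint32_t *, const uint32_t *, const uint32_t *, long);
    void toom22_mul(uint32_t *, const uint32_t *, const uint32_t *, long);
    void toom33_mul(uint32_t *, const uint32_t *, const uint32_t *, long);

    void sketch_mul_n(uint32_t *rp, const uint32_t *ap,
                      const uint32_t *bp, long n)
    {
        if (n < MUL_TOOM22_THRESHOLD)       /* < 18 limbs: O(n^2) basecase */
            mul_basecase(rp, ap, bp, n);
        else if (n < MUL_TOOM33_THRESHOLD)  /* < 65 limbs: Karatsuba */
            toom22_mul(rp, ap, bp, n);
        else                                /* Toom-3 (FFT etc. omitted) */
            toom33_mul(rp, ap, bp, n);
    }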
diff --git a/third_party/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm b/third_party/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm
new file mode 100644
index 0000000..c55112f
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa2_0/sqr_diagonal.asm
@@ -0,0 +1,112 @@
+dnl  HP-PA 32-bit mpn_sqr_diagonal optimized for the PA8x00.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This code runs at 6 cycles/limb on the PA7100 and 2 cycles/limb on PA8x00.
+C The 2-way unrolling does not actually help the PA7100.
+
+C INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+
+	fldws,ma	4(up),%fr4r
+	addib,=		-1,n,L(end1)
+	ldo		4(rp),rp
+
+	fldws,ma	4(up),%fr6r
+	addib,=		-1,n,L(end2)
+	xmpyu		%fr4r,%fr4r,%fr5
+
+	fldws,ma	4(up),%fr4r
+	addib,=		-1,n,L(end3)
+	xmpyu		%fr6r,%fr6r,%fr7
+
+
+LDEF(loop)
+	fldws,ma	4(up),%fr6r
+	fstws		%fr5r,-4(rp)
+	fstws,ma	%fr5l,8(rp)
+	addib,=		-1,n,L(exite)
+	xmpyu		%fr4r,%fr4r,%fr5
+	fldws,ma	4(up),%fr4r
+	fstws		%fr7r,-4(rp)
+	fstws,ma	%fr7l,8(rp)
+	addib,<>	-1,n,L(loop)
+	xmpyu		%fr6r,%fr6r,%fr7
+
+LDEF(exito)
+	fstws		%fr5r,-4(rp)
+	fstws		%fr5l,0(rp)
+	xmpyu		%fr4r,%fr4r,%fr5
+	fstws		%fr7r,4(rp)
+	fstws		%fr7l,8(rp)
+	fstws,mb	%fr5r,12(rp)
+	bv		0(%r2)
+	fstws		%fr5l,4(rp)
+
+LDEF(exite)
+	fstws		%fr7r,-4(rp)
+	fstws		%fr7l,0(rp)
+	xmpyu		%fr6r,%fr6r,%fr7
+	fstws		%fr5r,4(rp)
+	fstws		%fr5l,8(rp)
+	fstws,mb	%fr7r,12(rp)
+	bv		0(%r2)
+	fstws		%fr7l,4(rp)
+
+LDEF(end1)
+	xmpyu		%fr4r,%fr4r,%fr5
+	fstws		%fr5r,-4(rp)
+	bv		0(%r2)
+	fstws,ma	%fr5l,8(rp)
+
+LDEF(end2)
+	xmpyu		%fr6r,%fr6r,%fr7
+	fstws		%fr5r,-4(rp)
+	fstws		%fr5l,0(rp)
+	fstws		%fr7r,4(rp)
+	bv		0(%r2)
+	fstws		%fr7l,8(rp)
+
+LDEF(end3)
+	fstws		%fr5r,-4(rp)
+	fstws		%fr5l,0(rp)
+	xmpyu		%fr4r,%fr4r,%fr5
+	fstws		%fr7r,4(rp)
+	fstws		%fr7l,8(rp)
+	fstws,mb	%fr5r,12(rp)
+	bv		0(%r2)
+	fstws		%fr5l,4(rp)
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/third_party/gmp/mpn/pa32/hppa2_0/sub_n.asm b/third_party/gmp/mpn/pa32/hppa2_0/sub_n.asm
new file mode 100644
index 0000000..47b3163
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/hppa2_0/sub_n.asm
@@ -0,0 +1,107 @@
+dnl  HP-PA 2.0 32-bit mpn_sub_n -- Subtract two limb vectors of the same
+dnl  length > 0 and store difference in a third limb vector.
+
+dnl  Copyright 1997, 1998, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	gr26
+C s1_ptr	gr25
+C s2_ptr	gr24
+C size		gr23
+
+C This runs at 2 cycles/limb on PA8000.
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	sub		%r0,%r23,%r22
+	zdep		%r22,30,3,%r28		C r28 = 2 * (-n & 7)
+	zdep		%r22,29,3,%r22		C r22 = 4 * (-n & 7)
+	sub		%r25,%r22,%r25		C offset s1_ptr
+	sub		%r24,%r22,%r24		C offset s2_ptr
+	blr		%r28,%r0		C branch into loop
+	sub		%r26,%r22,%r26		C offset res_ptr and set carry
+
+LDEF(loop)
+	ldw		0(%r25),%r20
+	ldw		0(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,0(%r26)
+
+LDEF(7)
+	ldw		4(%r25),%r21
+	ldw		4(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,4(%r26)
+
+LDEF(6)
+	ldw		8(%r25),%r20
+	ldw		8(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,8(%r26)
+
+LDEF(5)
+	ldw		12(%r25),%r21
+	ldw		12(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,12(%r26)
+
+LDEF(4)
+	ldw		16(%r25),%r20
+	ldw		16(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,16(%r26)
+
+LDEF(3)
+	ldw		20(%r25),%r21
+	ldw		20(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,20(%r26)
+
+LDEF(2)
+	ldw		24(%r25),%r20
+	ldw		24(%r24),%r31
+	subb		%r20,%r31,%r20
+	stw		%r20,24(%r26)
+
+LDEF(1)
+	ldw		28(%r25),%r21
+	ldo		32(%r25),%r25
+	ldw		28(%r24),%r19
+	subb		%r21,%r19,%r21
+	stw		%r21,28(%r26)
+	ldo		32(%r24),%r24
+	addib,>		-8,%r23,L(loop)
+	ldo		32(%r26),%r26
+
+	addc		%r0,%r0,%r28
+	bv		(%r2)
+	subi		1,%r28,%r28
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/lshift.asm b/third_party/gmp/mpn/pa32/lshift.asm
new file mode 100644
index 0000000..5ea497c
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/lshift.asm
@@ -0,0 +1,75 @@
+dnl  HP-PA  mpn_lshift -- Shift a number left.
+
+dnl  Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	gr26
+C s_ptr		gr25
+C size		gr24
+C cnt		gr23
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	sh2add		%r24,%r25,%r25
+	sh2add		%r24,%r26,%r26
+	ldws,mb		-4(0,%r25),%r22
+	subi		32,%r23,%r1
+	mtsar		%r1
+	addib,=		-1,%r24,L(0004)
+	vshd		%r0,%r22,%r28		C compute carry out limb
+	ldws,mb		-4(0,%r25),%r29
+	addib,=		-1,%r24,L(0002)
+	vshd		%r22,%r29,%r20
+
+LDEF(loop)
+	ldws,mb		-4(0,%r25),%r22
+	stws,mb		%r20,-4(0,%r26)
+	addib,=		-1,%r24,L(0003)
+	vshd		%r29,%r22,%r20
+	ldws,mb		-4(0,%r25),%r29
+	stws,mb		%r20,-4(0,%r26)
+	addib,<>	-1,%r24,L(loop)
+	vshd		%r22,%r29,%r20
+
+LDEF(0002)
+	stws,mb		%r20,-4(0,%r26)
+	vshd		%r29,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+
+LDEF(0003)
+	stws,mb		%r20,-4(0,%r26)
+
+LDEF(0004)
+	vshd		%r22,%r0,%r20
+	bv		0(%r2)
+	stw		%r20,-4(0,%r26)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/pa-defs.m4 b/third_party/gmp/mpn/pa32/pa-defs.m4
new file mode 100644
index 0000000..b26e715
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/pa-defs.m4
@@ -0,0 +1,64 @@
+divert(-1)
+
+dnl  m4 macros for HPPA assembler.
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  hppa assembler comments are introduced with ";".
+dnl
+dnl  For cooperation with cpp, apparently lines "# 123" set the line number,
+dnl  and other lines starting with a "#" are ignored.
+
+changecom(;)
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  These are the same as the basic PROLOGUE_cpu and EPILOGUE_cpu in
+dnl  mpn/asm-defs.m4, but using .proc / .procend.  These are standard and on
+dnl  an ELF system they do what .type and .size normally do.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+	`.code
+	ALIGN(8)
+	.export	`$1',entry
+`$1'LABEL_SUFFIX'
+	.proc
+	.callinfo)	dnl  This is really bogus, but allows us to compile
+			dnl  again on hppa machines.
+
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.procend')
+
+divert
diff --git a/third_party/gmp/mpn/pa32/rshift.asm b/third_party/gmp/mpn/pa32/rshift.asm
new file mode 100644
index 0000000..c5eac83
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/rshift.asm
@@ -0,0 +1,72 @@
+dnl  HP-PA  mpn_rshift -- Shift a number right.
+
+dnl  Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	gr26
+C s_ptr		gr25
+C size		gr24
+C cnt		gr23
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	ldws,ma		4(0,%r25),%r22
+	mtsar		%r23
+	addib,=		-1,%r24,L(0004)
+	vshd		%r22,%r0,%r28		C compute carry out limb
+	ldws,ma		4(0,%r25),%r29
+	addib,=		-1,%r24,L(0002)
+	vshd		%r29,%r22,%r20
+
+LDEF(loop)
+	ldws,ma		4(0,%r25),%r22
+	stws,ma		%r20,4(0,%r26)
+	addib,=		-1,%r24,L(0003)
+	vshd		%r22,%r29,%r20
+	ldws,ma		4(0,%r25),%r29
+	stws,ma		%r20,4(0,%r26)
+	addib,<>	-1,%r24,L(loop)
+	vshd		%r29,%r22,%r20
+
+LDEF(0002)
+	stws,ma		%r20,4(0,%r26)
+	vshd		%r0,%r29,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+
+LDEF(0003)
+	stws,ma		%r20,4(0,%r26)
+
+LDEF(0004)
+	vshd		%r0,%r22,%r20
+	bv		0(%r2)
+	stw		%r20,0(0,%r26)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/sub_n.asm b/third_party/gmp/mpn/pa32/sub_n.asm
new file mode 100644
index 0000000..9c71655
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/sub_n.asm
@@ -0,0 +1,64 @@
+dnl  HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 1992, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	gr26
+C s1_ptr	gr25
+C s2_ptr	gr24
+C size		gr23
+
+C One might want to unroll this as for other processors, but it turns out that
+C the data cache contention after a store makes such unrolling useless.  We
+C can't get below 5 cycles/limb anyway.
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+
+	addib,=		-1,%r23,L(end)	C check for (SIZE == 1)
+	 sub		%r20,%r19,%r28	C subtract first limbs ignoring cy
+
+LDEF(loop)
+	ldws,ma		4(0,%r25),%r20
+	ldws,ma		4(0,%r24),%r19
+	stws,ma		%r28,4(0,%r26)
+	addib,<>	-1,%r23,L(loop)
+	 subb		%r20,%r19,%r28
+
+LDEF(end)
+	stws		%r28,0(0,%r26)
+	addc		%r0,%r0,%r28
+	bv		0(%r2)
+	 subi		1,%r28,%r28
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa32/udiv.asm b/third_party/gmp/mpn/pa32/udiv.asm
new file mode 100644
index 0000000..addbf41
--- /dev/null
+++ b/third_party/gmp/mpn/pa32/udiv.asm
@@ -0,0 +1,291 @@
+dnl  HP-PA  __udiv_qrnnd division support, used from longlong.h.
+dnl  This version runs fast on pre-PA7000 CPUs.
+
+dnl  Copyright 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	gr26
+C n1		gr25
+C n0		gr24
+C d		gr23
+
+C The code size is a bit excessive.  We could merge the last two ds;addc
+C sequences by simply moving the "bb,< Odd" instruction down.  The only
+C trouble is the FFFFFFFF code that would need some hacking.
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	comb,<		%r23,0,L(largedivisor)
+	 sub		%r0,%r23,%r1		C clear cy as side-effect
+	ds		%r0,%r1,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r23,%r25
+	addc		%r24,%r24,%r28
+	ds		%r25,%r23,%r25
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r23,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r28,%r28,%r28
+
+LDEF(largedivisor)
+	extru		%r24,31,1,%r19		C r19 = n0 & 1
+	bb,<		%r23,31,L(odd)
+	 extru		%r23,30,31,%r22		C r22 = d >> 1
+	shd		%r25,%r24,1,%r24	C r24 = new n0
+	extru		%r25,30,31,%r25		C r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r22,%r25
+	sh1addl		%r25,%r19,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r24,%r24,%r28
+
+LDEF(odd)
+	addib,sv,n	1,%r22,L(FFFFFFFF)	C r22 = (d / 2 + 1)
+	shd		%r25,%r24,1,%r24	C r24 = new n0
+	extru		%r25,30,31,%r25		C r25 = new n1
+	sub		%r0,%r22,%r21
+	ds		%r0,%r21,%r0
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r24
+	ds		%r25,%r22,%r25
+	addc		%r24,%r24,%r28
+	comclr,>=	%r25,%r0,%r0
+	addl		%r25,%r22,%r25
+	sh1addl		%r25,%r19,%r25
+C We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25
+	add,nuv		%r28,%r25,%r25
+	addl		%r25,%r1,%r25
+	addc		%r0,%r28,%r28
+	sub,<<		%r25,%r23,%r0
+	addl		%r25,%r1,%r25
+	stws		%r25,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r28,%r28
+
+C This is just a special case of the code above.
+C We come here when d == 0xFFFFFFFF
+LDEF(FFFFFFFF)
+	add,uv		%r25,%r24,%r24
+	sub,<<		%r24,%r23,%r0
+	ldo		1(%r24),%r24
+	stws		%r24,0(0,%r26)
+	bv		0(%r2)
+	 addc		%r0,%r25,%r28
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa64/README b/third_party/gmp/mpn/pa64/README
new file mode 100644
index 0000000..a51ce02
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/README
@@ -0,0 +1,78 @@
+Copyright 1999, 2001, 2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+This directory contains mpn functions for 64-bit PA-RISC 2.0.
+
+PIPELINE SUMMARY
+
+The PA8x00 processors have an orthogonal 4-way out-of-order pipeline.  Each
+cycle two ALU operations and two MEM operations can issue, but just one of the
+MEM operations may be a store.  The two ALU operations can be almost any
+combination of non-memory operations.  Unlike on most other processors,
+integer and fp operations are treated as equals here; both count simply as
+ALU operations.
+
+Unfortunately, some operations cause hiccups in the pipeline.  Combining
+carry-consuming operations like ADD,DC with operations that do not set carry,
+like ADD,L, causes long delays.  Skip operations also seem to cause hiccups.
+If several ADD,DC are issued consecutively, or if a plain carry-generating
+ADD feeds ADD,DC, stalling does not occur.  We can effectively issue two
+ADD,DC operations/cycle.
+
+Latency scheduling is not as important as making sure to have a mix of ALU and
+MEM operations, but for full pipeline utilization, it is still a good idea to
+do some amount of latency scheduling.
+
+As on all other processors, read-after-write (RAW) memory scheduling is
+critically important.  Since integer multiplication takes place in the
+floating-point unit, the GMP code needs to handle this problem frequently.
+
+STATUS
+
+* mpn_lshift and mpn_rshift run at 1.5 cycles/limb on PA8000 and at 1.0
+  cycles/limb on PA8500.  With latency scheduling, the numbers could
+  probably be improved to 1.0 cycles/limb for all PA8x00 chips.
+
+* mpn_add_n and mpn_sub_n run at 2.0 cycles/limb on PA8000 and at about
+  1.6875 cycles/limb on PA8500.  With latency scheduling, this could
+  probably be improved to get close to 1.5 cycles/limb.  A problem is the
+  stalling of carry-inputting instructions after instructions that do not
+  write to carry.
+
+* mpn_mul_1, mpn_addmul_1, and mpn_submul_1 run at between 5.625 and 6.375
+  cycles/limb on PA8500 and later, and about a cycle/limb slower on older
+  chips.  The code uses ADD,DC for adjacent limbs, and relies heavily on
+  reordering.
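+
+As a quick reference for what these entry points compute, here is a
+portable C model of mpn_add_n (illustration only, not GMP library code;
+mp_limb_t and mp_size_t are the usual GMP limb and size types):
+
+  mp_limb_t
+  mpn_add_n_model (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
+                   mp_size_t n)
+  {
+    mp_limb_t cy = 0;
+    mp_size_t i;
+
+    for (i = 0; i < n; i++)
+      {
+        mp_limb_t u = up[i];
+        mp_limb_t s = u + vp[i];      /* first addition, may wrap */
+        rp[i] = s + cy;               /* then add the carry-in */
+        cy = (s < u) | (rp[i] < s);   /* carry out of either addition */
+      }
+    return cy;
+  }
+
+The assembly versions in this directory beat such a loop mainly by 8-way
+unrolling and by keeping the carry in the processor's carry flag via ADD,DC.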
+
+
+REFERENCES
+
+Hewlett Packard, "64-Bit Runtime Architecture for PA-RISC 2.0", version 3.3,
+October 1997.
diff --git a/third_party/gmp/mpn/pa64/addmul_1.asm b/third_party/gmp/mpn/pa64/addmul_1.asm
new file mode 100644
index 0000000..2cb9af9
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/addmul_1.asm
@@ -0,0 +1,693 @@
+dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl  add the result to a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		7
+C 8500,8600,8700:	6.375
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C  DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline and a
+C  lot of stack slots for partial products in flight.
+C
+C  CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C  STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C  Previous frame:
+C  [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
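+C
+C  LIMB RECURRENCE:
+C  Descriptively, in C-like terms (a model only, not assembled), each
+C  iteration computes, with p a 128-bit intermediate:
+C
+C    p     = (uint128) up[i] * vlimb
+C    t     = lo64(p) + climb          (carry out: cy1)
+C    t     = t + rp[i]                (carry out: cy2)
+C    rp[i] = t
+C    climb = hi64(p) + cy1 + cy2
+C
+C  The 64x64 product is pieced together from four XMPYU 32x32->64
+C  partial products; the two mid products (p032*) are summed into m032,
+C  then split with DEPD/EXTRD and folded into the low and high halves.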
+
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_addmul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma		%r3, 0x100(%r30)
+	std		%r4, -0xf8(%r30)
+	std		%r5, -0xf0(%r30)
+	ldo		0(%r0), climb		C clear climb
+	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+define(`r000',`%r3')	C
+
+	extrd,u		n, 63, 2, %r5
+	cmpb,=		%r5, %r0, L(BIG)
+	nop
+
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x80(%r30), p000a
+	b		L(0_one_out)
+	ldd		-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd		-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	b		L(0_two_out)
+	depd		m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd		0(up), %fr4
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+C	addib,=		-1, %r5, L(0_out)
+	depd		m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu		%fr8R, %fr4L, %fr22
+C	xmpyu		%fr8L, %fr4R, %fr23
+C	ldd		-0x78(%r30), p032a1
+C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu		%fr8R, %fr4R, %fr24
+C	xmpyu		%fr8L, %fr4L, %fr25
+C	ldd		-0x70(%r30), p032a2
+C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+C
+C	ldo		8(rp), rp
+C	add		climb, p000a, s000
+C	ldd		-0x80(%r30), p000a
+C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+C
+C	add,dc		p064a, %r0, climb
+C	ldo		8(up), up
+C	ldd		-0x68(%r30), p064a
+C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+C
+C	add		ma000, s000, s000
+C	add,dc		ma064, climb, climb
+C	fldd		0(up), %fr4
+C
+C	add		r000, s000, s000
+C	add,dc		%r0, climb, climb
+C	std		s000, -8(rp)
+C
+C	add		p032a1, p032a2, m032
+C	add,dc		%r0, %r0, m096
+C
+C	depd,z		m032, 31, 32, ma000
+C	extrd,u		m032, 31, 32, ma064
+C	ldd		0(rp), r000
+C	addib,<>	-1, %r5, L(loop0)
+C	depd		m096, 31, 32, ma064
+LDEF(0_out)
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	add		r000, s000, s000
+	add,dc		%r0, climb, climb
+	std		s000, -8(rp)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+LDEF(0_two_out)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	add		r000, s000, s000
+	add,dc		%r0, climb, climb
+	std		s000, -8(rp)
+LDEF(0_one_out)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+
+	add		climb, p000a, s000
+	add,dc		p064a, %r0, climb
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	add		r000, s000, s000
+	add,dc		%r0, climb, climb
+	std		s000, 0(rp)
+
+	cmpib,>=	4, n, L(done)
+	ldo		8(rp), rp
+
+C 4-way unrolled code.
+
+LDEF(BIG)
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+define(`p096b1',`%r20')	C
+define(`p096b2',`%r21')	C
+define(`p160c1',`%r22')	C
+define(`p160c2',`%r29')	C
+define(`p224d1',`%r31')	C
+define(`p224d2',`%r3')	C
+			C
+define(`m032',`%r4')	C
+define(`m096',`%r5')	C
+define(`m160',`%r6')	C
+define(`m224',`%r7')	C
+define(`m288',`%r8')	C
+			C
+define(`p000a',`%r1')	C
+define(`p064a',`%r19')	C
+define(`p064b',`%r20')	C
+define(`p128b',`%r21')	C
+define(`p128c',`%r22')	C
+define(`p192c',`%r29')	C
+define(`p192d',`%r31')	C
+define(`p256d',`%r3')	C
+			C
+define(`s000',`%r10')	C
+define(`s064',`%r11')	C
+define(`s128',`%r12')	C
+define(`s192',`%r13')	C
+			C
+define(`ma000',`%r9')	C
+define(`ma064',`%r4')	C
+define(`ma128',`%r5')	C
+define(`ma192',`%r6')	C
+define(`ma256',`%r7')	C
+			C
+define(`r000',`%r1')	C
+define(`r064',`%r19')	C
+define(`r128',`%r20')	C
+define(`r192',`%r21')	C
+
+	std		%r6, -0xe8(%r30)
+	std		%r7, -0xe0(%r30)
+	std		%r8, -0xd8(%r30)
+	std		%r9, -0xd0(%r30)
+	std		%r10, -0xc8(%r30)
+	std		%r11, -0xc0(%r30)
+	std		%r12, -0xb8(%r30)
+	std		%r13, -0xb0(%r30)
+
+ifdef(`HAVE_ABI_2_0w',
+`	extrd,u		n, 61, 62, n		C right shift 2
+',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
+')
+
+LDEF(4_or_more)
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,<>	-1, n, L(8_or_more)
+	xmpyu		%fr8L, %fr7L, %fr27
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	b		L(end1)
+	nop
+
+LDEF(8_or_more)
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,=		-1, n, L(end2)
+	xmpyu		%fr8L, %fr7L, %fr27
+LDEF(loop)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+
+	add,dc		ma128, s128, s128	C accum mid 2
+	fldd		0(up), %fr4
+	add,dc		ma192, s192, s192	C accum mid 3
+	fldd		8(up), %fr5
+
+	add,dc		ma256, climb, climb
+	fldd		16(up), %fr6
+	add		r000, s000, s000	C accum rlimb 0
+	fldd		24(up), %fr7
+
+	add,dc		r064, s064, s064	C accum rlimb 1
+	add,dc		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+
+	add,dc		r192, s192, s192	C accum rlimb 3
+	add,dc		%r0, climb, climb
+	std		s064, 8(rp)
+
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	std		s128, 16(rp)
+
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	std		s192, 24(rp)
+
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	xmpyu		%fr8L, %fr7L, %fr27
+
+	addib,<>	-1, n, L(loop)
+	ldo		32(rp), rp
+
+LDEF(end2)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	add		r000, s000, s000	C accum rlimb 0
+	add,dc		r064, s064, s064	C accum rlimb 1
+	add,dc		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	add,dc		r192, s192, s192	C accum rlimb 3
+	add,dc		%r0, climb, climb
+	std		s064, 8(rp)
+	ldd		-0x78(%r30), p032a1
+	std		s128, 16(rp)
+	ldd		-0x70(%r30), p032a2
+	std		s192, 24(rp)
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	ldo		32(rp), rp
+
+LDEF(end1)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	add		r000, s000, s000	C accum rlimb 0
+	add,dc		r064, s064, s064	C accum rlimb 1
+	add,dc		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	add,dc		r192, s192, s192	C accum rlimb 3
+	add,dc		%r0, climb, climb
+	std		s064, 8(rp)
+	std		s128, 16(rp)
+	std		s192, 24(rp)
+
+	ldd		-0xb0(%r30), %r13
+	ldd		-0xb8(%r30), %r12
+	ldd		-0xc0(%r30), %r11
+	ldd		-0xc8(%r30), %r10
+	ldd		-0xd0(%r30), %r9
+	ldd		-0xd8(%r30), %r8
+	ldd		-0xe0(%r30), %r7
+	ldd		-0xe8(%r30), %r6
+LDEF(done)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		climb, %r28
+',`	extrd,u		climb, 63, 32, %r29
+	extrd,u		climb, 31, 32, %r28
+')
+	ldd		-0xf0(%r30), %r5
+	ldd		-0xf8(%r30), %r4
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/pa64/aors_n.asm b/third_party/gmp/mpn/pa64/aors_n.asm
new file mode 100644
index 0000000..ab4536f
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/aors_n.asm
@@ -0,0 +1,130 @@
+dnl  HP-PA 2.0 mpn_add_n, mpn_sub_n
+
+dnl  Copyright 1997, 2000, 2002, 2003, 2009, 2010 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This runs at 2 cycles/limb on PA8000 and 1.6875 cycles/limb on PA8500.  It
+dnl  should be possible to reach the cache-bandwidth limit of 1.5 cycles/limb,
+dnl  at least on PA8500.  The problem now is stalling of the first ADD,DC after
+dnl  LDO, where the processor gets confused about where the carry comes from.
+
+include(`../config.m4')
+
+dnl INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`vp',`%r24')
+define(`n',`%r23')
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBC,	      `add,dc')
+	define(INITCY,	      `addi -1,%r22,%r0')
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBC,	      `sub,db')
+	define(INITCY,	      `subi 0,%r22,%r0')
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ifdef(`HAVE_ABI_2_0w',
+`       .level  2.0w
+',`     .level  2.0
+')
+PROLOGUE(func_nc)
+ifdef(`HAVE_ABI_2_0w',
+`	b		L(com)
+	nop
+',`	b		L(com)
+	ldw		-52(%r30), %r22
+')
+EPILOGUE()
+PROLOGUE(func)
+	ldi		0, %r22
+LDEF(com)
+	sub		%r0, n, %r21
+	depw,z		%r21, 30, 3, %r28	C r28 = 2 * (-n & 7)
+	depw,z		%r21, 28, 3, %r21	C r21 = 8 * (-n & 7)
+	sub		up, %r21, up		C offset up
+	sub		vp, %r21, vp		C offset vp
+	sub		rp, %r21, rp		C offset rp
+	blr		%r28, %r0		C branch into loop
+	INITCY
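+
+C  BLR branches %r28 << 3 bytes into the loop, i.e. 16 bytes (one
+C  ldd/ldd/ADCSBC/std group) per unit of (-n mod 8), so the first pass
+C  through the 8-way unrolled loop handles exactly n mod 8 limbs (or 8
+C  when n is a multiple of 8).  INITCY in the delay slot seeds the
+C  carry/borrow flag from the incoming %r22 argument.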
+
+LDEF(loop)
+	ldd		0(up), %r20
+	ldd		0(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 0(rp)
+LDEF(7)	ldd		8(up), %r21
+	ldd		8(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	std		%r21, 8(rp)
+LDEF(6)	ldd		16(up), %r20
+	ldd		16(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 16(rp)
+LDEF(5)	ldd		24(up), %r21
+	ldd		24(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	std		%r21, 24(rp)
+LDEF(4)	ldd		32(up), %r20
+	ldd		32(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 32(rp)
+LDEF(3)	ldd		40(up), %r21
+	ldd		40(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	std		%r21, 40(rp)
+LDEF(2)	ldd		48(up), %r20
+	ldd		48(vp), %r31
+	ADCSBC		%r20, %r31, %r20
+	std		%r20, 48(rp)
+LDEF(1)	ldd		56(up), %r21
+	ldd		56(vp), %r19
+	ADCSBC		%r21, %r19, %r21
+	ldo		64(up), up
+	std		%r21, 56(rp)
+	ldo		64(vp), vp
+	addib,>		-8, n, L(loop)
+	ldo		64(rp), rp
+
+	add,dc		%r0, %r0, %r29
+ifdef(`OPERATION_sub_n',`
+	subi		1, %r29, %r29
+')
+	bve		(%r2)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		%r29, %r28
+',`	ldi		0, %r28
+')
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa64/aorslsh1_n.asm b/third_party/gmp/mpn/pa64/aorslsh1_n.asm
new file mode 100644
index 0000000..2a55dde
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/aorslsh1_n.asm
@@ -0,0 +1,228 @@
+dnl  PA64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
+
+dnl  Copyright 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		2
+C 8500,8600,8700:	1.75
+
+C TODO
+C  * Write special feed-in code for each (n mod 8). (See the ia64 code.)
+C  * Try to make this run at closer to 1.5 c/l.
+C  * Set up register aliases (define(`u0',`%r19')).
+C  * Explicitly align loop.
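+
+C In C terms, the addlsh1 case computes roughly the following (a
+C descriptive model only, not assembled; sublsh1 is the same with
+C borrow in place of carry):
+C
+C	cy = 0;  prev = 0;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    sh = (vp[i] << 1) | (prev >> 63);
+C	    prev = vp[i];
+C	    rp[i] = up[i] + sh + cy;
+C	    cy = carry out of that addition;
+C	  }
+C	return (prev >> 63) + cy;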
+
+dnl INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`vp',`%r24')
+define(`n',`%r23')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADCSBC,	`add,dc')
+  define(INITC,		`ldi	0,')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADCSBC,	`sub,db')
+  define(INITC,		`ldi	1,')
+  define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ifdef(`HAVE_ABI_2_0w',`
+  define(LEVEL,		`.level 2.0w')
+  define(RETREG,	`%r28')
+  define(CLRRET1,	`dnl')
+')
+ifdef(`HAVE_ABI_2_0n',`
+  define(LEVEL,		`.level 2.0')
+  define(RETREG,	`%r29')
+  define(CLRRET1,	`ldi	0, %r28')
+')
+
+	LEVEL
+PROLOGUE(func)
+	std,ma		%r3, 0x100(%r30)	C save reg
+
+	INITC		%r1			C init saved cy
+
+C Primitive code for the first (n mod 8) limbs:
+	extrd,u		n, 63, 3, %r22		C count for loop0
+	comib,=		0, %r22, L(unrolled)	C skip loop0?
+	copy		%r0, %r28
+LDEF(loop0)
+	ldd	0(vp), %r21
+	ldo		8(vp), vp
+	ldd	0(up), %r19
+	ldo		8(up), up
+	shrpd	%r21, %r28, 63, %r31
+	addi		-1, %r1, %r0		C restore cy
+	ADCSBC	%r19, %r31, %r29
+	std	%r29, 0(rp)
+	add,dc		%r0, %r0, %r1		C save cy
+	copy	%r21, %r28
+	addib,>		-1, %r22, L(loop0)
+	ldo		8(rp), rp
+
+	addib,>=	-8, n, L(unrolled)
+	addi		-1, %r1, %r0		C restore cy
+
+	shrpd	%r0, %r28, 63, %r28
+	ADCSBC	%r0, %r28, RETREG
+ifdef(`OPERATION_sublsh1_n',
+`	sub	%r0, RETREG, RETREG')
+	CLRRET1
+
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+
+
+LDEF(unrolled)
+	std		%r4, -0xf8(%r30)	C save reg
+	ldd	0(vp), %r4
+	std		%r5, -0xf0(%r30)	C save reg
+	ldd	8(vp), %r5
+	std		%r6, -0xe8(%r30)	C save reg
+	ldd	16(vp), %r6
+	std		%r7, -0xe0(%r30)	C save reg
+
+	ldd	24(vp), %r7
+	shrpd	%r4, %r28, 63, %r31
+	std		%r8, -0xd8(%r30)	C save reg
+	ldd	32(vp), %r8
+	shrpd	%r5, %r4, 63, %r4
+	std		%r9, -0xd0(%r30)	C save reg
+	ldd	40(vp), %r9
+	shrpd	%r6, %r5, 63, %r5
+	ldd	48(vp), %r3
+	shrpd	%r7, %r6, 63, %r6
+	ldd	56(vp), %r28
+	shrpd	%r8, %r7, 63, %r7
+	ldd	0(up), %r19
+	shrpd	%r9, %r8, 63, %r8
+	ldd	8(up), %r20
+	shrpd	%r3, %r9, 63, %r9
+	ldd	16(up), %r21
+	shrpd	%r28, %r3, 63, %r3
+	ldd	24(up), %r22
+
+	nop					C alignment FIXME
+	addib,<=	-8, n, L(end)
+	addi		-1, %r1, %r0		C restore cy
+LDEF(loop)
+	ADCSBC	%r19, %r31, %r29
+	ldd	32(up), %r19
+	std	%r29, 0(rp)
+	ADCSBC	%r20, %r4, %r29
+	ldd	40(up), %r20
+	std	%r29, 8(rp)
+	ADCSBC	%r21, %r5, %r29
+	ldd	48(up), %r21
+	std	%r29, 16(rp)
+	ADCSBC	%r22, %r6, %r29
+	ldd	56(up), %r22
+	std	%r29, 24(rp)
+	ADCSBC	%r19, %r7, %r29
+	ldd	64(vp), %r4
+	std	%r29, 32(rp)
+	ADCSBC	%r20, %r8, %r29
+	ldd	72(vp), %r5
+	std	%r29, 40(rp)
+	ADCSBC	%r21, %r9, %r29
+	ldd	80(vp), %r6
+	std	%r29, 48(rp)
+	ADCSBC	%r22, %r3, %r29
+	std	%r29, 56(rp)
+
+	add,dc		%r0, %r0, %r1		C save cy
+
+	ldd	88(vp), %r7
+	shrpd	%r4, %r28, 63, %r31
+	ldd	96(vp), %r8
+	shrpd	%r5, %r4, 63, %r4
+	ldd	104(vp), %r9
+	shrpd	%r6, %r5, 63, %r5
+	ldd	112(vp), %r3
+	shrpd	%r7, %r6, 63, %r6
+	ldd	120(vp), %r28
+	shrpd	%r8, %r7, 63, %r7
+	ldd	64(up), %r19
+	shrpd	%r9, %r8, 63, %r8
+	ldd	72(up), %r20
+	shrpd	%r3, %r9, 63, %r9
+	ldd	80(up), %r21
+	shrpd	%r28, %r3, 63, %r3
+	ldd	88(up), %r22
+
+	ldo		64(vp), vp
+	ldo		64(rp), rp
+	ldo		64(up), up
+	addib,>		-8, n, L(loop)
+	addi		-1, %r1, %r0		C restore cy
+LDEF(end)
+	ADCSBC	%r19, %r31, %r29
+	ldd	32(up), %r19
+	std	%r29, 0(rp)
+	ADCSBC	%r20, %r4, %r29
+	ldd	40(up), %r20
+	std	%r29, 8(rp)
+	ADCSBC	%r21, %r5, %r29
+	ldd	48(up), %r21
+	std	%r29, 16(rp)
+	ADCSBC	%r22, %r6, %r29
+	ldd	56(up), %r22
+	std	%r29, 24(rp)
+	ADCSBC	%r19, %r7, %r29
+	ldd		-0xf8(%r30), %r4	C restore reg
+	std	%r29, 32(rp)
+	ADCSBC	%r20, %r8, %r29
+	ldd		-0xf0(%r30), %r5	C restore reg
+	std	%r29, 40(rp)
+	ADCSBC	%r21, %r9, %r29
+	ldd		-0xe8(%r30), %r6	C restore reg
+	std	%r29, 48(rp)
+	ADCSBC	%r22, %r3, %r29
+	ldd		-0xe0(%r30), %r7	C restore reg
+	std	%r29, 56(rp)
+
+	shrpd	%r0, %r28, 63, %r28
+	ldd		-0xd8(%r30), %r8	C restore reg
+	ADCSBC	%r0, %r28, RETREG
+ifdef(`OPERATION_sublsh1_n',
+`	sub	%r0, RETREG, RETREG')
+	CLRRET1
+
+	ldd		-0xd0(%r30), %r9	C restore reg
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3	C restore reg
+EPILOGUE()
diff --git a/third_party/gmp/mpn/pa64/gmp-mparam.h b/third_party/gmp/mpn/pa64/gmp-mparam.h
new file mode 100644
index 0000000..c2719c3
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/gmp-mparam.h
@@ -0,0 +1,247 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 440MHz PA8200 */
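+
+/* Each threshold below is, roughly, the operand size (in limbs unless the
+   name suggests otherwise) at which GMP switches from the simpler algorithm
+   to the named one on this CPU; 0 means "always" and MP_SIZE_T_MAX means
+   "never", as the inline comments note.  */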
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD              21
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                31
+#define MUL_TOOM33_THRESHOLD               114
+#define MUL_TOOM44_THRESHOLD               179
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               296
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     130
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     229
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     129
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      54
+
+#define SQR_BASECASE_THRESHOLD               5
+#define SQR_TOOM2_THRESHOLD                 58
+#define SQR_TOOM3_THRESHOLD                153
+#define SQR_TOOM4_THRESHOLD                278
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             56
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  2,23,228,1084
+
+#define MUL_FFT_MODF_THRESHOLD             336  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    336, 5}, {     11, 4}, {     23, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     19, 7}, {     39, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47, 9}, {     95,10}, {     55,11}, {     31,10}, \
+    {     63, 9}, {    127,10}, {     71, 8}, {    287,10}, \
+    {     79,11}, {     47,10}, {     95, 9}, {    191, 8}, \
+    {    383, 7}, {    767,10}, {    103, 9}, {    207, 8}, \
+    {    415, 7}, {    831,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    543, 7}, {   1087, 6}, \
+    {   2175,10}, {    143, 9}, {    287, 8}, {    575,11}, \
+    {     79, 9}, {    319, 8}, {    639, 7}, {   1279, 9}, \
+    {    335, 8}, {    671,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831, 7}, \
+    {   1663,11}, {    111,10}, {    223, 9}, {    447, 8}, \
+    {    895,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    543, 8}, {   1087, 7}, {   2175,10}, {    287, 9}, \
+    {    575, 8}, {   1215, 7}, {   2431,10}, {    319, 9}, \
+    {    639, 8}, {   1279,10}, {    335, 9}, {    671, 8}, \
+    {   1343, 9}, {    703, 8}, {   1407,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207, 9}, {    831, 8}, \
+    {   1663,11}, {    223,10}, {    447, 9}, {    959,13}, \
+    {     63,12}, {    127,11}, {    255, 8}, {   2047,11}, \
+    {    271,10}, {    543, 9}, {   1087, 8}, {   2175,11}, \
+    {    287,10}, {    575, 9}, {   1215, 8}, {   2431,11}, \
+    {    319,10}, {    671, 9}, {   1343, 8}, {   2687,11}, \
+    {    351,10}, {    703, 9}, {   1471, 8}, {   2943,12}, \
+    {    191,11}, {    383, 8}, {   3071,11}, {    415,10}, \
+    {    831, 9}, {   1663,11}, {    479,10}, {    959, 9}, \
+    {   1919, 8}, {   3839,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087, 9}, {   2175,12}, {    287,11}, \
+    {    607,10}, {   1215, 9}, {   2431, 8}, {   4863,12}, \
+    {    319,11}, {    671,10}, {   1343,13}, {    191, 9}, \
+    {   3071,12}, {    415,11}, {    831,10}, {   1663, 8}, \
+    {   6655, 9}, {   3455,12}, {    447, 9}, {   3583,13}, \
+    {    255,12}, {    511,11}, {   1023,10}, {   2175,13}, \
+    {    319,11}, {   1279,12}, {    671,10}, {   2815,12}, \
+    {    735,10}, {   2943, 9}, {   5887,13}, {    383,12}, \
+    {    767,11}, {   1535,10}, {   3071,13}, {    447,10}, \
+    {   3583,12}, {    959,13}, {    511,12}, {   1087,13}, \
+    {    639,12}, {   1343,13}, {    767,11}, {   3071,13}, \
+    {    831,12}, {   1663,11}, {   3455,10}, {   6911,13}, \
+    {    895,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2303,13}, {   1215,12}, {   2431,14}, \
+    {    639,13}, {   1279,12}, {   2559,13}, {   1343,12}, \
+    {   2687,11}, {   5375,13}, {   1407,12}, {   2815,11}, \
+    {   5631,12}, {   2943,13}, {   1535,12}, {   3199,13}, \
+    {   1663,12}, {   3327,13}, {   1727,14}, {    895,13}, \
+    {   1791,12}, {   3583,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2047,12}, {   4095,14}, {   1151,13}, \
+    {   2431,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,12}, {   5631,15}, {    767,14}, {   1535,13}, \
+    {   3071,14}, {   1663,13}, {   3327,14}, {   1791,13}, \
+    {   3583,14}, {   1919,15}, {   1023,14}, {   2303,13}, \
+    {   4607,14}, {   2431,13}, {   4863,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 252
+#define MUL_FFT_THRESHOLD                 2368
+
+#define SQR_FFT_MODF_THRESHOLD             284  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    284, 5}, {      9, 4}, {     21, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     25, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     63, 8}, {    255, 7}, {    511,10}, \
+    {     71, 8}, {    287, 7}, {    575,10}, {     79,11}, \
+    {     47,10}, {     95, 9}, {    191, 8}, {    383, 7}, \
+    {    767,10}, {    103, 9}, {    207, 8}, {    415,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    543, 7}, {   1087, 8}, {    575, 7}, {   1151,11}, \
+    {     79, 8}, {    639, 7}, {   1279, 9}, {    335, 8}, \
+    {    671, 7}, {   1343,10}, {    175, 8}, {    703, 7}, \
+    {   1407,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831, 7}, \
+    {   1663, 9}, {    447, 8}, {    895,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    543, 8}, {   1087, 7}, \
+    {   2175, 9}, {    575, 8}, {   1151,10}, {    303, 9}, \
+    {    607, 8}, {   1215, 7}, {   2431,10}, {    319, 9}, \
+    {    639, 8}, {   1279, 9}, {    671, 8}, {   1343, 7}, \
+    {   2687,10}, {    351, 9}, {    703, 8}, {   1407,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831, 8}, {   1663,11}, \
+    {    223,10}, {    447, 9}, {    895,13}, {     63,11}, \
+    {    255,10}, {    543, 8}, {   2175,11}, {    287,10}, \
+    {    575, 9}, {   1151,10}, {    607, 9}, {   1215, 8}, \
+    {   2431,11}, {    319, 9}, {   1279,10}, {    671, 9}, \
+    {   1343, 8}, {   2687,11}, {    351,10}, {    703, 9}, \
+    {   1407,10}, {    735,12}, {    191,11}, {    383,10}, \
+    {    831, 9}, {   1663,12}, {    223,11}, {    447,10}, \
+    {    895,11}, {    479, 9}, {   1919, 8}, {   3839,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087, 9}, {   2175,12}, {    287,11}, {    575,10}, \
+    {   1151,11}, {    607,10}, {   1215, 9}, {   2431, 8}, \
+    {   4863,10}, {   1279,11}, {    671,10}, {   1343, 9}, \
+    {   2687,12}, {    351,11}, {    703,10}, {   1407,11}, \
+    {    735,13}, {    191, 9}, {   3071, 7}, {  12287,11}, \
+    {    799,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447, 8}, {   7167,12}, {    479, 9}, {   3839,14}, \
+    {    127,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    543,10}, {   2175, 9}, {   4607,11}, {   1215,10}, \
+    {   2431,11}, {   1279,10}, {   2559,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    799,10}, {   3199, 9}, \
+    {   6399,12}, {    895,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,13}, {    575,12}, {   1151,10}, \
+    {   4607,13}, {    639,12}, {   1279,11}, {   2687,14}, \
+    {    383,13}, {    767,11}, {   3071,12}, {   1599,13}, \
+    {    895,12}, {   1791,11}, {   3583,13}, {    959,15}, \
+    {    255,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1279,12}, {   2559,13}, {   1343,12}, {   2687,13}, \
+    {   1471,11}, {   5887,14}, {    767,13}, {   1535,12}, \
+    {   3071,13}, {   1599,12}, {   3199,13}, {   1663,12}, \
+    {   3327,13}, {   1727,14}, {    895,13}, {   1791,12}, \
+    {   3583,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,12}, {   4607,13}, {   2431,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2815,15}, {    767,13}, \
+    {   3199,14}, {   1663,13}, {   3327,14}, {   1791,13}, \
+    {   3583,14}, {   1919,15}, {   1023,14}, {   2047,13}, \
+    {   4095,14}, {   2303,13}, {   4607,14}, {   2431,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 257
+#define SQR_FFT_THRESHOLD                 1856
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 113
+#define MULLO_MUL_N_THRESHOLD             4658
+
+#define DC_DIV_QR_THRESHOLD                123
+#define DC_DIVAPPR_Q_THRESHOLD             372
+#define DC_BDIV_QR_THRESHOLD               142
+#define DC_BDIV_Q_THRESHOLD                312
+
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD               315
+#define INV_APPR_THRESHOLD                 315
+
+#define BINV_NEWTON_THRESHOLD              360
+#define REDC_1_TO_REDC_N_THRESHOLD         101
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               93
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD               1187
+
+#define MATRIX22_STRASSEN_THRESHOLD          9
+#define HGCD_THRESHOLD                     234
+#define HGCD_APPR_THRESHOLD                300
+#define HGCD_REDUCE_THRESHOLD             1553
+#define GCD_DC_THRESHOLD                   684
+#define GCDEXT_DC_THRESHOLD                525
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                21
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD              1951
+#define SET_STR_PRECOMPUTE_THRESHOLD      4034
diff --git a/third_party/gmp/mpn/pa64/lshift.asm b/third_party/gmp/mpn/pa64/lshift.asm
new file mode 100644
index 0000000..c0fc292
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/lshift.asm
@@ -0,0 +1,114 @@
+dnl  HP-PA 2.0 mpn_lshift -- Left shift.
+
+dnl  Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+define(`cnt',`%r23')
+
+ifdef(`HAVE_ABI_2_0w',
+`       .level  2.0w
+',`     .level  2.0
+')
+PROLOGUE(mpn_lshift)
+	shladd		n, 3, up, up
+	shladd		n, 3, rp, rp
+	subi		64, cnt, cnt
+	mtsar		cnt
+	ldd		-8(up), %r21
+	addib,=		-1, n, L(end)
+	shrpd		%r0, %r21, %sar, %r29	C compute carry out limb
+	depw,z		n, 31, 3, %r28		C r28 = (size & 7)
+	sub		%r0, n, %r22
+	depw,z		%r22, 28, 3, %r22	C r22 = 8 * (-size & 7)
+	add		up, %r22, up		C offset up
+	blr		%r28, %r0		C branch into jump table
+	add		rp, %r22, rp		C offset rp
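+
+C  Each jump-table entry below is an 8-byte B/COPY (or B/NOP) pair, so
+C  BLR, which scales %r28 by 8, dispatches on the remaining limb count
+C  mod 8 to the matching entry point of the software-pipelined loop.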
+	b		L(0)
+	nop
+	b		L(1)
+	copy		%r21, %r20
+	b		L(2)
+	nop
+	b		L(3)
+	copy		%r21, %r20
+	b		L(4)
+	nop
+	b		L(5)
+	copy		%r21, %r20
+	b		L(6)
+	nop
+	b		L(7)
+	copy		%r21, %r20
+
+LDEF(loop)
+LDEF(0)	ldd		-16(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -8(rp)
+LDEF(7)	ldd		-24(up), %r21
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -16(rp)
+LDEF(6)	ldd		-32(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -24(rp)
+LDEF(5)	ldd		-40(up), %r21
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -32(rp)
+LDEF(4)	ldd		-48(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -40(rp)
+LDEF(3)	ldd		-56(up), %r21
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -48(rp)
+LDEF(2)	ldd		-64(up), %r20
+	shrpd		%r21, %r20, %sar, %r21
+	std		%r21, -56(rp)
+LDEF(1)	ldd		-72(up), %r21
+	ldo		-64(up), up
+	shrpd		%r20, %r21, %sar, %r20
+	std		%r20, -64(rp)
+	addib,>		-8, n, L(loop)
+	ldo		-64(rp), rp
+
+LDEF(end)
+	shrpd		%r21, %r0, %sar, %r21
+	std		%r21, -8(rp)
+	bve		(%r2)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		%r29,%r28
+',`	extrd,u		%r29, 31, 32, %r28
+')
+EPILOGUE(mpn_lshift)
diff --git a/third_party/gmp/mpn/pa64/mul_1.asm b/third_party/gmp/mpn/pa64/mul_1.asm
new file mode 100644
index 0000000..6935c23
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/mul_1.asm
@@ -0,0 +1,646 @@
+dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		6.5
+C 8500,8600,8700:	5.625
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C  DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow
+C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
+C  registers to the IU registers have demanded a deep software pipeline and a
+C  lot of stack slots for partial products in flight.
+C
+C  CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C  STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C  Previous frame:
+C  [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
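+C
+C  LIMB RECURRENCE:
+C  Descriptively, in C-like terms (a model only, not assembled), each
+C  iteration computes, with p a 128-bit intermediate:
+C
+C    p     = (uint128) up[i] * vlimb
+C    rp[i] = lo64(p) + climb          (carry out: cy)
+C    climb = hi64(p) + cy
+C
+C  As in addmul_1, the 64x64 product is pieced together from four XMPYU
+C  32x32->64 partial products.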
+
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_mul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma		%r3, 0x100(%r30)
+	std		%r4, -0xf8(%r30)
+	std		%r5, -0xf0(%r30)
+	ldo		0(%r0), climb		C clear climb
+	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
+
+	extrd,u		n, 63, 2, %r5
+	cmpb,=		%r5, %r0, L(BIG)
+	nop
+
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x80(%r30), p000a
+	b		L(0_one_out)
+	ldd		-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd		-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	b		L(0_two_out)
+	depd		m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd		0(up), %fr4
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+C	addib,=		-1, %r5, L(0_out)
+	depd		m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu		%fr8R, %fr4L, %fr22
+C	xmpyu		%fr8L, %fr4R, %fr23
+C	ldd		-0x78(%r30), p032a1
+C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu		%fr8R, %fr4R, %fr24
+C	xmpyu		%fr8L, %fr4L, %fr25
+C	ldd		-0x70(%r30), p032a2
+C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+C
+C	ldo		8(rp), rp
+C	add		climb, p000a, s000
+C	ldd		-0x80(%r30), p000a
+C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+C
+C	add,dc		p064a, %r0, climb
+C	ldo		8(up), up
+C	ldd		-0x68(%r30), p064a
+C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+C
+C	add		ma000, s000, s000
+C	add,dc		ma064, climb, climb
+C	fldd		0(up), %fr4
+C
+C	std		s000, -8(rp)
+C
+C	add		p032a1, p032a2, m032
+C	add,dc		%r0, %r0, m096
+C
+C	depd,z		m032, 31, 32, ma000
+C	extrd,u		m032, 31, 32, ma064
+C	addib,<>	-1, %r5, L(loop0)
+C	depd		m096, 31, 32, ma064
+LDEF(0_out)
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	std		s000, -8(rp)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+LDEF(0_two_out)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	std		s000, -8(rp)
+LDEF(0_one_out)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+
+	add		climb, p000a, s000
+	add,dc		p064a, %r0, climb
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	std		s000, 0(rp)
+
+	cmpib,>=	4, n, L(done)
+	ldo		8(rp), rp
+
+C 4-way unrolled code.
+
+LDEF(BIG)
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+define(`p096b1',`%r20')	C
+define(`p096b2',`%r21')	C
+define(`p160c1',`%r22')	C
+define(`p160c2',`%r29')	C
+define(`p224d1',`%r31')	C
+define(`p224d2',`%r3')	C
+			C
+define(`m032',`%r4')	C
+define(`m096',`%r5')	C
+define(`m160',`%r6')	C
+define(`m224',`%r7')	C
+define(`m288',`%r8')	C
+			C
+define(`p000a',`%r1')	C
+define(`p064a',`%r19')	C
+define(`p064b',`%r20')	C
+define(`p128b',`%r21')	C
+define(`p128c',`%r22')	C
+define(`p192c',`%r29')	C
+define(`p192d',`%r31')	C
+define(`p256d',`%r3')	C
+			C
+define(`s000',`%r10')	C
+define(`s064',`%r11')	C
+define(`s128',`%r12')	C
+define(`s192',`%r13')	C
+			C
+define(`ma000',`%r9')	C
+define(`ma064',`%r4')	C
+define(`ma128',`%r5')	C
+define(`ma192',`%r6')	C
+define(`ma256',`%r7')	C
+
+	std		%r6, -0xe8(%r30)
+	std		%r7, -0xe0(%r30)
+	std		%r8, -0xd8(%r30)
+	std		%r9, -0xd0(%r30)
+	std		%r10, -0xc8(%r30)
+	std		%r11, -0xc0(%r30)
+	std		%r12, -0xb8(%r30)
+	std		%r13, -0xb0(%r30)
+
+ifdef(`HAVE_ABI_2_0w',
+`	extrd,u		n, 61, 62, n		C right shift 2
+',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
+')
+
+LDEF(4_or_more)
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,<>	-1, n, L(8_or_more)
+	xmpyu		%fr8L, %fr7L, %fr27
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	b		L(end1)
+	nop
+
+LDEF(8_or_more)
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,=		-1, n, L(end2)
+	xmpyu		%fr8L, %fr7L, %fr27
+LDEF(loop)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+
+	add,dc		p064a, p064b, s064
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+
+	add,dc		p192c, p192d, s192
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+
+	add		ma000, s000, s000	C accum mid 0
+	fldd		0(up), %fr4
+	add,dc		ma064, s064, s064	C accum mid 1
+	std		s000, 0(rp)
+
+	add,dc		ma128, s128, s128	C accum mid 2
+	fldd		8(up), %fr5
+	add,dc		ma192, s192, s192	C accum mid 3
+	std		s064, 8(rp)
+
+	add,dc		ma256, climb, climb
+	fldd		16(up), %fr6
+	std		s128, 16(rp)
+
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	fldd		24(up), %fr7
+
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	std		s192, 24(rp)
+
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	xmpyu		%fr8L, %fr7L, %fr27
+
+	addib,<>	-1, n, L(loop)
+	ldo		32(rp), rp
+
+LDEF(end2)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	add,dc		p064a, p064b, s064
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	add,dc		p192c, p192d, s192
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	add		ma000, s000, s000	C accum mid 0
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	std		s000, 0(rp)
+	std		s064, 8(rp)
+	ldd		-0x78(%r30), p032a1
+	std		s128, 16(rp)
+	ldd		-0x70(%r30), p032a2
+	std		s192, 24(rp)
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	ldo		32(rp), rp
+
+LDEF(end1)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	add,dc		p064a, p064b, s064
+	add,dc		p128b, p128c, s128
+	add,dc		p192c, p192d, s192
+	add,dc		p256d, %r0, climb
+	add		ma000, s000, s000	C accum mid 0
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	std		s000, 0(rp)
+	std		s064, 8(rp)
+	std		s128, 16(rp)
+	std		s192, 24(rp)
+
+	ldd		-0xb0(%r30), %r13
+	ldd		-0xb8(%r30), %r12
+	ldd		-0xc0(%r30), %r11
+	ldd		-0xc8(%r30), %r10
+	ldd		-0xd0(%r30), %r9
+	ldd		-0xd8(%r30), %r8
+	ldd		-0xe0(%r30), %r7
+	ldd		-0xe8(%r30), %r6
+LDEF(done)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		climb, %r28
+',`	extrd,u		climb, 63, 32, %r29
+	extrd,u		climb, 31, 32, %r28
+')
+	ldd		-0xf0(%r30), %r5
+	ldd		-0xf8(%r30), %r4
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+EPILOGUE(mpn_mul_1)
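
A reading aid for the pa64 files in this import: below is a minimal C sketch of the contract mpn_mul_1 implements, assuming 64-bit limbs and a compiler-provided unsigned __int128. It is illustrative only; the asm above instead builds each 64x64 product from four 32x32 xmpyu half-products staged through the stack slots annotated in its comments.

    #include <stdint.h>

    /* Sketch: rp[] = up[] * v, returning the final carry limb.  "climb"
       matches the register name the asm threads between iterations. */
    uint64_t mul_1_ref(uint64_t *rp, const uint64_t *up, long n, uint64_t v)
    {
      uint64_t climb = 0;
      for (long i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * v + climb;
        rp[i] = (uint64_t)p;            /* low 64 bits go to the result */
        climb = (uint64_t)(p >> 64);    /* high 64 bits carry onward    */
      }
      return climb;
    }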
diff --git a/third_party/gmp/mpn/pa64/rshift.asm b/third_party/gmp/mpn/pa64/rshift.asm
new file mode 100644
index 0000000..cfc242e
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/rshift.asm
@@ -0,0 +1,111 @@
+dnl  HP-PA 2.0 mpn_rshift -- Right shift.
+
+dnl  Copyright 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This runs at 1.5 cycles/limb on PA8000 and 1.0 cycles/limb on PA8500.
+
+include(`../config.m4')
+
+dnl  INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+define(`cnt',`%r23')
+
+ifdef(`HAVE_ABI_2_0w',
+`       .level  2.0w
+',`     .level  2.0
+')
+PROLOGUE(mpn_rshift)
+	mtsar		cnt
+	ldd		0(up), %r21
+	addib,=		-1, n, L(end)
+	shrpd		%r21, %r0, %sar, %r29	C compute carry out limb
+	depw,z		n, 31, 3, %r28		C r28 = (size & 7)
+	sub		%r0, n, %r22
+	depw,z		%r22, 28, 3, %r22	C r22 = 8 * (-size & 7)
+	sub		up, %r22, up		C offset up
+	blr		%r28, %r0		C branch into jump table
+	sub		rp, %r22, rp		C offset rp
+	b		L(0)
+	nop
+	b		L(1)
+	copy		%r21, %r20
+	b		L(2)
+	nop
+	b		L(3)
+	copy		%r21, %r20
+	b		L(4)
+	nop
+	b		L(5)
+	copy		%r21, %r20
+	b		L(6)
+	nop
+	b		L(7)
+	copy		%r21, %r20
+
+LDEF(loop)
+LDEF(0)	ldd		8(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 0(rp)
+LDEF(7)	ldd		16(up), %r21
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 8(rp)
+LDEF(6)	ldd		24(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 16(rp)
+LDEF(5)	ldd		32(up), %r21
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 24(rp)
+LDEF(4)	ldd		40(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 32(rp)
+LDEF(3)	ldd		48(up), %r21
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 40(rp)
+LDEF(2)	ldd		56(up), %r20
+	shrpd		%r20, %r21, %sar, %r21
+	std		%r21, 48(rp)
+LDEF(1)	ldd		64(up), %r21
+	ldo		64(up), up
+	shrpd		%r21, %r20, %sar, %r20
+	std		%r20, 56(rp)
+	addib,>		-8, n, L(loop)
+	ldo		64(rp), rp
+
+LDEF(end)
+	shrpd		%r0, %r21, %sar, %r21
+	std		%r21, 0(rp)
+	bve		(%r2)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		%r29,%r28
+',`	extrd,u		%r29, 31, 32, %r28
+')
+EPILOGUE(mpn_rshift)
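
A hedged C rendering of what the blr jump table and 8-way unrolled loop above compute (names mine; cnt is assumed to lie in 1..63, per GMP's shift conventions):

    #include <stdint.h>

    /* Sketch: rp[] = up[] >> cnt across n 64-bit limbs; the bits shifted
       out of limb 0 are returned as the carry-out limb, mirroring the
       first shrpd above. */
    uint64_t rshift_ref(uint64_t *rp, const uint64_t *up, long n, unsigned cnt)
    {
      uint64_t retval = up[0] << (64 - cnt);      /* carry-out limb */
      for (long i = 0; i < n - 1; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
      rp[n - 1] = up[n - 1] >> cnt;
      return retval;
    }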
diff --git a/third_party/gmp/mpn/pa64/sqr_diagonal.asm b/third_party/gmp/mpn/pa64/sqr_diagonal.asm
new file mode 100644
index 0000000..f6fadc9
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/sqr_diagonal.asm
@@ -0,0 +1,191 @@
+dnl  HP-PA 2.0 64-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on
+dnl  PA8500.  The cache would saturate at 5 cycles/limb, so there is some room
+dnl  for optimization.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',`%r26')
+define(`up',`%r25')
+define(`n',`%r24')
+
+define(`p00',`%r28')
+define(`p32',`%r29')
+define(`p64',`%r31')
+define(`t0',`%r19')
+define(`t1',`%r20')
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_sqr_diagonal)
+	ldo		128(%r30),%r30
+
+	fldds,ma	8(up),%fr8
+	addib,=		-1,n,L(end1)
+	nop
+	fldds,ma	8(up),%fr4
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-120(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	addib,=		-1,n,L(end2)
+	ldo		16(rp),rp
+
+LDEF(loop)
+	fldds,ma	8(up),%fr8		C load next up limb
+	xmpyu		%fr4l,%fr4r,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr4r,%fr4r,%fr5	C multiply in fp regs
+	fstd		%fr5,0(rp)
+	xmpyu		%fr4l,%fr4l,%fr7
+	fstd		%fr7,8(rp)
+	ldd		-120(%r30),p32
+	ldd		-16(rp),p00		C accumulate in int regs
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	addib,=		-1,n,L(exit)
+	ldo		16(rp),rp
+
+	fldds,ma	8(up),%fr4
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-120(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	addib,<>	-1,n,L(loop)
+	ldo		16(rp),rp
+
+LDEF(end2)
+	xmpyu		%fr4l,%fr4r,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr4r,%fr4r,%fr5
+	fstd		%fr5,0(rp)
+	xmpyu		%fr4l,%fr4l,%fr7
+	fstd		%fr7,8(rp)
+	ldd		-120(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	ldo		16(rp),rp
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,30,31,t0
+	add		t0,p00,p00
+	std		p00,-16(rp)
+	extrd,u		p32,32,33,t1
+	add,dc		t1,p64,p64
+	std		p64,-8(rp)
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+
+LDEF(exit)
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-120(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,31,32,t0
+	add		t0,p00,p00
+	extrd,u		p32,31,32,t1
+	add,dc		t1,p64,p64
+	add		t0,p00,p00
+	add,dc		t1,p64,p64
+	std		p00,-16(rp)
+	std		p64,-8(rp)
+	ldo		16(rp),rp
+	ldd		-120(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,31,32,t0
+	add		t0,p00,p00
+	extrd,u		p32,31,32,t1
+	add,dc		t1,p64,p64
+	add		t0,p00,p00
+	add,dc		t1,p64,p64
+	std		p00,-16(rp)
+	std		p64,-8(rp)
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+
+LDEF(end1)
+	xmpyu		%fr8l,%fr8r,%fr10
+	fstd		%fr10,-128(%r30)
+	xmpyu		%fr8r,%fr8r,%fr9
+	fstd		%fr9,0(rp)
+	xmpyu		%fr8l,%fr8l,%fr11
+	fstd		%fr11,8(rp)
+	ldo		16(rp),rp
+	ldd		-128(%r30),p32
+	ldd		-16(rp),p00
+	ldd		-8(rp),p64
+	depd,z		p32,31,32,t0
+	add		t0,p00,p00
+	extrd,u		p32,31,32,t1
+	add,dc		t1,p64,p64
+	add		t0,p00,p00
+	add,dc		t1,p64,p64
+	std		p00,-16(rp)
+	std		p64,-8(rp)
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+EPILOGUE(mpn_sqr_diagonal)
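
The net effect of the half-square accumulation above, sketched in C for 64-bit limbs (illustrative names, not GMP's own C code). The asm splits up[i]^2 into xmpyu half-products and folds in the doubled cross term; the depd,z/extrd,u pairs implement the shift by 33 across two limbs.

    #include <stdint.h>

    /* Sketch: store the 128-bit square of each input limb into two
       output limbs, i.e. {rp, 2n} receives up[0]^2, up[1]^2, ... */
    void sqr_diagonal_ref(uint64_t *rp, const uint64_t *up, long n)
    {
      for (long i = 0; i < n; i++) {
        unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
        rp[2 * i]     = (uint64_t)sq;
        rp[2 * i + 1] = (uint64_t)(sq >> 64);
      }
    }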
diff --git a/third_party/gmp/mpn/pa64/submul_1.asm b/third_party/gmp/mpn/pa64/submul_1.asm
new file mode 100644
index 0000000..f8a1968
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/submul_1.asm
@@ -0,0 +1,700 @@
+dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C 8000,8200:		7
+C 8500,8600,8700:	6.5
+
+C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
+C  could be saved there per call.
+
+C  DESCRIPTION:
+C  The main loop "BIG" is 4-way unrolled, mainly to allow effective use of
+C  ADD,DC.  Delays in moving data via the cache from the FP registers to the
+C  IU registers have demanded a deep software pipeline and a lot of stack
+C  slots for partial products in flight.
+C
+C  CODE STRUCTURE:
+C  save-some-registers
+C  do 0, 1, 2, or 3 limbs
+C  if done, restore-some-regs and return
+C  save-many-regs
+C  do 4, 8, ... limbs
+C  restore-all-regs
+
+C  STACK LAYOUT:
+C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
+C  slots marked FREE, as well as some slots in the caller's "frame marker".
+C
+C -00 <- r30
+C -08  FREE
+C -10  tmp
+C -18  tmp
+C -20  tmp
+C -28  tmp
+C -30  tmp
+C -38  tmp
+C -40  tmp
+C -48  tmp
+C -50  tmp
+C -58  tmp
+C -60  tmp
+C -68  tmp
+C -70  tmp
+C -78  tmp
+C -80  tmp
+C -88  tmp
+C -90  FREE
+C -98  FREE
+C -a0  FREE
+C -a8  FREE
+C -b0  r13
+C -b8  r12
+C -c0  r11
+C -c8  r10
+C -d0  r9
+C -d8  r8
+C -e0  r7
+C -e8  r6
+C -f0  r5
+C -f8  r4
+C -100 r3
+C  Previous frame:
+C  [unused area]
+C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
+
+
+C INPUT PARAMETERS:
+define(`rp',`%r26')	C
+define(`up',`%r25')	C
+define(`n',`%r24')	C
+define(`vlimb',`%r23')	C
+
+define(`climb',`%r23')	C
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_submul_1)
+
+ifdef(`HAVE_ABI_2_0w',
+`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
+')
+	std,ma		%r3, 0x100(%r30)
+	std		%r4, -0xf8(%r30)
+	std		%r5, -0xf0(%r30)
+	ldo		0(%r0), climb		C clear climb
+	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+
+define(`m032',`%r20')	C
+define(`m096',`%r21')	C
+
+define(`p000a',`%r22')	C
+define(`p064a',`%r29')	C
+
+define(`s000',`%r31')	C
+
+define(`ma000',`%r4')	C
+define(`ma064',`%r20')	C
+
+define(`r000',`%r3')	C
+
+	extrd,u		n, 63, 2, %r5
+	cmpb,=		%r5, %r0, L(BIG)
+	nop
+
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	addib,<>	-1, %r5, L(two_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(one)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x80(%r30), p000a
+	b		L(0_one_out)
+	ldd		-0x68(%r30), p064a
+
+LDEF(two_or_more)
+	fldd		0(up), %fr4
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	ldd		-0x68(%r30), p064a
+	addib,<>	-1, %r5, L(three_or_more)
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+LDEF(two)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	b		L(0_two_out)
+	depd		m096, 31, 32, ma064
+
+LDEF(three_or_more)
+	fldd		0(up), %fr4
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+C	addib,=		-1, %r5, L(0_out)
+	depd		m096, 31, 32, ma064
+LDEF(loop0)
+C	xmpyu		%fr8R, %fr4L, %fr22
+C	xmpyu		%fr8L, %fr4R, %fr23
+C	ldd		-0x78(%r30), p032a1
+C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+C
+C	xmpyu		%fr8R, %fr4R, %fr24
+C	xmpyu		%fr8L, %fr4L, %fr25
+C	ldd		-0x70(%r30), p032a2
+C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+C
+C	ldo		8(rp), rp
+C	add		climb, p000a, s000
+C	ldd		-0x80(%r30), p000a
+C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+C
+C	add,dc		p064a, %r0, climb
+C	ldo		8(up), up
+C	ldd		-0x68(%r30), p064a
+C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+C
+C	add		ma000, s000, s000
+C	add,dc		ma064, climb, climb
+C	fldd		0(up), %fr4
+C
+C	sub		r000, s000, s000
+C	sub,db		%r0, climb, climb
+C	sub		%r0, climb, climb
+C	std		s000, -8(rp)
+C
+C	add		p032a1, p032a2, m032
+C	add,dc		%r0, %r0, m096
+C
+C	depd,z		m032, 31, 32, ma000
+C	extrd,u		m032, 31, 32, ma064
+C	ldd		0(rp), r000
+C	addib,<>	-1, %r5, L(loop0)
+C	depd		m096, 31, 32, ma064
+LDEF(0_out)
+	ldo		8(up), up
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	ldd		-0x78(%r30), p032a1
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr4R, %fr24
+	xmpyu		%fr8L, %fr4L, %fr25
+	ldd		-0x70(%r30), p032a2
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	sub		r000, s000, s000
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s000, -8(rp)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+LDEF(0_two_out)
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldo		8(rp), rp
+	add		climb, p000a, s000
+	ldd		-0x80(%r30), p000a
+	add,dc		p064a, %r0, climb
+	ldd		-0x68(%r30), p064a
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	sub		r000, s000, s000
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s000, -8(rp)
+LDEF(0_one_out)
+	add		p032a1, p032a2, m032
+	add,dc		%r0, %r0, m096
+	depd,z		m032, 31, 32, ma000
+	extrd,u		m032, 31, 32, ma064
+	ldd		0(rp), r000
+	depd		m096, 31, 32, ma064
+
+	add		climb, p000a, s000
+	add,dc		p064a, %r0, climb
+	add		ma000, s000, s000
+	add,dc		ma064, climb, climb
+	sub		r000, s000, s000
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s000, 0(rp)
+
+	cmpib,>=	4, n, L(done)
+	ldo		8(rp), rp
+
+C 4-way unrolled code.
+
+LDEF(BIG)
+
+define(`p032a1',`%r1')	C
+define(`p032a2',`%r19')	C
+define(`p096b1',`%r20')	C
+define(`p096b2',`%r21')	C
+define(`p160c1',`%r22')	C
+define(`p160c2',`%r29')	C
+define(`p224d1',`%r31')	C
+define(`p224d2',`%r3')	C
+			C
+define(`m032',`%r4')	C
+define(`m096',`%r5')	C
+define(`m160',`%r6')	C
+define(`m224',`%r7')	C
+define(`m288',`%r8')	C
+			C
+define(`p000a',`%r1')	C
+define(`p064a',`%r19')	C
+define(`p064b',`%r20')	C
+define(`p128b',`%r21')	C
+define(`p128c',`%r22')	C
+define(`p192c',`%r29')	C
+define(`p192d',`%r31')	C
+define(`p256d',`%r3')	C
+			C
+define(`s000',`%r10')	C
+define(`s064',`%r11')	C
+define(`s128',`%r12')	C
+define(`s192',`%r13')	C
+			C
+define(`ma000',`%r9')	C
+define(`ma064',`%r4')	C
+define(`ma128',`%r5')	C
+define(`ma192',`%r6')	C
+define(`ma256',`%r7')	C
+			C
+define(`r000',`%r1')	C
+define(`r064',`%r19')	C
+define(`r128',`%r20')	C
+define(`r192',`%r21')	C
+
+	std		%r6, -0xe8(%r30)
+	std		%r7, -0xe0(%r30)
+	std		%r8, -0xd8(%r30)
+	std		%r9, -0xd0(%r30)
+	std		%r10, -0xc8(%r30)
+	std		%r11, -0xc0(%r30)
+	std		%r12, -0xb8(%r30)
+	std		%r13, -0xb0(%r30)
+
+ifdef(`HAVE_ABI_2_0w',
+`	extrd,u		n, 61, 62, n		C right shift 2
+',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
+')
+
+LDEF(4_or_more)
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,<>	-1, n, L(8_or_more)
+	xmpyu		%fr8L, %fr7L, %fr27
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		-0x78(%r30), p032a1
+	ldd		-0x70(%r30), p032a2
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	b		L(end1)
+	nop
+
+LDEF(8_or_more)
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	fldd		0(up), %fr4
+	fldd		8(up), %fr5
+	fldd		16(up), %fr6
+	fldd		24(up), %fr7
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	addib,=		-1, n, L(end2)
+	xmpyu		%fr8L, %fr7L, %fr27
+LDEF(loop)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	ldo		32(up), up
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+
+	add,dc		ma128, s128, s128	C accum mid 2
+	fldd		0(up), %fr4
+	add,dc		ma192, s192, s192	C accum mid 3
+	fldd		8(up), %fr5
+
+	add,dc		ma256, climb, climb
+	fldd		16(up), %fr6
+	sub		r000, s000, s000	C accum rlimb 0
+	fldd		24(up), %fr7
+
+	sub,db		r064, s064, s064	C accum rlimb 1
+	sub,db		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+
+	sub,db		r192, s192, s192	C accum rlimb 3
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s064, 8(rp)
+
+	xmpyu		%fr8R, %fr4L, %fr22
+	ldd		-0x78(%r30), p032a1
+	xmpyu		%fr8L, %fr4R, %fr23
+	std		s128, 16(rp)
+
+	xmpyu		%fr8R, %fr5L, %fr24
+	ldd		-0x70(%r30), p032a2
+	xmpyu		%fr8L, %fr5R, %fr25
+	std		s192, 24(rp)
+
+	xmpyu		%fr8R, %fr6L, %fr26
+	ldd		-0x38(%r30), p096b1
+	xmpyu		%fr8L, %fr6R, %fr27
+	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
+
+	xmpyu		%fr8R, %fr7L, %fr28
+	ldd		-0x30(%r30), p096b2
+	xmpyu		%fr8L, %fr7R, %fr29
+	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
+
+	xmpyu		%fr8R, %fr4R, %fr30
+	ldd		-0x58(%r30), p160c1
+	xmpyu		%fr8L, %fr4L, %fr31
+	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
+
+	xmpyu		%fr8R, %fr5R, %fr22
+	ldd		-0x50(%r30), p160c2
+	xmpyu		%fr8L, %fr5L, %fr23
+	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
+
+	xmpyu		%fr8R, %fr6R, %fr24
+	ldd		-0x18(%r30), p224d1
+	xmpyu		%fr8L, %fr6L, %fr25
+	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
+
+	xmpyu		%fr8R, %fr7R, %fr26
+	ldd		-0x10(%r30), p224d2
+	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
+	xmpyu		%fr8L, %fr7L, %fr27
+
+	addib,<>	-1, n, L(loop)
+	ldo		32(rp), rp
+
+LDEF(end2)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	sub		r000, s000, s000	C accum rlimb 0
+	sub,db		r064, s064, s064	C accum rlimb 1
+	sub,db		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	sub,db		r192, s192, s192	C accum rlimb 3
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s064, 8(rp)
+	ldd		-0x78(%r30), p032a1
+	std		s128, 16(rp)
+	ldd		-0x70(%r30), p032a2
+	std		s192, 24(rp)
+	ldd		-0x38(%r30), p096b1
+	ldd		-0x30(%r30), p096b2
+	ldd		-0x58(%r30), p160c1
+	ldd		-0x50(%r30), p160c2
+	ldd		-0x18(%r30), p224d1
+	ldd		-0x10(%r30), p224d2
+	ldo		32(rp), rp
+
+LDEF(end1)
+	add		p032a1, p032a2, m032
+	ldd		-0x80(%r30), p000a
+	add,dc		p096b1, p096b2, m096
+	add,dc		p160c1, p160c2, m160
+	ldd		-0x68(%r30), p064a
+	add,dc		p224d1, p224d2, m224
+	add,dc		%r0, %r0, m288
+	ldd		-0x40(%r30), p064b
+	depd,z		m032, 31, 32, ma000
+	ldd		-0x28(%r30), p128b
+	extrd,u		m032, 31, 32, ma064
+	depd		m096, 31, 32, ma064
+	ldd		-0x60(%r30), p128c
+	extrd,u		m096, 31, 32, ma128
+	depd		m160, 31, 32, ma128
+	ldd		-0x48(%r30), p192c
+	extrd,u		m160, 31, 32, ma192
+	depd		m224, 31, 32, ma192
+	ldd		-0x20(%r30), p192d
+	extrd,u		m224, 31, 32, ma256
+	depd		m288, 31, 32, ma256
+	ldd		-0x88(%r30), p256d
+	add		climb, p000a, s000
+	add,dc		p064a, p064b, s064
+	ldd		0(rp), r000
+	add,dc		p128b, p128c, s128
+	add,dc		p192c, p192d, s192
+	ldd		8(rp), r064
+	add,dc		p256d, %r0, climb
+	ldd		16(rp), r128
+	add		ma000, s000, s000	C accum mid 0
+	ldd		24(rp), r192
+	add,dc		ma064, s064, s064	C accum mid 1
+	add,dc		ma128, s128, s128	C accum mid 2
+	add,dc		ma192, s192, s192	C accum mid 3
+	add,dc		ma256, climb, climb
+	sub		r000, s000, s000	C accum rlimb 0
+	sub,db		r064, s064, s064	C accum rlimb 1
+	sub,db		r128, s128, s128	C accum rlimb 2
+	std		s000, 0(rp)
+	sub,db		r192, s192, s192	C accum rlimb 3
+	sub,db		%r0, climb, climb
+	sub		%r0, climb, climb
+	std		s064, 8(rp)
+	std		s128, 16(rp)
+	std		s192, 24(rp)
+
+	ldd		-0xb0(%r30), %r13
+	ldd		-0xb8(%r30), %r12
+	ldd		-0xc0(%r30), %r11
+	ldd		-0xc8(%r30), %r10
+	ldd		-0xd0(%r30), %r9
+	ldd		-0xd8(%r30), %r8
+	ldd		-0xe0(%r30), %r7
+	ldd		-0xe8(%r30), %r6
+LDEF(done)
+ifdef(`HAVE_ABI_2_0w',
+`	copy		climb, %r28
+',`	extrd,u		climb, 63, 32, %r29
+	extrd,u		climb, 31, 32, %r28
+')
+	ldd		-0xf0(%r30), %r5
+	ldd		-0xf8(%r30), %r4
+	bve		(%r2)
+	ldd,mb		-0x100(%r30), %r3
+EPILOGUE(mpn_submul_1)
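
For reference, a C sketch of the contract mpn_submul_1 implements, assuming 64-bit limbs and unsigned __int128 (illustrative, not GMP's own C code). The sub,db/sub pairs in the asm convert PA-RISC's borrow convention into the positive climb value that this loop threads through:

    #include <stdint.h>

    /* Sketch: rp[] -= up[] * v, returning the borrow ("climb"). */
    uint64_t submul_1_ref(uint64_t *rp, const uint64_t *up, long n, uint64_t v)
    {
      uint64_t climb = 0;
      for (long i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * v + climb;
        uint64_t lo = (uint64_t)p;
        climb = (uint64_t)(p >> 64);
        uint64_t r = rp[i];
        rp[i] = r - lo;
        climb += (r < lo);              /* the borrow feeds the next limb */
      }
      return climb;
    }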
diff --git a/third_party/gmp/mpn/pa64/udiv.asm b/third_party/gmp/mpn/pa64/udiv.asm
new file mode 100644
index 0000000..1380a85
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/udiv.asm
@@ -0,0 +1,125 @@
+dnl  HP-PA 2.0 64-bit mpn_udiv_qrnnd_r.
+
+dnl  Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This runs at about 280 cycles on both PA8000 and PA8500, corresponding to a
+C bit more than 4 cycles/bit.
+
+C INPUT PARAMETERS
+define(`n1',`%r26')
+define(`n0',`%r25')
+define(`d',`%r24')
+define(`remptr',`%r23')
+
+define(`q',`%r28')
+define(`dn',`%r29')
+
+define(`old_divstep',
+       `add,dc		n0,n0,n0
+	add,dc		n1,n1,n1
+	sub,*<<		n1,d,%r22
+	copy		%r22,n1')
+
+define(`divstep',
+       `add		n0,n0,n0
+	add,dc		n1,n1,n1
+	sub		n1,d,%r1
+	add,dc		q,q,q
+	cmpclr,*<<	n1,d,%r0
+	copy		%r1,n1
+')
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_udiv_qrnnd_r)
+ifdef(`HAVE_ABI_2_0n',
+`	depd		%r25,31,32,%r26
+	depd		%r23,31,32,%r24
+	copy		%r24,%r25
+	ldd		-56(%r30),%r24
+	ldw		-60(%r30),%r23
+')
+	ldi		0,q
+	cmpib,*>=	0,d,L(large_divisor)
+	ldi		8,%r31		C setup loop counter
+
+	sub		%r0,d,dn
+LDEF(Loop)
+	divstep divstep divstep divstep divstep divstep divstep divstep
+	addib,<>	-1,%r31,L(Loop)
+	nop
+
+ifdef(`HAVE_ABI_2_0n',
+`	copy		%r28,%r29
+	extrd,u		%r28,31,32,%r28
+')
+	bve		(%r2)
+	std		n1,0(remptr)	C store remainder
+
+LDEF(large_divisor)
+	extrd,u		n0,63,1,%r19	C save lsb of dividend
+	shrpd		n1,n0,1,n0	C n0 = lo(n1n0 >> 1)
+	shrpd		%r0,n1,1,n1	C n1 = hi(n1n0 >> 1)
+	extrd,u		d,63,1,%r20	C save lsb of divisor
+	shrpd		%r0,d,1,d	C d = floor(orig_d / 2)
+	add,l		%r20,d,d	C d = ceil(orig_d / 2)
+
+	sub		%r0,d,dn
+LDEF(Loop2)
+	divstep divstep divstep divstep divstep divstep divstep divstep
+	addib,<>	-1,%r31,L(Loop2)
+	nop
+
+	cmpib,*=	0,%r20,L(even_divisor)
+	shladd		n1,1,%r19,n1	C shift in omitted dividend lsb
+
+	add		d,d,d		C restore orig...
+	sub		d,%r20,d	C ...d value
+	sub		%r0,d,dn	C dn = -d
+
+	add,*nuv	n1,q,n1		C fix remainder for omitted divisor lsb
+	add,l		n1,dn,n1	C adjust remainder if rem. fix carried
+	add,dc		%r0,q,q		C adjust quotient accordingly
+
+	sub,*<<		n1,d,%r0	C remainder >= divisor?
+	add,l		n1,dn,n1	C adjust remainder
+	add,dc		%r0,q,q		C adjust quotient
+
+LDEF(even_divisor)
+ifdef(`HAVE_ABI_2_0n',
+`	copy		%r28,%r29
+	extrd,u		%r28,31,32,%r28
+')
+	bve		(%r2)
+	std		n1,0(remptr)	C store remainder
+EPILOGUE(mpn_udiv_qrnnd_r)
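
The divstep macro above is one restoring shift-and-subtract step, and the fast path runs it 64 times (8 copies per pass, 8 passes). A hedged C rendering under the fast path's preconditions, d < 2^63 and n1 < d on entry, with names of my choosing:

    #include <stdint.h>

    /* One step: shift the remainder pair n1:n0 left one bit, then
       conditionally subtract d and record the quotient bit.  The
       invariant n1 < d < 2^63 means the shift cannot overflow. */
    static void divstep_ref(uint64_t *n1, uint64_t *n0, uint64_t *q, uint64_t d)
    {
      *n1 = (*n1 << 1) | (*n0 >> 63);
      *n0 <<= 1;
      *q <<= 1;
      if (*n1 >= d) {     /* cmpclr,*<< nullifies the commit when n1 < d */
        *n1 -= d;         /* copy %r1,n1 commits the tentative subtract  */
        *q |= 1;          /* add,dc shifts the no-borrow carry into q    */
      }
    }

64 iterations leave the quotient in q and the remainder in n1, matching the q return value and the store through remptr.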
diff --git a/third_party/gmp/mpn/pa64/umul.asm b/third_party/gmp/mpn/pa64/umul.asm
new file mode 100644
index 0000000..bd5a71f
--- /dev/null
+++ b/third_party/gmp/mpn/pa64/umul.asm
@@ -0,0 +1,97 @@
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Optimizations:
+dnl  * Avoid skip instructions
+dnl  * Put carry-generating and carry-consuming insns consecutively
+dnl  * Don't allocate any stack; the "home" positions for parameters could be used.
+
+include(`../config.m4')
+
+define(`p0',`%r28')
+define(`p1',`%r29')
+define(`t32',`%r19')
+define(`t0',`%r20')
+define(`t1',`%r21')
+define(`x',`%r22')
+define(`m0',`%r23')
+define(`m1',`%r24')
+
+ifdef(`HAVE_ABI_2_0w',
+`	.level	2.0w
+',`	.level	2.0
+')
+PROLOGUE(mpn_umul_ppmm_r)
+	ldo		128(%r30),%r30
+ifdef(`HAVE_ABI_2_0w',
+`	std		%r26,-64(%r30)
+	std		%r25,-56(%r30)
+	copy		%r24,%r31
+',`
+	depd		%r25,31,32,%r26
+	std		%r26,-64(%r30)
+	depd		%r23,31,32,%r24
+	std		%r24,-56(%r30)
+	ldw		-180(%r30),%r31
+')
+
+	fldd		-64(%r30),%fr4
+	fldd		-56(%r30),%fr5
+
+	xmpyu		%fr5R,%fr4R,%fr6
+	fstd		%fr6,-128(%r30)
+	xmpyu		%fr5R,%fr4L,%fr7
+	fstd		%fr7,-120(%r30)
+	xmpyu		%fr5L,%fr4R,%fr8
+	fstd		%fr8,-112(%r30)
+	xmpyu		%fr5L,%fr4L,%fr9
+	fstd		%fr9,-104(%r30)
+
+	depdi,z		1,31,1,t32		C t32 = 2^32
+
+	ldd		-128(%r30),p0		C lo = low 64 bit of product
+	ldd		-120(%r30),m0		C m0 = mid0 64 bit of product
+	ldd		-112(%r30),m1		C m1 = mid1 64 bit of product
+	ldd		-104(%r30),p1		C hi = high 64 bit of product
+
+	add,l,*nuv	m0,m1,x			C x = m1+m0
+	 add,l		t32,p1,p1		C propagate carry to mid of p1
+	depd,z		x,31,32,t0		C lo32(m1+m0)
+	add		t0,p0,p0
+	extrd,u		x,31,32,t1		C hi32(m1+m0)
+	add,dc		t1,p1,p1
+
+	std		p0,0(%r31)		C store low half of product
+ifdef(`HAVE_ABI_2_0w',
+`	copy		p1,%r28			C return val in %r28
+',`	extrd,u		p1,31,32,%r28		C return val in %r28,%r29
+')
+	bve		(%r2)
+	ldo		-128(%r30),%r30
+EPILOGUE(mpn_umul_ppmm_r)
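
A C sketch of the recombination umul.asm performs (names mine): hi:lo = u * v is rebuilt from the four 32x32 half-products that xmpyu delivers, with t32 = 2^32 standing in for the carry weight of the mid-product sum.

    #include <stdint.h>

    void umul_ppmm_ref(uint64_t *hi, uint64_t *lo, uint64_t u, uint64_t v)
    {
      uint64_t ul = (uint32_t)u, uh = u >> 32;
      uint64_t vl = (uint32_t)v, vh = v >> 32;
      uint64_t p0 = ul * vl;                 /* low product  */
      uint64_t m0 = ul * vh;                 /* mid products */
      uint64_t m1 = uh * vl;
      uint64_t p1 = uh * vh;                 /* high product */
      uint64_t x  = m0 + m1;                 /* add,l,*nuv: detect mid carry */
      if (x < m0)
        p1 += (uint64_t)1 << 32;             /* carry weighs 2^32 into p1 */
      uint64_t t0 = x << 32;                 /* lo32(m1+m0) */
      *lo = p0 + t0;
      *hi = p1 + (x >> 32) + (*lo < t0);     /* add,dc propagates the carry */
    }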
diff --git a/third_party/gmp/mpn/power/add_n.asm b/third_party/gmp/mpn/power/add_n.asm
new file mode 100644
index 0000000..6d6ca73
--- /dev/null
+++ b/third_party/gmp/mpn/power/add_n.asm
@@ -0,0 +1,83 @@
+dnl  IBM POWER mpn_add_n -- Add two limb vectors of equal, non-zero length.
+
+dnl  Copyright 1992, 1994-1996, 1999-2001, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s1_ptr	r4
+dnl  s2_ptr	r5
+dnl  size	r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	andil.	10,6,1		C odd or even number of limbs?
+	l	8,0(4)		C load least significant s1 limb
+	l	0,0(5)		C load least significant s2 limb
+	cal	3,-4(3)		C offset res_ptr, it's updated before it's used
+	sri	10,6,1		C count for unrolled loop
+	a	7,0,8		C add least significant limbs, set cy
+	mtctr	10		C copy count into CTR
+	beq	0,Leven		C branch if even # of limbs (# of limbs >= 2)
+
+C We have an odd # of limbs.  Add the first limbs separately.
+	cmpi	1,10,0		C is count for unrolled loop zero?
+	bc	4,6,L1		C bne cr1,L1 (misassembled by gas)
+	st	7,4(3)
+	aze	3,10		C use the fact that r10 is zero...
+	br			C return
+
+C We added least significant limbs.  Now reload the next limbs to enter loop.
+L1:	lu	8,4(4)		C load s1 limb and update s1_ptr
+	lu	0,4(5)		C load s2 limb and update s2_ptr
+	stu	7,4(3)
+	ae	7,0,8		C add limbs, set cy
+Leven:	lu	9,4(4)		C load s1 limb and update s1_ptr
+	lu	10,4(5)		C load s2 limb and update s2_ptr
+	bdz	Lend		C If done, skip loop
+
+Loop:	lu	8,4(4)		C load s1 limb and update s1_ptr
+	lu	0,4(5)		C load s2 limb and update s2_ptr
+	ae	11,10,9		C add previous limbs with cy, set cy
+	stu	7,4(3)		C
+	lu	9,4(4)		C load s1 limb and update s1_ptr
+	lu	10,4(5)		C load s2 limb and update s2_ptr
+	ae	7,0,8		C add previous limbs with cy, set cy
+	stu	11,4(3)		C
+	bdn	Loop		C decrement CTR and loop back
+
+Lend:	ae	11,10,9		C add limbs with cy, set cy
+	st	7,4(3)		C
+	st	11,8(3)		C
+	lil	3,0		C load cy into ...
+	aze	3,3		C ... return value register
+	br
+EPILOGUE(mpn_add_n)
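
The loop above peels one limb when the count is odd, then runs 2-way unrolled; its contract, sketched in C with the 32-bit limbs this POWER port uses (illustrative names):

    #include <stdint.h>

    /* Sketch: rp[] = s1[] + s2[], returning the carry out of the top limb. */
    uint32_t add_n_ref(uint32_t *rp, const uint32_t *s1,
                       const uint32_t *s2, long n)
    {
      uint32_t cy = 0;
      for (long i = 0; i < n; i++) {
        uint32_t t = s1[i] + s2[i];
        uint32_t c1 = t < s1[i];        /* carry from the limb add */
        uint32_t s = t + cy;
        cy = c1 | (s < t);              /* carry from adding cy in */
        rp[i] = s;
      }
      return cy;
    }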
diff --git a/third_party/gmp/mpn/power/addmul_1.asm b/third_party/gmp/mpn/power/addmul_1.asm
new file mode 100644
index 0000000..76d8df3
--- /dev/null
+++ b/third_party/gmp/mpn/power/addmul_1.asm
@@ -0,0 +1,126 @@
+dnl  IBM POWER mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s1_ptr	r4
+dnl  size	r5
+dnl  s2_limb	r6
+
+dnl  The POWER architecture has no unsigned 32x32->64 bit multiplication
+dnl  instruction.  To obtain that operation, we have to use the 32x32->64
+dnl  signed multiplication instruction, and add the appropriate compensation to
+dnl  the high limb of the result.  We add the multiplicand if the multiplier
+dnl  has its most significant bit set, and we add the multiplier if the
+dnl  multiplicand has its most significant bit set.  We need to preserve the
+dnl  carry flag between iterations, so we have to compute the compensation
+dnl  carefully (the natural srai+and approach doesn't work).  Since all POWER
+dnl  processors can branch in zero cycles, we use conditional branches for the
+dnl  compensation.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	8
+	cax	9,9,7
+	l	7,4(3)
+	a	8,8,7		C add res_limb
+	blt	Lneg
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9		C low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		C propagate cy to new cy_limb
+	a	8,8,7		C add res_limb
+	bge	Lp0
+	cax	10,10,6		C adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	l	7,4(3)
+	aze	9,9
+	a	8,8,7
+	bge	Lp1
+	cax	9,9,6		C adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	7
+	ae	8,7,9
+	l	7,4(3)
+	ae	10,10,0		C propagate cy to new cy_limb
+	a	8,8,7		C add res_limb
+	bge	Ln0
+	cax	10,10,6		C adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	8,7,10
+	l	7,4(3)
+	ae	9,9,0		C propagate cy to new cy_limb
+	a	8,8,7		C add res_limb
+	bge	Ln1
+	cax	9,9,6		C adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
+EPILOGUE(mpn_addmul_1)
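
The signed-multiply compensation described in the header comment can be stated compactly in C; a hedged sketch, in which the two conditional additions correspond to the cax instructions that the bge branches skip when a limb is non-negative:

    #include <stdint.h>

    /* Sketch: recover the unsigned 32x32->64 product from a signed
       multiply.  If an operand's sign bit is set, the signed result is
       short by the other operand times 2^32, modulo 2^64. */
    uint64_t umul32_ref(uint32_t a, uint32_t b)
    {
      uint64_t prod = (uint64_t)((int64_t)(int32_t)a * (int32_t)b);
      if ((int32_t)a < 0)
        prod += (uint64_t)b << 32;      /* compensate negative multiplicand */
      if ((int32_t)b < 0)
        prod += (uint64_t)a << 32;      /* compensate negative multiplier   */
      return prod;
    }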
diff --git a/third_party/gmp/mpn/power/gmp-mparam.h b/third_party/gmp/mpn/power/gmp-mparam.h
new file mode 100644
index 0000000..7cb36f9
--- /dev/null
+++ b/third_party/gmp/mpn/power/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* POWER gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2002-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* Generated by tuneup.c, 2003-02-10, gcc 3.2, POWER2 66.7MHz */
+
+#define MUL_TOOM22_THRESHOLD             12
+#define MUL_TOOM33_THRESHOLD             75
+
+#define SQR_BASECASE_THRESHOLD            7
+#define SQR_TOOM2_THRESHOLD              28
+#define SQR_TOOM3_THRESHOLD              86
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 36
+#define POWM_THRESHOLD                   69
+
+#define HGCD_THRESHOLD                   97
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                590
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD          12
+#define DIVREM_1_UNNORM_THRESHOLD     MP_SIZE_T_MAX  /* never */
+#define MOD_1_NORM_THRESHOLD             10
+#define MOD_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD               11
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             10
+#define GET_STR_PRECOMPUTE_THRESHOLD     20
+#define SET_STR_THRESHOLD              2899
+
+#define MUL_FFT_TABLE  { 336, 800, 1408, 3584, 10240, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD          296
+#define MUL_FFT_THRESHOLD              2304
+
+#define SQR_FFT_TABLE  { 336, 800, 1408, 3584, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD          296
+#define SQR_FFT_THRESHOLD              2304
diff --git a/third_party/gmp/mpn/power/lshift.asm b/third_party/gmp/mpn/power/lshift.asm
new file mode 100644
index 0000000..efa2105
--- /dev/null
+++ b/third_party/gmp/mpn/power/lshift.asm
@@ -0,0 +1,61 @@
+dnl  IBM POWER mpn_lshift -- Shift a number left.
+
+dnl  Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s_ptr	r4
+dnl  size	r5
+dnl  cnt	r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	sli	0,5,2
+	cax	9,3,0
+	cax	4,4,0
+	sfi	8,6,32
+	mtctr	5		C put limb count in CTR loop register
+	lu	0,-4(4)		C read most significant limb
+	sre	3,0,8		C compute carry out limb, and init MQ register
+	bdz	Lend2		C if just one limb, skip loop
+	lu	0,-4(4)		C read 2nd most significant limb
+	sreq	7,0,8		C compute most significant limb of result
+	bdz	Lend		C if just two limbs, skip loop
+Loop:	lu	0,-4(4)		C load next lower limb
+	stu	7,-4(9)		C store previous result during read latency
+	sreq	7,0,8		C compute result limb
+	bdn	Loop		C loop back until CTR is zero
+Lend:	stu	7,-4(9)		C store 2nd least significant limb
+Lend2:	sle	7,0,6		C compute least significant limb
+	st	7,-4(9)		C store it
+	br
+EPILOGUE(mpn_lshift)
diff --git a/third_party/gmp/mpn/power/mul_1.asm b/third_party/gmp/mpn/power/mul_1.asm
new file mode 100644
index 0000000..38b7b66
--- /dev/null
+++ b/third_party/gmp/mpn/power/mul_1.asm
@@ -0,0 +1,113 @@
+dnl  IBM POWER mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl  result in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s1_ptr	r4
+dnl  size	r5
+dnl  s2_limb	r6
+
+dnl  The POWER architecture has no unsigned 32x32->64 bit multiplication
+dnl  instruction.  To obtain that operation, we have to use the 32x32->64
+dnl  signed multiplication instruction, and add the appropriate compensation to
+dnl  the high limb of the result.  We add the multiplicand if the multiplier
+dnl  has its most significant bit set, and we add the multiplier if the
+dnl  multiplicand has its most significant bit set.  We need to preserve the
+dnl  carry flag between iterations, so we have to compute the compensation
+dnl  carefully (the natural srai+and sequence doesn't work).  Since all POWER
+dnl  processors can branch in zero cycles, we use conditional branches for the
+dnl  compensation.
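+dnl
+dnl  For reference, the compensation amounts to the following (an
+dnl  illustrative C sketch using <stdint.h> types, with a hypothetical
+dnl  function name; not part of this file):
+dnl
+dnl    uint64_t umul (uint32_t a, uint32_t b)  /* unsigned 32x32->64 */
+dnl    {
+dnl      int64_t  p  = (int64_t) (int32_t) a * (int32_t) b;
+dnl      uint32_t hi = (uint32_t) ((uint64_t) p >> 32);
+dnl      if ((int32_t) a < 0)  hi += b;  /* MSB of a set */
+dnl      if ((int32_t) b < 0)  hi += a;  /* MSB of b set */
+dnl      return ((uint64_t) hi << 32) | (uint32_t) p;
+dnl    }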
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	cal	3,-4(3)		C offset res_ptr, it's updated before it's used
+	l	0,0(4)		C load first s1 limb
+	cmpi	0,6,0		C is s2_limb negative?
+	mtctr	5		C put limb count in CTR
+	mul	9,0,6		C signed high product
+	srai	7,0,31		C sign mask of s1 limb
+	and	7,7,6		C s2_limb if s1 limb is negative, else 0
+	mfmq	8		C low product
+	ai	0,0,0		C reset carry
+	cax	9,9,7		C compensate high product
+	blt	Lneg		C branch if s2_limb is negative
+Lpos:	bdz	Lend
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	8,0,9
+	bge	Lp0
+	cax	10,10,6		C adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	8,0,10
+	bge	Lp1
+	cax	9,9,6		C adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	cax	10,10,0		C adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,9
+	bge	Ln0
+	cax	10,10,6		C adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	cax	9,9,0		C adjust high limb for negative s2_limb
+	mfmq	0
+	ae	8,0,10
+	bge	Ln1
+	cax	9,9,6		C adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/power/rshift.asm b/third_party/gmp/mpn/power/rshift.asm
new file mode 100644
index 0000000..1d1815c
--- /dev/null
+++ b/third_party/gmp/mpn/power/rshift.asm
@@ -0,0 +1,59 @@
+dnl  IBM POWER mpn_rshift -- Shift a number right.
+
+dnl  Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s_ptr	r4
+dnl  size	r5
+dnl  cnt	r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	sfi	8,6,32		C r8 = 32 - cnt
+	mtctr	5		C put limb count in CTR loop register
+	l	0,0(4)		C read least significant limb
+	ai	9,3,-4		C adjust res_ptr since it's offset in the stu:s
+	sle	3,0,8		C compute carry limb, and init MQ register
+	bdz	Lend2		C if just one limb, skip loop
+	lu	0,4(4)		C read 2nd least significant limb
+	sleq	7,0,8		C compute least significant limb of result
+	bdz	Lend		C if just two limbs, skip loop
+Loop:	lu	0,4(4)		C load next higher limb
+	stu	7,4(9)		C store previous result during read latency
+	sleq	7,0,8		C compute result limb
+	bdn	Loop		C loop back until CTR is zero
+Lend:	stu	7,4(9)		C store 2nd most significant limb
+Lend2:	sre	7,0,6		C compute most significant limb
+	st	7,4(9)		C store it
+	br
+EPILOGUE(mpn_rshift)
diff --git a/third_party/gmp/mpn/power/sdiv.asm b/third_party/gmp/mpn/power/sdiv.asm
new file mode 100644
index 0000000..4a9ed14
--- /dev/null
+++ b/third_party/gmp/mpn/power/sdiv.asm
@@ -0,0 +1,39 @@
+dnl  Copyright 1999, 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sdiv_qrnnd)
+	mtmq	5		C MQ = low dividend limb
+	div	0,4,6		C divide (high||MQ) by r6; quotient in r0, remainder in MQ
+	mfmq	9		C fetch remainder
+	st	9,0(3)		C store remainder
+	mr	3,0		C return quotient
+	br
+EPILOGUE(mpn_sdiv_qrnnd)
diff --git a/third_party/gmp/mpn/power/sub_n.asm b/third_party/gmp/mpn/power/sub_n.asm
new file mode 100644
index 0000000..390c802
--- /dev/null
+++ b/third_party/gmp/mpn/power/sub_n.asm
@@ -0,0 +1,85 @@
+dnl  IBM POWER mpn_sub_n -- Subtract two limb vectors of equal, non-zero
+dnl  length.
+
+dnl  Copyright 1992, 1994-1996, 1999-2001, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s1_ptr	r4
+dnl  s2_ptr	r5
+dnl  size	r6
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	andil.	10,6,1		C odd or even number of limbs?
+	l	8,0(4)		C load least significant s1 limb
+	l	0,0(5)		C load least significant s2 limb
+	cal	3,-4(3)		C offset res_ptr, it's updated before it's used
+	sri	10,6,1		C count for unrolled loop
+	sf	7,0,8		C subtract least significant limbs, set cy
+	mtctr	10		C copy count into CTR
+	beq	0,Leven		C branch if even # of limbs (# of limbs >= 2)
+
+C We have an odd # of limbs.  Subtract the first limbs separately.
+	cmpi	1,10,0		C is count for unrolled loop zero?
+	bc	4,6,L1		C bne cr1,L1 (misassembled by gas)
+	st	7,4(3)
+	sfe	3,0,0		C load !cy into ...
+	sfi	3,3,0		C ... return value register
+	br			C return
+
+C We subtracted the least significant limbs.  Now reload the next limbs to enter the loop.
+L1:	lu	8,4(4)		C load s1 limb and update s1_ptr
+	lu	0,4(5)		C load s2 limb and update s2_ptr
+	stu	7,4(3)
+	sfe	7,0,8		C subtract limbs, set cy
+Leven:	lu	9,4(4)		C load s1 limb and update s1_ptr
+	lu	10,4(5)		C load s2 limb and update s2_ptr
+	bdz	Lend		C If done, skip loop
+
+Loop:	lu	8,4(4)		C load s1 limb and update s1_ptr
+	lu	0,4(5)		C load s2 limb and update s2_ptr
+	sfe	11,10,9		C subtract previous limbs with cy, set cy
+	stu	7,4(3)		C
+	lu	9,4(4)		C load s1 limb and update s1_ptr
+	lu	10,4(5)		C load s2 limb and update s2_ptr
+	sfe	7,0,8		C subtract previous limbs with cy, set cy
+	stu	11,4(3)		C
+	bdn	Loop		C decrement CTR and loop back
+
+Lend:	sfe	11,10,9		C subtract limbs with cy, set cy
+	st	7,4(3)		C
+	st	11,8(3)		C
+	sfe	3,0,0		C load !cy into ...
+	sfi	3,3,0		C ... return value register
+	br
+EPILOGUE(mpn_sub_n)
diff --git a/third_party/gmp/mpn/power/submul_1.asm b/third_party/gmp/mpn/power/submul_1.asm
new file mode 100644
index 0000000..1788e0d
--- /dev/null
+++ b/third_party/gmp/mpn/power/submul_1.asm
@@ -0,0 +1,131 @@
+dnl  IBM POWER mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl  the result from a second limb vector.
+
+dnl  Copyright 1992, 1994, 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r3
+dnl  s1_ptr	r4
+dnl  size	r5
+dnl  s2_limb	r6
+
+dnl  The POWER architecture has no unsigned 32x32->64 bit multiplication
+dnl  instruction.  To obtain that operation, we have to use the 32x32->64
+dnl  signed multiplication instruction, and add the appropriate compensation to
+dnl  the high limb of the result.  We add the multiplicand if the multiplier
+dnl  has its most significant bit set, and we add the multiplier if the
+dnl  multiplicand has its most significant bit set.  We need to preserve the
+dnl  carry flag between iterations, so we have to compute the compensation
+dnl  carefully (the natural srai+and sequence doesn't work).  Since all POWER
+dnl  processors can branch in zero cycles, we use conditional branches for the
+dnl  compensation.
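+dnl
+dnl  For reference, the function computes {res_ptr,size} minus
+dnl  {s1_ptr,size} * s2_limb and returns the borrow-out limb (an
+dnl  illustrative C sketch using <stdint.h> types, with a hypothetical
+dnl  function name; not part of this file):
+dnl
+dnl    uint32_t submul_1 (uint32_t *res_ptr, const uint32_t *s1_ptr,
+dnl                       long size, uint32_t s2_limb)
+dnl    {
+dnl      uint32_t cy = 0;
+dnl      for (long i = 0; i < size; i++)
+dnl        {
+dnl          uint64_t p  = (uint64_t) s1_ptr[i] * s2_limb + cy;
+dnl          uint32_t lo = (uint32_t) p;
+dnl          cy = (uint32_t) (p >> 32) + (res_ptr[i] < lo);
+dnl          res_ptr[i] -= lo;
+dnl        }
+dnl      return cy;
+dnl    }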
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	cal	3,-4(3)
+	l	0,0(4)
+	cmpi	0,6,0
+	mtctr	5
+	mul	9,0,6
+	srai	7,0,31
+	and	7,7,6
+	mfmq	11
+	cax	9,9,7
+	l	7,4(3)
+	sf	8,11,7		C add res_limb
+	a	11,8,11		C invert cy (r11 is junk)
+	blt	Lneg
+Lpos:	bdz	Lend
+
+Lploop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	0
+	ae	11,0,9		C low limb + old_cy_limb + old cy
+	l	7,4(3)
+	aze	10,10		C propagate cy to new cy_limb
+	sf	8,11,7		C add res_limb
+	a	11,8,11		C invert cy (r11 is junk)
+	bge	Lp0
+	cax	10,10,6		C adjust high limb for negative limb from s1
+Lp0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	0
+	ae	11,0,10
+	l	7,4(3)
+	aze	9,9
+	sf	8,11,7
+	a	11,8,11		C invert cy (r11 is junk)
+	bge	Lp1
+	cax	9,9,6		C adjust high limb for negative limb from s1
+Lp1:	bdn	Lploop
+
+	b	Lend
+
+Lneg:	cax	9,9,0
+	bdz	Lend
+Lnloop:	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	10,0,6
+	mfmq	7
+	ae	11,7,9
+	l	7,4(3)
+	ae	10,10,0		C propagate cy to new cy_limb
+	sf	8,11,7		C add res_limb
+	a	11,8,11		C invert cy (r11 is junk)
+	bge	Ln0
+	cax	10,10,6		C adjust high limb for negative limb from s1
+Ln0:	bdz	Lend0
+	lu	0,4(4)
+	stu	8,4(3)
+	cmpi	0,0,0
+	mul	9,0,6
+	mfmq	7
+	ae	11,7,10
+	l	7,4(3)
+	ae	9,9,0		C propagate cy to new cy_limb
+	sf	8,11,7		C add res_limb
+	a	11,8,11		C invert cy (r11 is junk)
+	bge	Ln1
+	cax	9,9,6		C adjust high limb for negative limb from s1
+Ln1:	bdn	Lnloop
+	b	Lend
+
+Lend0:	cal	9,0(10)
+Lend:	st	8,4(3)
+	aze	3,9
+	br
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/power/umul.asm b/third_party/gmp/mpn/power/umul.asm
new file mode 100644
index 0000000..5a0599e
--- /dev/null
+++ b/third_party/gmp/mpn/power/umul.asm
@@ -0,0 +1,43 @@
+dnl  Copyright 1999, 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	mul	9,4,5		C signed high product
+	srai	0,4,31		C sign mask of first factor
+	and	0,0,5		C second factor if first has MSB set, else 0
+	srai	5,5,31		C sign mask of second factor
+	and	5,5,4		C first factor if second has MSB set, else 0
+	cax	0,0,5		C combined compensation for the high product
+	mfmq	11		C low product
+	st	11,0(3)		C store low product limb
+	cax	3,9,0		C return compensated high product
+	br
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/powerpc32/750/com.asm b/third_party/gmp/mpn/powerpc32/750/com.asm
new file mode 100644
index 0000000..1b8b574
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/750/com.asm
@@ -0,0 +1,79 @@
+dnl  PowerPC 750 mpn_com -- mpn bitwise one's complement
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            3.0
+C 75x (G3):        2.0
+C 7400,7410 (G4):  2.0
+C 744x,745x (G4+): 3.0
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This loop form is necessary for the claimed speed.
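+C
+C For reference, the operation is simply the following (an illustrative C
+C sketch using <stdint.h> types, with a hypothetical function name; not
+C part of this file):
+C
+C	void com_ref (uint32_t *dst, const uint32_t *src, long size)
+C	{
+C	  for (long i = 0; i < size; i++)
+C	    dst[i] = ~src[i];
+C	}
+C
+C The loop below overlaps the load of src[i+1] with the store of dst[i] to
+C hide the load latency.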
+
+ASM_START()
+PROLOGUE(mpn_com)
+
+	C r3	dst
+	C r4	src
+	C r5	size
+
+	mtctr	r5		C size
+	lwz	r5, 0(r4)	C src low limb
+
+	sub	r4, r4, r3	C src-dst
+	subi	r3, r3, 4	C dst-4
+
+	addi	r4, r4, 8	C src-dst+8
+	bdz	L(one)
+
+L(top):
+	C r3	&dst[i-1]
+	C r4	src-dst
+	C r5	src[i]
+	C r6	scratch
+
+	not	r6, r5		C ~src[i]
+	lwzx	r5, r4,r3	C src[i+1]
+
+	stwu	r6, 4(r3)	C dst[i]
+	bdnz	L(top)
+
+L(one):
+	not	r6, r5
+
+	stw	r6, 4(r3)	C dst[size-1]
+	blr
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/750/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/750/gmp-mparam.h
new file mode 100644
index 0000000..3667e85
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/750/gmp-mparam.h
@@ -0,0 +1,192 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2002, 2004, 2009, 2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* This file is used for 75x (G3) and for 7400/7410 (G4), both of which have
+   slow multiply instructions.  */
+
+/* 450 MHz PPC 7400 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     38
+#define USE_PREINV_DIVREM_1                  1
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                10
+#define MUL_TOOM33_THRESHOLD                38
+#define MUL_TOOM44_THRESHOLD                99
+#define MUL_TOOM6H_THRESHOLD               141
+#define MUL_TOOM8H_THRESHOLD               212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      66
+
+#define SQR_BASECASE_THRESHOLD               4
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                 57
+#define SQR_TOOM4_THRESHOLD                142
+#define SQR_TOOM6_THRESHOLD                173
+#define SQR_TOOM8_THRESHOLD                309
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               11
+
+#define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    220, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {      8, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     23, 9}, \
+    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
+    {     39, 8}, {     23, 9}, {     15, 8}, {     39, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     55,10}, {     31, 9}, {     63, 8}, \
+    {    127, 7}, {    255, 9}, {     71, 8}, {    143, 7}, \
+    {    287, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255, 9}, \
+    {    143, 8}, {    287,10}, {     79, 9}, {    159, 8}, \
+    {    319, 9}, {    175, 8}, {    351, 7}, {    703,10}, \
+    {     95, 9}, {    191, 8}, {    383, 9}, {    207,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159, 9}, \
+    {    319,10}, {    175, 9}, {    351, 8}, {    703,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415, 8}, {    831,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    351, 9}, \
+    {    703, 8}, {   1407,11}, {    191,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447, 9}, {    895,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    351,10}, {    703, 9}, {   1407,12}, \
+    {    191,11}, {    415,10}, {    831,11}, {    447,10}, \
+    {    895,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    575,12}, {    319,11}, {    703,10}, \
+    {   1407,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    895,10}, {   1791,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    895,11}, \
+    {   1791,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1407,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1151,12}, {   2303,13}, {   1407,14}, {    767,13}, \
+    {   1919,10}, {  15359,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 154
+#define MUL_FFT_THRESHOLD                 2688
+
+#define SQR_FFT_MODF_THRESHOLD             184  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    184, 5}, {      6, 4}, {     13, 5}, {     13, 6}, \
+    {      7, 5}, {     15, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 9}, {      7, 8}, {     15, 7}, \
+    {     31, 8}, {     19, 7}, {     39, 8}, {     27, 9}, \
+    {     15, 8}, {     39, 9}, {     23,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     63, 8}, \
+    {    127, 7}, {    255, 9}, {     71, 8}, {    143, 7}, \
+    {    287, 9}, {     79, 8}, {    159,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    143, 8}, {    287, 7}, {    575,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    175, 8}, \
+    {    351,10}, {     95, 9}, {    191, 8}, {    383, 9}, \
+    {    207,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    159, 9}, {    319,10}, {    175, 9}, {    351,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415, 8}, {    831,10}, {    223,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    287,10}, {    575,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703, 9}, {   1407,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    447,10}, {    895,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    575,12}, \
+    {    319,11}, {    703,10}, {   1407,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    895,10}, {   1791,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    575,11}, {   1215,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    895,11}, {   1791,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1535,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1151,12}, {   2431,13}, \
+    {   1407,14}, {    767,13}, {   1919,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 152
+#define SQR_FFT_THRESHOLD                 1728
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD             5240
+
+#define DC_DIV_QR_THRESHOLD                 31
+#define DC_DIVAPPR_Q_THRESHOLD             108
+#define DC_BDIV_QR_THRESHOLD                35
+#define DC_BDIV_Q_THRESHOLD                 88
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               149
+#define INV_APPR_THRESHOLD                 125
+
+#define BINV_NEWTON_THRESHOLD              156
+#define REDC_1_TO_REDC_N_THRESHOLD          39
+
+#define MU_DIV_QR_THRESHOLD                807
+#define MU_DIVAPPR_Q_THRESHOLD             807
+#define MUPI_DIV_QR_THRESHOLD               66
+#define MU_BDIV_QR_THRESHOLD               667
+#define MU_BDIV_Q_THRESHOLD                807
+
+#define MATRIX22_STRASSEN_THRESHOLD         11
+#define HGCD_THRESHOLD                      87
+#define GCD_DC_THRESHOLD                   233
+#define GCDEXT_DC_THRESHOLD                198
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               390
+#define SET_STR_PRECOMPUTE_THRESHOLD       814
diff --git a/third_party/gmp/mpn/powerpc32/750/lshift.asm b/third_party/gmp/mpn/powerpc32/750/lshift.asm
new file mode 100644
index 0000000..3a1c1a7
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/750/lshift.asm
@@ -0,0 +1,155 @@
+dnl  PowerPC 750 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       cycles/limb
+C 750:     3.0
+C 7400:    3.0
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C This code runs at the same per-limb speed as mpn/powerpc32/lshift.asm, but
+C is smaller and saves about 30 cycles of overhead.
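+C
+C For reference, the operation and return value amount to the following (an
+C illustrative C sketch using <stdint.h> types, with a hypothetical function
+C name, assuming 1 <= shift <= 31; not part of this file):
+C
+C	uint32_t lshift_ref (uint32_t *dst, const uint32_t *src,
+C	                     long size, unsigned shift)
+C	{
+C	  uint32_t retval = src[size-1] >> (32 - shift);
+C	  for (long i = size - 1; i > 0; i--)
+C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C	  dst[0] = src[0] << shift;
+C	  return retval;
+C	}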
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+
+	C r3	dst
+	C r4	src
+	C r5	size
+	C r6	shift
+
+	mtctr	r5		C size
+	slwi	r5, r5, 2	C 4*size
+
+	subfic	r7, r6, 32	C 32-shift
+	add	r4, r4, r5	C &src[size]
+
+	add	r5, r3, r5	C &dst[size]
+	lwz	r8, -4(r4)	C src[size-1]
+	bdz	L(one)
+
+	lwzu	r9, -8(r4)	C src[size-2]
+
+	srw	r3, r8, r7	C return value
+	slw	r8, r8, r6	C src[size-1] << shift
+	bdz	L(two)
+
+
+L(top):
+	C r3	return value
+	C r4	src, incrementing
+	C r5	dst, incrementing
+	C r6	lshift
+	C r7	32-shift
+	C r8	src[i+1] << shift
+	C r9	src[i]
+	C r10
+
+	lwzu	r10, -4(r4)
+	srw	r11, r9, r7
+
+	or	r8, r8, r11
+	stwu	r8, -4(r5)
+
+	slw	r8, r9, r6
+	bdz	L(odd)
+
+	C r8	src[i+1] << shift
+	C r9
+	C r10	src[i]
+
+	lwzu	r9, -4(r4)
+	srw	r11, r10, r7
+
+	or	r8, r8, r11
+	stwu	r8, -4(r5)
+
+	slw	r8, r10, r6
+	bdnz	L(top)
+
+
+L(two):
+	C r3	return value
+	C r4
+	C r5	&dst[2]
+	C r6	shift
+	C r7	32-shift
+	C r8	src[1] << shift
+	C r9	src[0]
+	C r10
+
+	srw	r11, r9, r7
+	slw	r12, r9, r6	C src[0] << shift
+
+	or	r8, r8, r11
+	stw	r12, -8(r5)	C dst[0]
+
+	stw	r8, -4(r5)	C dst[1]
+	blr
+
+
+L(odd):
+	C r3	return value
+	C r4
+	C r5	&dst[2]
+	C r6	shift
+	C r7	32-shift
+	C r8	src[1] << shift
+	C r9
+	C r10	src[0]
+
+	srw	r11, r10, r7
+	slw	r12, r10, r6
+
+	or	r8, r8, r11
+	stw	r12, -8(r5)	C dst[0]
+
+	stw	r8, -4(r5)	C dst[1]
+	blr
+
+
+L(one):
+	C r5	&dst[1]
+	C r6	shift
+	C r7	32-shift
+	C r8	src[0]
+
+	srw	r3, r8, r7	C return value
+	slw	r8, r8, r6	C src[size-1] << shift
+
+	stw	r8, -4(r5)	C dst[0]
+	blr
+
+EPILOGUE(mpn_lshift)
diff --git a/third_party/gmp/mpn/powerpc32/750/rshift.asm b/third_party/gmp/mpn/powerpc32/750/rshift.asm
new file mode 100644
index 0000000..4825fee
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/750/rshift.asm
@@ -0,0 +1,153 @@
+dnl  PowerPC 750 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       cycles/limb
+C 750:     3.0
+C 7400:    3.0
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C This code runs at the same per-limb speed as mpn/powerpc32/rshift.asm, but
+C is smaller and saves about 30 cycles of overhead.
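+C
+C For reference, the mirror image of the lshift sketch (illustrative C,
+C <stdint.h> types, hypothetical function name, assuming 1 <= shift <= 31;
+C not part of this file):
+C
+C	uint32_t rshift_ref (uint32_t *dst, const uint32_t *src,
+C	                     long size, unsigned shift)
+C	{
+C	  uint32_t retval = src[0] << (32 - shift);
+C	  for (long i = 0; i < size - 1; i++)
+C	    dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C	  dst[size-1] = src[size-1] >> shift;
+C	  return retval;
+C	}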
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+
+	C r3	dst
+	C r4	src
+	C r5	size
+	C r6	shift
+
+	mtctr	r5		C size
+	lwz	r8, 0(r4)	C src[0]
+
+	subfic	r7, r6, 32	C 32-shift
+	addi	r5, r3, -4	C dst-4
+
+	slw	r3, r8, r7	C return value
+	bdz	L(one)
+
+	lwzu	r9, 4(r4)	C src[1]
+	srw	r8, r8, r6	C src[0] >> shift
+	bdz	L(two)
+
+
+L(top):
+	C r3	return value
+	C r4	src, incrementing
+	C r5	dst, incrementing
+	C r6	shift
+	C r7	32-shift
+	C r8	src[i-1] >> shift
+	C r9	src[i]
+	C r10
+
+	lwzu	r10, 4(r4)
+	slw	r11, r9, r7
+
+	or	r8, r8, r11
+	stwu	r8, 4(r5)
+
+	srw	r8, r9, r6
+	bdz	L(odd)
+
+	C r8	src[i-1] >> shift
+	C r9
+	C r10	src[i]
+
+	lwzu	r9, 4(r4)
+	slw	r11, r10, r7
+
+	or	r8, r8, r11
+	stwu	r8, 4(r5)
+
+	srw	r8, r10, r6
+	bdnz	L(top)
+
+
+L(two):
+	C r3	return value
+	C r4
+	C r5	&dst[size-2]
+	C r6	shift
+	C r7	32-shift
+	C r8	src[size-2] >> shift
+	C r9	src[size-1]
+	C r10
+
+	slw	r11, r9, r7
+	srw	r12, r9, r6	C src[size-1] >> shift
+
+	or	r8, r8, r11
+	stw	r12, 8(r5)	C dst[size-1]
+
+	stw	r8, 4(r5)	C dst[size-2]
+	blr
+
+
+L(odd):
+	C r3	return value
+	C r4
+	C r5	&dst[size-2]
+	C r6	shift
+	C r7	32-shift
+	C r8	src[size-2] >> shift
+	C r9
+	C r10	src[size-1]
+
+	slw	r11, r10, r7
+	srw	r12, r10, r6
+
+	or	r8, r8, r11
+	stw	r12, 8(r5)	C dst[size-1]
+
+	stw	r8, 4(r5)	C dst[size-2]
+	blr
+
+
+L(one):
+	C r3	return value
+	C r4
+	C r5	dst-4
+	C r6	shift
+	C r7
+	C r8	src[0]
+
+	srw	r8, r8, r6
+
+	stw	r8, 4(r5)	C dst[0]
+	blr
+
+EPILOGUE(mpn_rshift)
diff --git a/third_party/gmp/mpn/powerpc32/README b/third_party/gmp/mpn/powerpc32/README
new file mode 100644
index 0000000..887e78b
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/README
@@ -0,0 +1,180 @@
+Copyright 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                    POWERPC 32-BIT MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 32-bit PowerPC chips.
+
+
+CODE ORGANIZATION
+
+	directory	  used for
+	================================================
+	powerpc           generic, 604, 604e, 744x, 745x
+	powerpc/750       740, 750, 7400, 7410
+
+
+The top-level powerpc directory is currently mostly aimed at 604/604e but
+should be reasonable on all powerpcs.
+
+
+
+STATUS
+
+The code is quite well optimized for the 604e; other chips have had less
+attention.
+
+Altivec SIMD available in 74xx might hold some promise, but unfortunately
+GMP only guarantees 32-bit data alignment, so there's lots of fiddling
+around with partial operations at the start and end of limb vectors.  A
+128-bit limb would be a novel idea, but is unlikely to be practical, since
+it would have to work with ordinary +, -, * etc in the C code.
+
+Also, Altivec isn't very well suited to GMP's multiplication needs.  Using
+floating-point based multiplication has much better performance
+potential for all current powerpcs, both the ones with slow integer multiply
+units (603, 740, 750, 7400, 7410) and those with fast (604, 604e, 744x,
+745x).  This is because all powerpcs do some level of pipelining in the FPU:
+
+603 and 750 can sustain one fmadd every 2nd cycle.
+604 and 604e can sustain one fmadd per cycle.
+7400 and 7410 can sustain 3 fmadd in 4 cycles.
+744x and 745x can sustain 4 fmadd in 5 cycles.
+
+
+
+REGISTER NAMES
+
+The normal powerpc convention is to give registers as plain numbers, like
+"mtctr 6", but on Apple MacOS X (powerpc*-*-rhapsody* and
+powerpc*-*-darwin*) the assembler demands an "r" like "mtctr r6".  Note
+however that when register 0 in an instruction means a literal zero, the
+"r" is omitted, for instance "lwzx r6,0,r7".
+
+The GMP code uses the "r" forms; powerpc-defs.m4 transforms them to plain
+numbers according to what GMP_ASM_POWERPC_R_REGISTERS finds is needed.
+(Note that this style isn't fully general, as the identifier r4 and the
+register r4 will not be distinguishable on some systems.  However, this is
+not a problem for the limited GMP assembly usage.)
+
+
+
+GLOBAL REFERENCES
+
+Linux non-PIC
+	lis	9, __gmp_binvert_limb_table@ha
+	rlwinm	11, 5, 31, 25, 31
+	la	9, __gmp_binvert_limb_table@l(9)
+	lbzx	11, 9, 11
+
+Linux PIC (FIXME)
+.LCL0:
+	.long .LCTOC1-.LCF0
+	bcl	20, 31, .LCF0
+.LCF0:
+	mflr	30
+	lwz	7, .LCL0-.LCF0(30)
+	add	30, 7, 30
+	lwz	11, .LC0-.LCTOC1(30)
+	rlwinm	3, 5, 31, 25, 31
+	lbzx	7, 11, 3
+
+AIX (always PIC)
+LC..0:
+	.tc __gmp_binvert_limb_table[TC],__gmp_binvert_limb_table[RW]
+	lwz	9, LC..0(2)
+	rlwinm	0, 5, 31, 25, 31
+	lbzx	0, 9, 0
+
+Darwin (non-PIC)
+	lis	r2, ha16(___gmp_binvert_limb_table)
+	rlwinm	r9, r5, 31, 25, 31
+	la	r2, lo16(___gmp_binvert_limb_table)(r2)
+	lbzx	r0, r2, r9
+Darwin (PIC)
+	mflr	r0
+	bcl	20, 31, L0001$pb
+L0001$pb:
+	mflr	r7
+	mtlr	r0
+	addis	r2, r7, ha16(L___gmp_binvert_limb_table$non_lazy_ptr-L0001$pb)
+	rlwinm	r9, r5, 31, 25, 31
+	lwz	r2, lo16(L___gmp_binvert_limb_table$non_lazy_ptr-L0001$pb)(r2)
+	lbzx	r0, r2, r9
+------
+	.non_lazy_symbol_pointer
+L___gmp_binvert_limb_table$non_lazy_ptr:
+	.indirect_symbol ___gmp_binvert_limb_table
+	.long	0
+	.subsections_via_symbols
+
+
+For GNU/Linux and Darwin, we might want to duplicate __gmp_binvert_limb_table
+into the text section in this file.  We should thus be able to reach it like
+this:
+
+	bl	L0
+L0:	mflr	r2
+	rlwinm	r9, r5, 31, 25, 31
+	addi	r9, r9, lo16(local_binvert_table-L0)
+	lbzx	r0, r2, r9
+
+
+
+REFERENCES
+
+PowerPC Microprocessor Family: The Programming Environments for 32-bit
+Microprocessors, IBM document G522-0290-01, 2000.
+
+PowerPC 604e RISC Microprocessor User's Manual with Supplement for PowerPC
+604 Microprocessor, IBM document G552-0330-00, Freescale document
+MPC604EUM/AD, 3/1998.
+
+MPC7410/MPC7400 RISC Microprocessor User's Manual, Freescale document
+MPC7400UM/D, rev 1, 11/2002.
+
+MPC7450 RISC Microprocessor Family Reference Manual, Freescale document
+MPC7450UM, rev 5, 1/2005.
+
+The above are available online from
+
+	http://www.ibm.com/chips/techlib/techlib.nsf/productfamilies/PowerPC
+	http://www.freescale.com/PowerPC
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/powerpc32/addlsh1_n.asm b/third_party/gmp/mpn/powerpc32/addlsh1_n.asm
new file mode 100644
index 0000000..71645c3
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/addlsh1_n.asm
@@ -0,0 +1,100 @@
+dnl  PowerPC-32 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+
+dnl  Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            4.0
+C 75x (G3):        5.0
+C 7400,7410 (G4):  5.0
+C 744x,745x (G4+): 5.0
+C power4/ppc970:   4.25
+C power5:          5.0
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+define(`rp',`r3')
+define(`up',`r4')
+define(`vp',`r5')
+
+define(`s0',`r6')
+define(`s1',`r7')
+define(`u0',`r8')
+define(`v0',`r10')
+define(`v1',`r11')
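+
+C For reference, with B = 2^32 the function computes
+C {rp,n} = ({up,n} + 2*{vp,n}) mod B^n and returns the carry-out, 0, 1 or 2
+C (an illustrative C sketch using <stdint.h> types, with a hypothetical
+C function name; not part of this file):
+C
+C	uint32_t addlsh1_n_ref (uint32_t *rp, const uint32_t *up,
+C	                        const uint32_t *vp, long n)
+C	{
+C	  uint32_t cy = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t s = (uint64_t) up[i] + 2 * (uint64_t) vp[i] + cy;
+C	      rp[i] = (uint32_t) s;
+C	      cy = (uint32_t) (s >> 32);
+C	    }
+C	  return cy;
+C	}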
+
+ASM_START()
+PROLOGUE(mpn_addlsh1_n)
+	mtctr	r6		C copy n in ctr
+	addic	r31, r31, 0	C clear cy
+
+	lwz	v0, 0(vp)	C load v limb
+	lwz	u0, 0(up)	C load u limb
+	addi	up, up, -4	C update up
+	addi	rp, rp, -4	C update rp
+	slwi	s1, v0, 1
+	bdz	L(end)		C If done, skip loop
+
+L(loop):
+	lwz	v1, 4(vp)	C load v limb
+	adde	s1, s1, u0	C add limbs with cy, set cy
+	srwi	s0, v0, 31	C shift down previous v limb
+	stw	s1, 4(rp)	C store result limb
+	lwzu	u0, 8(up)	C load u limb and update up
+	rlwimi	s0, v1, 1, 0,30	C left shift v limb and merge with prev v limb
+
+	bdz	L(exit)		C decrement ctr and exit if done
+
+	lwzu	v0, 8(vp)	C load v limb and update vp
+	adde	s0, s0, u0	C add limbs with cy, set cy
+	srwi	s1, v1, 31	C shift down previous v limb
+	stwu	s0, 8(rp)	C store result limb and update rp
+	lwz	u0, 4(up)	C load u limb
+	rlwimi	s1, v0, 1, 0,30	C left shift v limb and merge with prev v limb
+
+	bdnz	L(loop)		C decrement ctr and loop back
+
+L(end):	adde	r7, s1, u0
+	srwi	r4, v0, 31
+	stw	r7, 4(rp)	C store last result limb
+	addze	r3, r4
+	blr
+L(exit):
+	adde	r7, s0, u0
+	srwi	r4, v1, 31
+	stw	r7, 8(rp)	C store last result limb
+	addze	r3, r4
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/addmul_1.asm b/third_party/gmp/mpn/powerpc32/addmul_1.asm
new file mode 100644
index 0000000..07486df
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/addmul_1.asm
@@ -0,0 +1,159 @@
+dnl  PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            6.75
+C 75x (G3):        8.7-14.3
+C 7400,7410 (G4):  8.7-14.3
+C 744x,745x (G4+): 9.5
+C power4/ppc970:   6.25
+C power5:          6.25
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C vl	r6
+
+C This is optimized for the PPC604.  It has not been tuned for other
+C PowerPC processors.
+C
+C Loop Analysis for the 604:
+C 12 mem insn
+C 8 serializing insn
+C 8 int multiply
+C 25 int reg write
+C 9 int ops (8 of which serialize)
+C
+C The multiply insns need 16 cycles/4limb.
+C The integer register writes will need 13 cycles/4limb.
+C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604,
+C but that will require some clever FPNOPS and BNOPS for exact
+C issue control.
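+C
+C For reference, the function computes {rp,n} += {up,n} * vl and returns the
+C carry-out limb (an illustrative C sketch using <stdint.h> types, with a
+C hypothetical function name; not part of this file):
+C
+C	uint32_t addmul_1_ref (uint32_t *rp, const uint32_t *up,
+C	                       long n, uint32_t vl)
+C	{
+C	  uint32_t cy = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t p = (uint64_t) up[i] * vl + rp[i] + cy;
+C	      rp[i] = (uint32_t) p;
+C	      cy = (uint32_t) (p >> 32);
+C	    }
+C	  return cy;
+C	}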
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	cmpwi	cr0,r5,9	C more than 9 limbs?
+	bgt	cr0,L(big)	C branch if more than 9 limbs
+
+	mtctr	r5
+	lwz	r0,0(r4)
+	mullw	r7,r0,r6
+	mulhwu	r10,r0,r6
+	lwz	r9,0(r3)
+	addc	r8,r7,r9
+	addi	r3,r3,-4
+	bdz	L(end)
+L(loop):
+	lwzu	r0,4(r4)
+	stwu	r8,4(r3)
+	mullw	r8,r0,r6
+	adde	r7,r8,r10
+	mulhwu	r10,r0,r6
+	lwz	r9,4(r3)
+	addze	r10,r10
+	addc	r8,r7,r9
+	bdnz	L(loop)
+L(end):	stw	r8,4(r3)
+	addze	r3,r10
+	blr
+
+L(big):	stwu	r1,-16(r1)
+	addi	r5,r5,-1
+	stw	r30,8(r1)
+	srwi	r0,r5,2
+	stw	r31,12(r1)
+	mtctr	r0
+
+	lwz	r7,0(r4)
+	mullw	r8,r7,r6
+	mulhwu	r0,r7,r6
+	lwz	r7,0(r3)
+	addc	r8,r8,r7
+	stw	r8,0(r3)
+
+L(loopU):
+	lwz	r7,4(r4)
+	lwz	r12,8(r4)
+	lwz	r30,12(r4)
+	lwzu	r31,16(r4)
+	mullw	r8,r7,r6
+	mullw	r9,r12,r6
+	mullw	r10,r30,r6
+	mullw	r11,r31,r6
+	adde	r8,r8,r0	C add cy_limb
+	mulhwu	r0,r7,r6
+	lwz	r7,4(r3)
+	adde	r9,r9,r0
+	mulhwu	r0,r12,r6
+	lwz	r12,8(r3)
+	adde	r10,r10,r0
+	mulhwu	r0,r30,r6
+	lwz	r30,12(r3)
+	adde	r11,r11,r0
+	mulhwu	r0,r31,r6
+	lwz	r31,16(r3)
+	addze	r0,r0		C new cy_limb
+	addc	r8,r8,r7
+	stw	r8,4(r3)
+	adde	r9,r9,r12
+	stw	r9,8(r3)
+	adde	r10,r10,r30
+	stw	r10,12(r3)
+	adde	r11,r11,r31
+	stwu	r11,16(r3)
+	bdnz	L(loopU)
+
+	andi.	r31,r5,3
+	mtctr	r31
+	beq	cr0,L(endx)
+
+L(loopE):
+	lwzu	r7,4(r4)
+	mullw	r8,r7,r6
+	adde	r8,r8,r0	C add cy_limb
+	mulhwu	r0,r7,r6
+	lwz	r7,4(r3)
+	addze	r0,r0		C new cy_limb
+	addc	r8,r8,r7
+	stwu	r8,4(r3)
+	bdnz	L(loopE)
+L(endx):
+	addze	r3,r0
+	lwz	r30,8(r1)
+	lwz	r31,12(r1)
+	addi	r1,r1,16
+	blr
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/powerpc32/aix.m4 b/third_party/gmp/mpn/powerpc32/aix.m4
new file mode 100644
index 0000000..fde2020
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/aix.m4
@@ -0,0 +1,82 @@
+divert(-1)
+dnl  m4 macros for AIX 32-bit assembly.
+
+dnl  Copyright 2000-2002, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`ASM_START',
+`	.toc')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  Don't want ELF style .size in the epilogue.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+	`
+	.globl	$1
+	.globl	.$1
+	.csect	[DS], 2
+$1:
+	.long	.$1, TOC[tc0], 0
+	.csect	[PR]
+	.align	2
+.$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`')
+
+define(`TOC_ENTRY', `')
+
+define(`LEA',
+m4_assert_numargs(2)
+`define(`TOC_ENTRY',
+`	.toc
+tc$2:
+	.tc	$2[TC], $2')'
+`	lwz	$1, tc$2(2)')
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`	.globl	$1')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`	.csect	[RO], 3
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`ASM_END', `TOC_ENTRY')
+
+divert
diff --git a/third_party/gmp/mpn/powerpc32/aors_n.asm b/third_party/gmp/mpn/powerpc32/aors_n.asm
new file mode 100644
index 0000000..25ece09
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/aors_n.asm
@@ -0,0 +1,157 @@
+dnl  PowerPC-32 mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2002, 2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C 603e:                  ?
+C 604e:                  ?		old: 3.25
+C 75x (G3):              ?		old: 3.5
+C 7400,7410 (G4):        3.25
+C 744x,745x (G4+):       4
+C POWER3/PPC630          2
+C POWER4/PPC970          2.4
+C POWER5                 2.75
+C POWER6               40-140
+C POWER7                 3
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`vp',	`r5')
+define(`n',	`r6')
+define(`cy',	`r7')
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBC,	adde)
+	define(func,	mpn_add_n)
+	define(func_nc,	mpn_add_nc)
+	define(IFADD,	`$1')
+	define(IFSUB,	`')')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBC,	subfe)
+	define(func,	mpn_sub_n)
+	define(func_nc,	mpn_sub_nc)
+	define(IFADD,	`')
+	define(IFSUB,	`$1')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+
+PROLOGUE(func_nc)
+IFADD(`	addic	r0, cy, -1')		C set carry from argument
+IFSUB(`	subfic	r0, cy, 0')		C set carry from argument
+	b	L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+IFADD(`	addic	r0, n, 0')		C clear carry
+IFSUB(`	addic	r0, n, -1')		C set carry
+L(ent):	andi.	r0, n, 3
+	addi	r3, r3, -12
+	addi	n, n, 1
+	cmpwi	cr7, r0, 2
+	srwi	r0, n, 2
+	sub	r4, r4, r3
+	sub	r5, r5, r3
+	mtctr	r0
+	bne	cr0, L(n00)
+
+	lwzx	r7, r4, r3		C n = 4, 8, 12, ...
+	lwzx	r8, r5, r3
+	addi	r3, r3, 4
+	lwzx	r9, r4, r3
+	ADCSBC	r7, r8, r7
+	lwzx	r10, r5, r3
+	addi	r3, r3, 4
+	b	L(00)
+
+L(n00):	bge	cr7, L(n01)
+	cmpwi	cr0, r0, 0		C n = 1, 5, 9, 13, ...
+	lwzx	r0, r4, r3
+	lwzx	r6, r5, r3
+	addi	r3, r3, 4
+	ADCSBC	r0, r6, r0
+	ble	L(ret)
+L(gt1):	lwzx	r7, r4, r3
+	lwzx	r8, r5, r3
+	addi	r3, r3, 4
+	b	L(01)
+
+L(n10):
+	lwzx	r9, r4, r3		C n = 3, 7, 11, 15, ...
+	lwzx	r10, r5, r3
+	addi	r3, r3, 4
+	lwzx	r11, r4, r3
+	ADCSBC	r9, r10, r9
+	lwzx	r12, r5, r3
+	addi	r3, r3, 4
+	b	L(11)
+
+L(n01):	bne	cr7, L(n10)
+	cmpwi	cr0, r0, 0		C n = 2, 6, 10, 14, ...
+	lwzx	r11, r4, r3
+	lwzx	r12, r5, r3
+	addi	r3, r3, 4
+	lwzx	r0, r4, r3
+	ADCSBC	r11, r12, r11
+	lwzx	r6, r5, r3
+	addi	r3, r3, 4
+	ble	cr0, L(end)
+
+
+L(lp):	lwzx	r7, r4, r3
+	ADCSBC	r0, r6, r0
+	lwzx	r8, r5, r3
+	stwu	r11, 4(r3)
+L(01):	lwzx	r9, r4, r3
+	ADCSBC	r7, r8, r7
+	lwzx	r10, r5, r3
+	stwu	r0, 4(r3)
+L(00):	lwzx	r11, r4, r3
+	ADCSBC	r9, r10, r9
+	lwzx	r12, r5, r3
+	stwu	r7, 4(r3)
+L(11):	lwzx	r0, r4, r3
+	ADCSBC	r11, r12, r11
+	lwzx	r6, r5, r3
+	stwu	r9, 4(r3)
+	bdnz	L(lp)
+
+L(end):	ADCSBC	r0, r6, r0
+	stw	r11, 4(r3)
+L(ret):	stw	r0, 8(r3)
+IFADD(`	li	r3, 0	')
+IFADD(`	addze	r3, r3	')
+IFSUB(`	subfe	r3, r0, r0')
+IFSUB(`	neg	r3, r3')
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/bdiv_dbm1c.asm b/third_party/gmp/mpn/powerpc32/bdiv_dbm1c.asm
new file mode 100644
index 0000000..72b2c48
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/bdiv_dbm1c.asm
@@ -0,0 +1,131 @@
+dnl  PPC32 mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            ?
+C 75x (G3):        ?
+C 7400,7410 (G4):  9.43
+C 744x,745x (G4+): 6.28
+C power4/ppc970:   ?
+C power5:          ?
+
+C TODO
+C  * Nothing to do...
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`bd', `r6')
+define(`cy', `r7')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+	lwz	r0, 0(r4)
+
+	rlwinm.	r12, r5, 0,30,31
+	cmplwi	cr6, r12, 2
+	cmplwi	cr7, r5, 4
+	addi	r5, r5, 1
+	srwi	r5, r5, 2
+	mtctr	r5
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	mullw	r5, r0, r6
+	mulhwu	r12, r0, r6
+	lwz	r0, 4(r4)
+	addi	r4, r4, -12
+	addi	r3, r3, -12
+	b	L(3)
+
+L(b00):	mullw	r9, r0, r6
+	mulhwu	r8, r0, r6
+	lwz	r0, 4(r4)
+	addi	r4, r4, -8
+	addi	r3, r3, -8
+	b	L(0)
+
+L(b01):	mullw	r5, r0, r6
+	mulhwu	r12, r0, r6
+	addi	r3, r3, -4
+	ble	cr7, L(e1)
+	lwz	r0, 4(r4)
+	addi	r4, r4, -4
+	b	L(1)
+
+L(b10):	mullw	r9, r0, r6
+	mulhwu	r8, r0, r6
+	lwz	r0, 4(r4)
+	ble	cr7, L(e2)
+
+	ALIGN(16)
+L(top):	mullw	r5, r0, r6
+	mulhwu	r12, r0, r6
+	subfc	r11, r9, r7
+	lwz	r0, 8(r4)
+	subfe	r7, r8, r11
+	stw	r11, 0(r3)
+L(1):	mullw	r9, r0, r6
+	mulhwu	r8, r0, r6
+	subfc	r11, r5, r7
+	lwz	r0, 12(r4)
+	subfe	r7, r12, r11
+	stw	r11, 4(r3)
+L(0):	mullw	r5, r0, r6
+	mulhwu	r12, r0, r6
+	subfc	r11, r9, r7
+	lwz	r0, 16(r4)
+	subfe	r7, r8, r11
+	stw	r11, 8(r3)
+L(3):	mullw	r9, r0, r6
+	mulhwu	r8, r0, r6
+	subfc	r11, r5, r7
+	lwz	r0, 20(r4)
+	subfe	r7, r12, r11
+	stw	r11, 12(r3)
+	addi	r4, r4, 16
+	addi	r3, r3, 16
+	bdnz	L(top)
+
+L(e2):	mullw	r5, r0, r6
+	mulhwu	r12, r0, r6
+	subfc	r11, r9, r7
+	subfe	r7, r8, r11
+	stw	r11, 0(r3)
+L(e1):	subfc	r11, r5, r7
+	stw	r11, 4(r3)
+	subfe	r3, r12, r11
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/darwin.m4 b/third_party/gmp/mpn/powerpc32/darwin.m4
new file mode 100644
index 0000000..db42268
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/darwin.m4
@@ -0,0 +1,91 @@
+divert(-1)
+dnl  m4 macros for Mac OS 32-bit assembly.
+
+dnl  Copyright 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+	.text
+	.globl	$1
+	.align	3
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1))
+
+
+dnl  LEA -- Load Effective Address.
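+dnl
+dnl  In the PIC case the current address is taken with "bcl 20, 31, 1f",
+dnl  the "branch always" form which is assumed not to disturb the
+dnl  processor's link register prediction stack the way a plain bl would.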
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',
+`	mflr	r0			C save return address
+	bcl	20, 31, 1f
+1:	mflr	$1
+	addis	$1, $1, ha16($2-1b)
+	la	$1, lo16($2-1b)($1)
+	mtlr	r0			C restore return address
+',`
+	lis	$1, ha16($2)
+	la	$1, lo16($2)($1)
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`	.const
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`ASM_END', `dnl')
+
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+divert
diff --git a/third_party/gmp/mpn/powerpc32/diveby3.asm b/third_party/gmp/mpn/powerpc32/diveby3.asm
new file mode 100644
index 0000000..288a7d3
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/diveby3.asm
@@ -0,0 +1,93 @@
+dnl  PowerPC-32 mpn_divexact_by3 -- mpn by 3 exact division
+
+dnl  Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:              ?
+C 604e:              5
+C 75x (G3):          ?
+C 7400,7410 (G4):    8
+C 744x,745x (G4+):   6
+C power4/ppc970:    12
+C power5:            ?
+
+C void mpn_divexact_by3 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C We avoid the slow subfe instruction and instead rely on an extremely unlikely
+C branch.
+C
+C The mullw has the inverse in the first operand, since 0xAA..AB won't allow
+C any early-out.  The src[] data normally won't either, but there's at least
+C a chance, whereas 0xAA..AB never will.  If, for instance, src[] is all
+C zeros (not a sensible input of course) we run at 7.0 c/l on ppc750.
+C
+C The mulhwu has the "3" multiplier in the second operand, which lets 750 and
+C 7400 use an early-out.
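+C
+C As an illustration of why this works (C-language sketch, 32-bit limbs
+C assumed): 3 * 0xAAAAAAAB = 0x200000001 == 1 (mod 2^32), so for any x
+C that is an exact multiple of 3,
+C
+C	(uint32_t) (x * 0xAAAAAAABUL) == x/3
+C
+C e.g. 9 * 0xAAAAAAAB = 0x600000003 == 3 (mod 2^32).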
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`cy', `r6')
+
+ASM_START()
+PROLOGUE(mpn_divexact_by3c)
+	lwz	r11, 0(up)
+	mtctr	n
+	lis	r12, 0xAAAA
+	ori	r12, r12, 0xAAAB
+	li	r10, 3
+
+	cmplw	cr7, cy, r11
+	subf	r11, cy, r11
+
+	mullw	r0, r11, r12
+	stw	r0, 0(rp)
+	bdz	L(one)
+
+L(top):	lwzu	r9, 4(up)
+	mulhwu	r7, r0, r10
+	bgt-	cr7, L(adj)		C very unlikely branch
+L(bko):	cmplw	cr7, r7, r9
+	subf	r0, r7, r9
+	mullw	r0, r12, r0
+	stwu	r0, 4(rp)
+	bdnz	L(top)
+
+L(one):	mulhwu	r3, r0, r10
+	blelr+	cr7
+	addi	r3, r3, 1
+	blr
+
+L(adj):	addi	r7, r7, 1
+	b	L(bko)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc32/divrem_2.asm b/third_party/gmp/mpn/powerpc32/divrem_2.asm
new file mode 100644
index 0000000..74423f4
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/divrem_2.asm
@@ -0,0 +1,182 @@
+dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008, 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C		norm	frac
+C 7410		~36.5	~36.5
+C 744x, 745x	 29	 29
+
+C INPUT PARAMETERS
+C qp  = r3
+C fn  = r4
+C up  = r5
+C un  = r6
+C d   = r7
+
+C TODO
+C  * Decrease register usage.
+C  * Make sure mul operands are optimal for early-out.
+C  * Check that things work well for a shared library build.
+C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
+C    least vastly improve the current __udiv_qrnnd_c based code.
+
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+	stwu	r1, -32(r1)
+	slwi	r0, r6, 2
+	add	r5, r5, r0
+	stmw	r28, 8(r1)
+	addi	r29, r5, -8		C up = up_param + un - 2
+	lwz	r10, 4(r7)
+	lwz	r12, 4(r29)
+	addi	r8, r3, -12
+	lwz	r7, 0(r7)
+	cmplw	cr7, r12, r10
+	lwz	r28, 0(r29)
+	blt-	cr7, L(2)
+	bgt+	cr7, L(4)
+	cmplw	cr7, r28, r7
+	blt-	cr7, L(2)
+L(4):	subfc	r28, r7, r28
+	subfe	r12, r10, r12
+	li	r3, 1
+	b	L(6)
+L(2):	li	r3, 0
+
+L(6):	add	r0, r4, r6
+	addic.	r30, r0, -2
+	ble-	cr0, L(ret)
+
+	slwi	r9, r0, 2
+	add	r8, r8, r9		C rp += un + fn
+	mtctr	r30
+
+C Compute di from d1
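+C (di here is assumed to be the usual normalized reciprocal,
+C di = floor((B^2-1)/d1) - B with B = 2^32, built 16 bits at a time with
+C divwu in the __udiv_qrnnd_c style mentioned in the TODO above.)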
+	srwi	r11, r10, 16
+	nor	r0, r10, r10
+	divwu	r31, r0, r11
+	rlwinm	r5, r10, 0, 16, 31
+	mullw	r9, r11, r31
+	mullw	r6, r5, r31
+	subf	r0, r9, r0
+	slwi	r0, r0, 16
+	ori	r0, r0, 65535
+	cmplw	cr7, r0, r6
+	bge-	cr7, L(9)
+	add	r0, r0, r10
+	cmplw	cr7, r0, r10
+	cmplw	cr6, r6, r0
+	addi	r31, r31, -1		C q1--
+	crorc	28, 28, 25
+	blt+	cr7, L(9)
+	addi	r31, r31, -1		C q1--
+	add	r0, r0, r10
+L(9):	subf	r0, r6, r0
+	divwu	r6, r0, r11
+	mullw	r9, r11, r6
+	mullw	r11, r5, r6
+	subf	r0, r9, r0
+	slwi	r0, r0, 16
+	ori	r0, r0, 65535
+	cmplw	cr7, r0, r11
+	bge-	cr7, L(13)
+	add	r0, r0, r10
+	cmplw	cr7, r0, r10
+	cmplw	cr6, r11, r0
+	addi	r6, r6, -1		C q0--
+	crorc	28, 28, 25
+	blt+	cr7, L(13)
+C	add	r0, r0, r10		C final remainder
+	addi	r6, r6, -1		C q0--
+L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble final quotient
+
+C Adjust di by including d0
+	mullw	r9, r10, r6		C t0 = LO(di * d1)
+	addc	r11, r9, r7
+	subfe	r0, r1, r1
+	mulhwu	r9, r6, r7		C s1 = HI(di * d0)
+	addc	r9, r11, r9
+	addze.	r0, r0
+	blt	cr0, L(17)
+L(18):	subfc	r9, r10, r9
+	addi	r6, r6, -1
+	addme.	r0, r0
+	bge+	cr0, L(18)
+L(17):
+
+C r0  r3  r4  r5  r6  r7  r8  r9 r10 r11 r12 r28 r29 r30 r31
+C     msl         di  d0  qp     d1          fn  up  un
+L(loop):
+	mullw	r0, r12, r6		C q0 = LO(n2 * di)
+	cmpw	cr7, r30, r4
+	addc	r31, r0, r28		C q0 += n1
+	mulhwu	r9, r12, r6		C q  = HI(n2 * di)
+	adde	r12, r9, r12		C q  += n2
+	addi	r30, r30, -1
+	mullw	r0, r10, r12		C d1 * q
+	li	r9, 0
+	subf	r0, r0, r28		C n1 -= d1 * q
+	addi	r5, r12, 1
+	ble-	cr7, L(23)
+	lwzu	r9, -4(r29)
+L(23):	mullw	r11, r12, r7		C t0 = LO(d0 * q)
+	subfc	r28, r7, r9		C n0 -= d0
+	subfe	r0, r10, r0		C n1 -= d1
+	mulhwu	r12, r12, r7		C t1 = HI(d0 * q)
+	subfc	r28, r11, r28		C n0 -= t0
+	subfe	r12, r12, r0		C n1 -= t1
+	cmplw	cr7, r12, r31
+	blt+	cr7, L(24)
+	addc	r28, r28, r7
+	adde	r12, r12, r10
+	addi	r5, r5, -1
+L(24):	cmplw	cr7, r12, r10
+	bge-	cr7, L(fix)
+L(bck):	stw	r5, 0(r8)
+	addi	r8, r8, -4
+	bdnz	L(loop)
+
+L(ret):	stw	r28, 0(r29)
+	stw	r12, 4(r29)
+	lmw	r28, 8(r1)
+	addi	r1, r1, 32
+	blr
+
+L(fix):	cmplw	cr6, r28, r7
+	bgt+	cr7, L(28)
+	blt-	cr6, L(bck)
+L(28):	subfc	r28, r7, r28
+	subfe	r12, r10, r12
+	addi	r5, r5, 1
+	b	L(bck)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/eabi.m4 b/third_party/gmp/mpn/powerpc32/eabi.m4
new file mode 100644
index 0000000..cd7633c
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/eabi.m4
@@ -0,0 +1,86 @@
+divert(-1)
+dnl  m4 macros for powerpc32 eABI assembly.
+
+dnl  Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+	`
+	.section	".text"
+	.align	3
+	.globl	$1
+	.type	$1, @function
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.size	$1, .-$1')
+
+dnl  This ought to support PIC, but it is unclear how that is done for eABI
+define(`LEA',
+m4_assert_numargs(2)
+`
+	lis	$1, $2@ha
+	la	$1, $2@l($1)
+')
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+	.section	.rodata
+	ALIGN(ifelse($#,1,2,$2))
+	.type	$1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	.size	$1, .-$1')
+
+define(`ASM_END', `dnl')
+
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+dnl  64-bit "long long" parameters are put in an even-odd pair, skipping an
+dnl  even register if that was in turn.  I wish somebody could explain why that
+dnl  is a good idea.
+define(`BROKEN_LONGLONG_PARAM')
+
+divert
diff --git a/third_party/gmp/mpn/powerpc32/elf.m4 b/third_party/gmp/mpn/powerpc32/elf.m4
new file mode 100644
index 0000000..1ed9c12
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/elf.m4
@@ -0,0 +1,100 @@
+divert(-1)
+dnl  m4 macros for powerpc32 GNU/Linux assembly.
+
+dnl  Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`ASM_START',`')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+	.section	".text"
+	.align	3
+	.globl	$1
+	.type	$1, @function
+$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`	.size	$1, .-$1')
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+	mflr	r0
+	bcl	20, 31, 1f
+1:	mflr	$1
+	addis	$1, $1, (_GLOBAL_OFFSET_TABLE_-1b)@ha
+	addi	$1, $1, (_GLOBAL_OFFSET_TABLE_-1b)@l
+	mtlr	r0
+	lwz	$1, $2@got($1)
+',`
+	lis	$1, $2@ha
+	la	$1, $2@l($1)
+')')
+
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+	.section	.rodata
+	ALIGN(ifelse($#,1,2,$2))
+	.type	$1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	.size	$1, .-$1')
+
+define(`ASM_END', `dnl')
+
+ifdef(`PIC',`
+define(`PIC_SLOW')')
+
+dnl  64-bit "long long" parameters are put in an even-odd register pair,
+dnl  skipping an even register if that one was next in turn.  I wish
+dnl  somebody could explain why that is a good idea.
+define(`BROKEN_LONGLONG_PARAM')
+
+divert
diff --git a/third_party/gmp/mpn/powerpc32/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/gmp-mparam.h
new file mode 100644
index 0000000..e835a39
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2010, 2014, 2015 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* This file is supposed to be used for 604, 604e, 744x/745x/747x (G4+), i.e.,
+   32-bit PowerPC processors with reasonably fast integer multiply insns.  The
+   values below are chosen to be best for the latter processors, since 604 is
+   largely irrelevant today.
+
+   In mpn/powerpc32/750/gmp-mparam.h there are values for 75x (G3) and for
+   7400/7410 (G4), both of which have much slower multiply instructions.  */
+
+/* 1417 MHz PPC 7447A */
+/* FFT tuning limit = 15 M */
+/* Generated by tuneup.c, 2015-10-08, gcc 4.6 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        45
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     18
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           69
+
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               106
+#define MUL_TOOM6H_THRESHOLD               156
+#define MUL_TOOM8H_THRESHOLD               236
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      71
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      72
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      82
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 22
+#define SQR_TOOM3_THRESHOLD                 74
+#define SQR_TOOM4_THRESHOLD                142
+#define SQR_TOOM6_THRESHOLD                190
+#define SQR_TOOM8_THRESHOLD                333
+
+#define MULMID_TOOM42_THRESHOLD             32
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             284  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    284, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     20, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 9}, \
+    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
+    {     39, 8}, {     23, 7}, {     47, 8}, {     27, 9}, \
+    {     15, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95,10}, {     31, 9}, \
+    {     71, 8}, {    143, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135, 8}, {    271, 9}, {    143,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    175,10}, \
+    {     95, 9}, {    191, 8}, {    383, 9}, {    207, 8}, \
+    {    415,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159, 9}, {    319,10}, {    175,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415, 8}, {    831,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,10}, {    479, 9}, {    959,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    639,11}, \
+    {    351,10}, {    703, 9}, {   1407,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,11}, \
+    {    447,10}, {    895,11}, {    479,10}, {    959,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    639,10}, {   1279,11}, \
+    {    703,10}, {   1407,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1279,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1599,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1407,14}, {    767,13}, {   1535,12}, \
+    {   3071,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 164
+#define MUL_FFT_THRESHOLD                 3712
+
+#define SQR_FFT_MODF_THRESHOLD             248  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    248, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     17, 7}, {      9, 6}, {     20, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 9}, {      7, 8}, {     15, 7}, \
+    {     33, 8}, {     19, 7}, {     39, 8}, {     27, 9}, \
+    {     15, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 7}, {    511, 9}, {    143,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    175, 8}, \
+    {    351,10}, {     95, 9}, {    191, 8}, {    383, 9}, \
+    {    207, 8}, {    415, 7}, {    831,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415, 8}, {    831,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    415, 9}, {    831,11}, {    223,10}, {    447, 9}, \
+    {    895,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    639,11}, \
+    {    351,10}, {    703, 9}, {   1407,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,11}, \
+    {    447,10}, {    895,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    703,10}, \
+    {   1407,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1279,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1599,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3199,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD                 2688
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  50
+#define MULLO_MUL_N_THRESHOLD             6633
+#define SQRLO_BASECASE_THRESHOLD             4
+#define SQRLO_DC_THRESHOLD                 115
+#define SQRLO_SQR_THRESHOLD               5274
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             141
+#define DC_BDIV_QR_THRESHOLD                51
+#define DC_BDIV_Q_THRESHOLD                120
+
+#define INV_MULMOD_BNM1_THRESHOLD           43
+#define INV_NEWTON_THRESHOLD               173
+#define INV_APPR_THRESHOLD                 156
+
+#define BINV_NEWTON_THRESHOLD              204
+#define REDC_1_TO_REDC_N_THRESHOLD          51
+
+#define MU_DIV_QR_THRESHOLD               1017
+#define MU_DIVAPPR_Q_THRESHOLD            1078
+#define MUPI_DIV_QR_THRESHOLD               84
+#define MU_BDIV_QR_THRESHOLD               872
+#define MU_BDIV_Q_THRESHOLD               1078
+
+#define POWM_SEC_TABLE  1,16,102,428,1378
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               781
+#define SET_STR_PRECOMPUTE_THRESHOLD      1505
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         12
+#define HGCD_THRESHOLD                     118
+#define HGCD_APPR_THRESHOLD                161
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   351
+#define GCDEXT_DC_THRESHOLD                273
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/powerpc32/invert_limb.asm b/third_party/gmp/mpn/powerpc32/invert_limb.asm
new file mode 100644
index 0000000..612bfe5
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/invert_limb.asm
@@ -0,0 +1,142 @@
+dnl  PowerPC-32 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		 cycles/limb
+C 603e:		      ?
+C 604e:		      ?
+C 75x (G3):	      ?
+C 7400,7410 (G4):     ?
+C 744x,745x (G4+):   32
+C power4/ppc970:      ?
+C power5:	      ?
+
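+C mpn_invert_limb returns floor((B^2-1)/d) - B for a normalized divisor
+C d (high bit set), with B = 2^32.  approx_tab below holds 512 16-bit
+C initial approximations indexed by divisor bits 30..22, which the code
+C refines to the full 32-bit reciprocal.
+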
+EXTERN(approx_tab)
+
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+	rlwinm	r6, r3, 11, 22, 30	C extract bits 30..22 to pos 2^1
+	srwi	r10, r3, 11		C extract bits 31..11
+	LEA(	r9, approx_tab)		C N.B. clobbers r0 for ELF and Darwin
+	lhzx	r9, r9, r6		C w2
+	addi	r0, r10, 1
+	mullw	r11, r9, r9
+	slwi	r9, r9, 4
+	mulhwu	r7, r11, r0
+	rlwinm	r11, r3, 0, 31, 31	C extract bit 0
+	addi	r0, r9, -1
+	srwi	r9, r3, 1		C d >> 1
+	subf	r0, r7, r0		C w1
+	add	r9, r9, r11		C d31
+	mullw	r9, r0, r9		C w1 * d31
+	srwi	r10, r0, 1		C w1 >> 1
+	neg	r11, r11
+	and	r11, r10, r11
+	subf	r11, r9, r11
+	mulhwu	r9, r11, r0
+	slwi	r0, r0, 15
+	srwi	r9, r9, 1
+	add	r0, r9, r0		C w0
+	mullw	r10, r0, r3
+	mulhwu	r9, r0, r3
+	addc	r11, r10, r3
+	adde	r3, r9, r3
+	subf	r3, r3, r0
+	blr
+EPILOGUE()
+
+DEF_OBJECT(approx_tab)
+	.short 0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
+	.short 0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
+	.short 0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
+	.short 0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
+	.short 0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
+	.short 0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
+	.short 0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
+	.short 0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
+	.short 0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
+	.short 0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
+	.short 0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
+	.short 0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
+	.short 0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
+	.short 0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
+	.short 0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
+	.short 0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
+	.short 0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
+	.short 0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
+	.short 0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
+	.short 0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
+	.short 0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
+	.short 0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
+	.short 0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
+	.short 0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
+	.short 0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
+	.short 0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
+	.short 0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
+	.short 0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
+	.short 0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
+	.short 0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
+	.short 0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
+	.short 0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
+	.short 0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
+	.short 0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
+	.short 0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
+	.short 0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
+	.short 0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
+	.short 0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
+	.short 0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
+	.short 0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
+	.short 0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
+	.short 0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
+	.short 0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
+	.short 0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
+	.short 0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
+	.short 0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
+	.short 0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
+	.short 0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
+	.short 0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
+	.short 0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
+	.short 0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
+	.short 0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
+	.short 0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
+	.short 0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
+	.short 0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
+	.short 0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
+	.short 0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
+	.short 0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
+	.short 0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
+	.short 0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
+	.short 0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
+	.short 0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
+	.short 0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
+	.short 0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
+END_OBJECT(approx_tab)
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc32/lshift.asm b/third_party/gmp/mpn/powerpc32/lshift.asm
new file mode 100644
index 0000000..ce85d4d
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/lshift.asm
@@ -0,0 +1,168 @@
+dnl  PowerPC-32 mpn_lshift -- Shift a number left.
+
+dnl  Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            3.0
+C 75x (G3):        3.0
+C 7400,7410 (G4):  3.0
+C 7445,7455 (G4+): 2.5
+C 7447,7457 (G4+): 2.25
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C cnt	r6
+
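+C The operand is processed from the most significant limb downwards: a
+C simple software-pipelined loop handles n <= 30 limbs, and a 4-way
+C unrolled loop (saving r24-r31) handles larger n.
+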
+ASM_START()
+PROLOGUE(mpn_lshift)
+	cmpwi	cr0, r5, 30	C more than 30 limbs?
+	slwi	r0, r5, 2
+	add	r4, r4, r0	C make r4 point at end of s1
+	add	r7, r3, r0	C make r7 point at end of res
+	bgt	L(BIG)		C branch if more than 30 limbs
+
+	mtctr	r5		C copy size into CTR
+	subfic	r8, r6, 32
+	lwzu	r11, -4(r4)	C load first s1 limb
+	srw	r3, r11, r8	C compute function return value
+	bdz	L(end1)
+
+L(oop):	lwzu	r10, -4(r4)
+	slw	r9, r11, r6
+	srw	r12, r10, r8
+	or	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdz	L(end2)
+	lwzu	r11, -4(r4)
+	slw	r9, r10, r6
+	srw	r12, r11, r8
+	or	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdnz	L(oop)
+
+L(end1):
+	slw	r0, r11, r6
+	stw	r0, -4(r7)
+	blr
+L(end2):
+	slw	r0, r10, r6
+	stw	r0, -4(r7)
+	blr
+
+L(BIG):
+	stwu	r1, -48(r1)
+	stmw	r24, 8(r1)	C save registers we are supposed to preserve
+	lwzu	r9, -4(r4)
+	subfic	r8, r6, 32
+	srw	r3, r9, r8	C compute function return value
+	slw	r0, r9, r6
+	addi	r5, r5, -1
+
+	andi.	r10, r5, 3	C count for spill loop
+	beq	L(e)
+	mtctr	r10
+	lwzu	r28, -4(r4)
+	bdz	L(xe0)
+
+L(loop0):
+	slw	r12, r28, r6
+	srw	r24, r28, r8
+	lwzu	r28, -4(r4)
+	or	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+	bdnz	L(loop0)	C taken at most once!
+
+L(xe0):	slw	r12, r28, r6
+	srw	r24, r28, r8
+	or	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+
+L(e):	srwi	r5, r5, 2	C count for unrolled loop
+	addi	r5, r5, -1
+	mtctr	r5
+	lwz	r28, -4(r4)
+	lwz	r29, -8(r4)
+	lwz	r30, -12(r4)
+	lwzu	r31, -16(r4)
+
+L(loopU):
+	slw	r9, r28, r6
+	srw	r24, r28, r8
+	lwz	r28, -4(r4)
+	slw	r10, r29, r6
+	srw	r25, r29, r8
+	lwz	r29, -8(r4)
+	slw	r11, r30, r6
+	srw	r26, r30, r8
+	lwz	r30, -12(r4)
+	slw	r12, r31, r6
+	srw	r27, r31, r8
+	lwzu	r31, -16(r4)
+	or	r24, r0, r24
+	stw	r24, -4(r7)
+	or	r25, r9, r25
+	stw	r25, -8(r7)
+	or	r26, r10, r26
+	stw	r26, -12(r7)
+	or	r27, r11, r27
+	stwu	r27, -16(r7)
+	mr	r0, r12
+	bdnz	L(loopU)
+
+	slw	r9, r28, r6
+	srw	r24, r28, r8
+	slw	r10, r29, r6
+	srw	r25, r29, r8
+	slw	r11, r30, r6
+	srw	r26, r30, r8
+	slw	r12, r31, r6
+	srw	r27, r31, r8
+	or	r24, r0, r24
+	stw	r24, -4(r7)
+	or	r25, r9, r25
+	stw	r25, -8(r7)
+	or	r26, r10, r26
+	stw	r26, -12(r7)
+	or	r27, r11, r27
+	stw	r27, -16(r7)
+
+	stw	r12, -20(r7)
+	lmw	r24, 8(r1)	C restore registers
+	addi	r1, r1, 48
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/lshiftc.asm b/third_party/gmp/mpn/powerpc32/lshiftc.asm
new file mode 100644
index 0000000..b683def
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/lshiftc.asm
@@ -0,0 +1,170 @@
+dnl  PowerPC-32 mpn_lshiftc.
+
+dnl  Copyright 1995, 1998, 2000, 2002-2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            3.0
+C 75x (G3):        3.0
+C 7400,7410 (G4):  3.0
+C 7445,7455 (G4+): 2.5
+C 7447,7457 (G4+): 2.25
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C cnt	r6
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	cmpwi	cr0, r5, 30	C more than 30 limbs?
+	slwi	r0, r5, 2
+	add	r4, r4, r0	C make r4 point at end of s1
+	add	r7, r3, r0	C make r7 point at end of res
+	bgt	L(BIG)		C branch if more than 30 limbs
+
+	mtctr	r5		C copy size into CTR
+	subfic	r8, r6, 32
+	lwzu	r11, -4(r4)	C load first s1 limb
+	srw	r3, r11, r8	C compute function return value
+	bdz	L(end1)
+
+L(oop):	lwzu	r10, -4(r4)
+	slw	r9, r11, r6
+	srw	r12, r10, r8
+	nor	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdz	L(end2)
+	lwzu	r11, -4(r4)
+	slw	r9, r10, r6
+	srw	r12, r11, r8
+	nor	r9, r9, r12
+	stwu	r9, -4(r7)
+	bdnz	L(oop)
+
+L(end1):
+	slw	r0, r11, r6
+	nor	r0, r0, r0
+	stw	r0, -4(r7)
+	blr
+L(end2):
+	slw	r0, r10, r6
+	nor	r0, r0, r0
+	stw	r0, -4(r7)
+	blr
+
+L(BIG):
+	stwu	r1, -48(r1)
+	stmw	r24, 8(r1)	C save registers we are supposed to preserve
+	lwzu	r9, -4(r4)
+	subfic	r8, r6, 32
+	srw	r3, r9, r8	C compute function return value
+	slw	r0, r9, r6
+	addi	r5, r5, -1
+
+	andi.	r10, r5, 3	C count for spill loop
+	beq	L(e)
+	mtctr	r10
+	lwzu	r28, -4(r4)
+	bdz	L(xe0)
+
+L(loop0):
+	slw	r12, r28, r6
+	srw	r24, r28, r8
+	lwzu	r28, -4(r4)
+	nor	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+	bdnz	L(loop0)	C taken at most once!
+
+L(xe0):	slw	r12, r28, r6
+	srw	r24, r28, r8
+	nor	r24, r0, r24
+	stwu	r24, -4(r7)
+	mr	r0, r12
+
+L(e):	srwi	r5, r5, 2	C count for unrolled loop
+	addi	r5, r5, -1
+	mtctr	r5
+	lwz	r28, -4(r4)
+	lwz	r29, -8(r4)
+	lwz	r30, -12(r4)
+	lwzu	r31, -16(r4)
+
+L(loopU):
+	slw	r9, r28, r6
+	srw	r24, r28, r8
+	lwz	r28, -4(r4)
+	slw	r10, r29, r6
+	srw	r25, r29, r8
+	lwz	r29, -8(r4)
+	slw	r11, r30, r6
+	srw	r26, r30, r8
+	lwz	r30, -12(r4)
+	slw	r12, r31, r6
+	srw	r27, r31, r8
+	lwzu	r31, -16(r4)
+	nor	r24, r0, r24
+	stw	r24, -4(r7)
+	nor	r25, r9, r25
+	stw	r25, -8(r7)
+	nor	r26, r10, r26
+	stw	r26, -12(r7)
+	nor	r27, r11, r27
+	stwu	r27, -16(r7)
+	mr	r0, r12
+	bdnz	L(loopU)
+
+	slw	r9, r28, r6
+	srw	r24, r28, r8
+	slw	r10, r29, r6
+	srw	r25, r29, r8
+	slw	r11, r30, r6
+	srw	r26, r30, r8
+	slw	r12, r31, r6
+	srw	r27, r31, r8
+	nor	r24, r0, r24
+	stw	r24, -4(r7)
+	nor	r25, r9, r25
+	stw	r25, -8(r7)
+	nor	r26, r10, r26
+	stw	r26, -12(r7)
+	nor	r27, r11, r27
+	stw	r27, -16(r7)
+	nor	r12, r12, r12
+	stw	r12, -20(r7)
+	lmw	r24, 8(r1)	C restore registers
+	addi	r1, r1, 48
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/mod_34lsub1.asm b/third_party/gmp/mpn/powerpc32/mod_34lsub1.asm
new file mode 100644
index 0000000..6d7fe4d
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/mod_34lsub1.asm
@@ -0,0 +1,145 @@
+dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
+
+dnl  Copyright 2002, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            3
+C 75x (G3):        3
+C 7400,7410 (G4):  3
+C 744x,745x (G4+): 3
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C There seems to be no need to schedule the loads back; the code is
+C still 3.0 c/l on 750/7400 no matter where they're placed.
+C
+C Alternatives:
+C
+C Fetching half words would allow plain add for accumulating, instead of
+C adde and its serialization.  An outer loop would be required though,
+C since 2^16 halfword additions can overflow.  lhz+add would be 2.0 c/l,
+C but with a bdz or bdnz for each and a pointer update say every three
+C limbs the total would be 2.67 c/l, which isn't much faster than the
+C current simpler code.
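+C
+C The folding works because 2^24 == 1 (mod 2^24-1): with B = 2^32 we have
+C B == 2^8, B^2 == 2^16 and B^3 == 1 (mod 2^24-1), so limbs are summed
+C into three accumulators according to their position mod 3 (weights 1,
+C 2^8 and 2^16), and each accumulator is finally split into pieces below
+C 2^24 which are added back together.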
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+
+	C r3	src
+	C r4	size
+
+	mtctr	r4
+	addic	r6, r3, 8		C &src[2], and clear CA
+
+	lwz	r3, 0(r3)		C acc0 = src[0]
+	bdz	L(done)
+
+	lwz	r4, -4(r6)		C acc1 = src[1]
+	bdz	L(two)
+
+	lwz	r5, 0(r6)		C acc2 = src[2]
+	lis	r7, 0			C no carry if just three limbs
+
+	bdz	L(three)
+	lis	r7, 1			C 0x10000 carry pos
+
+L(top):
+	C r3	acc0
+	C r4	acc1
+	C r5	acc2
+	C r6	src, incrementing
+	C r7	carry pos
+
+	lwz	r0, 4(r6)
+	adde	r3, r3, r0
+	bdz	L(end0)
+
+	lwz	r0, 8(r6)
+	adde	r4, r4, r0
+	bdz	L(end1)
+
+	lwzu	r0, 12(r6)
+	adde	r5, r5, r0
+	bdnz	L(top)
+
+
+	srwi	r7, r7, 8
+L(end0):
+	srwi	r7, r7, 8
+L(end1):
+	subfe	r0, r0, r0		C -1 if not CA
+
+	andc	r7, r7, r0		C final carry, 0x10000, 0x100, 1 or 0
+L(three):
+	rlwinm	r6, r3, 0,8,31		C acc0 low
+
+	add	r7, r7, r6
+	rlwinm	r6, r3, 8,24,31		C acc0 high
+
+	add	r7, r7, r6
+	rlwinm	r6, r4, 8,8,23		C acc1 low
+
+	add	r7, r7, r6
+	rlwinm	r6, r4, 16,16,31	C acc1 high
+
+	add	r7, r7, r6
+	rlwinm	r6, r5, 16,8,15		C acc2 low
+
+	add	r7, r7, r6
+	rlwinm	r6, r5, 24,8,31		C acc2 high
+
+	add	r3, r7, r6
+
+L(done):
+	blr
+
+L(two):
+	C r3	acc0
+	C r4	acc1
+
+	rlwinm	r5, r3, 8,24,31		C acc0 high
+	rlwinm	r3, r3, 0,8,31		C acc0 low
+
+	add	r3, r3, r5		C acc0 high + low
+	rlwinm	r5, r4, 16,16,31	C acc1 high
+
+	add	r3, r3, r5		C add acc1 high
+	rlwinm	r5, r4, 8,8,23		C acc1 low
+
+	add	r3, r3, r5		C add acc1 low
+
+	blr
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/mode1o.asm b/third_party/gmp/mpn/powerpc32/mode1o.asm
new file mode 100644
index 0000000..e8a6b5e
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/mode1o.asm
@@ -0,0 +1,127 @@
+dnl  PowerPC-32 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl  Copyright 2002, 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb
+C 603e:             ?
+C 604e:             6.0
+C 75x (G3):         6.0-13.0, depending on divisor
+C 7400,7410 (G4):   6.0-13.0, depending on divisor
+C 744x,745x (G4+):  8.0-10.0, depending on divisor
+C power4/ppc970:   12.0
+C power5:          12.0
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C For PIC, the inverse is established arithmetically since it measures about
+C 5 cycles faster than the nonsense needed to access binvert_limb_table in
+C SVR4 or Darwin style PIC.  AIX might be better, since it avoids bl/mflr to
+C get at the GOT/TOC/whatever.
+C
+C Using divwu for size==1 measured about 10 cycles slower on 604e, or about
+C 3-5 cycles faster on 750.  For now it doesn't seem worth bothering with.
+C
+C The loop allows an early-out on mullw for the inverse, and on mulhwu for
+C the divisor.  So the fastest is for instance divisor==1 (inverse==-1), and
+C the slowest is anything giving a full 32-bits in both, such as
+C divisor==0xDEADBEEF (inverse==0x904B300F).  These establish the stated
+C range above for 750 and 7400.
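+C
+C The doubling steps rely on the standard Newton iteration for a binary
+C inverse: if i*d == 1 (mod 2^k) then
+C
+C	(2*i - i*i*d) * d == 1 (mod 2^(2k))
+C
+C so each mullw/slwi/mullw/sub group doubles the number of correct low
+C bits (8 -> 16 -> 32, with an extra 4 -> 8 step in the PIC_SLOW case).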
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_modexact_1_odd)
+	li	r6, 0
+
+PROLOGUE(mpn_modexact_1c_odd)
+
+	mtctr	r4			C size
+
+ifdef(`PIC_SLOW',`
+C Load from our table with PIC is so slow on Linux and Darwin that we avoid it
+	rlwinm	r7, r5, 1,28,28		C (divisor << 1) & 8
+	rlwinm	r8, r5, 2,28,28		C (divisor << 2) & 8
+	xor	r7, r7, r8		C ((divisor << 1) ^ (divisor << 2)) & 8
+	rlwinm	r4, r5, 0,28,31		C divisor low 4 bits, speedup mullw
+	xor	r4, r4, r7		C inverse, 4 bits
+	mullw	r7, r4, r4		C i*i
+	slwi	r4, r4, 1		C 2*i
+	rlwinm	r8, r5, 0,24,31		C divisor low 8 bits, speedup mullw
+	mullw	r7, r7, r8		C i*i*d
+	sub	r4, r4, r7		C inverse, 8 bits
+',`
+	LEA(	r7, binvert_limb_table)
+	rlwinm	r4, r5, 31,25,31	C (divisor/2) & 0x7F
+	lbzx	r4, r4,r7		C inverse, 8 bits
+')
+
+	mullw	r7, r4, r4		C i*i
+	slwi	r4, r4, 1		C 2*i
+	mullw	r7, r5, r7		C i*i*d   [i*i is 16 bits, so second operand]
+	sub	r4, r4, r7		C inverse, 16 bits
+	mullw	r7, r4, r4		C i*i
+	slwi	r4, r4, 1		C 2*i
+	mullw	r7, r7, r5		C i*i*d
+	lwz	r0, 0(r3)		C src[0]
+	sub	r4, r4, r7		C inverse, 32 bits
+	subfc	r7, r6, r0		C l = src[0] - carry
+
+	mullw	r7, r7, r4		C q = l * inverse
+	bdz	L(one)
+
+	lwzu	r0, 4(r3)		C src[1]
+	mulhwu	r6, r7, r5		C carry = high(q*divisor)
+	subfe	r7, r6, r0		C l = src[1] - carry
+	bdz	L(two)
+
+L(top):
+	mullw	r7, r7, r4		C q = l * inverse
+	lwzu	r0, 4(r3)		C src[i]
+	mulhwu	r6, r7, r5		C carry = high(q*divisor)
+	subfe	r7, r6, r0		C l = src[i] - carry
+	bdnz	L(top)
+
+L(two):	mullw	r7, r7, r4		C q = l * inverse
+L(one):	subfe	r3, r3, r3		C ca 0 or -1
+	mulhwu	r6, r7, r5		C carry = high(q*divisor)
+	subf	r3, r3, r6		C carry + ca
+	blr
+
+EPILOGUE(mpn_modexact_1c_odd)
+EPILOGUE(mpn_modexact_1_odd)
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc32/mul_1.asm b/third_party/gmp/mpn/powerpc32/mul_1.asm
new file mode 100644
index 0000000..e42087c
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/mul_1.asm
@@ -0,0 +1,101 @@
+dnl  PowerPC-32 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl  result in a second limb vector.
+
+dnl  Copyright 1995, 1997, 2000, 2002, 2003, 2005 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            4.0
+C 75x (G3):        4.5-11
+C 7400,7410 (G4):  4.5-11
+C 744x,745x (G4+): 6.0
+C power4/ppc970:   6.0
+C power5:          5.63
+
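+C mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl);
+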
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C vl	r6
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	mtctr	r5
+	addi	r3,r3,-4	C adjust res_ptr, it's offset before it's used
+	li	r12,0		C clear upper product reg
+	addic	r0,r0,0		C clear cy
+C Start software pipeline
+	lwz	r8,0(r4)
+	bdz	L(end3)
+	lwzu	r9,4(r4)
+	mullw	r11,r8,r6
+	mulhwu	r0,r8,r6
+	bdz	L(end1)
+C Software pipelined main loop
+L(loop):
+	lwz	r8,4(r4)
+	mullw	r10,r9,r6
+	adde	r5,r11,r12
+	mulhwu	r12,r9,r6
+	stw	r5,4(r3)
+	bdz	L(end2)
+	lwzu	r9,8(r4)
+	mullw	r11,r8,r6
+	adde	r7,r10,r0
+	mulhwu	r0,r8,r6
+	stwu	r7,8(r3)
+	bdnz	L(loop)
+C Finish software pipeline
+L(end1):
+	mullw	r10,r9,r6
+	adde	r5,r11,r12
+	mulhwu	r12,r9,r6
+	stw	r5,4(r3)
+	adde	r7,r10,r0
+	stwu	r7,8(r3)
+	addze	r3,r12
+	blr
+L(end2):
+	mullw	r11,r8,r6
+	adde	r7,r10,r0
+	mulhwu	r0,r8,r6
+	stwu	r7,8(r3)
+	adde	r5,r11,r12
+	stw	r5,4(r3)
+	addze	r3,r0
+	blr
+L(end3):
+	mullw	r11,r8,r6
+	stw	r11,4(r3)
+	mulhwu	r3,r8,r6
+	blr
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/powerpc32/p3-p7/aors_n.asm b/third_party/gmp/mpn/powerpc32/p3-p7/aors_n.asm
new file mode 100644
index 0000000..3b6685e
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/p3-p7/aors_n.asm
@@ -0,0 +1,187 @@
+dnl  PowerPC-32 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          1.5
+C POWER4/PPC970          2
+C POWER5                 2
+C POWER6                 2.78
+C POWER7               2.15-2.87
+
+C This code is based on powerpc64/aors_n.asm.
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_add_n)
+  define(func_nc,	mpn_add_nc)
+  define(GENRVAL,	`addi	r3, r3, 1')
+  define(SETCBR,	`addic	r0, $1, -1')
+  define(CLRCB,		`addic	r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_sub_n)
+  define(func_nc,	mpn_sub_nc)
+  define(GENRVAL,	`neg	r3, r3')
+  define(SETCBR,	`subfic	r0, $1, 0')
+  define(CLRCB,		`addic	r0, r1, -1')
+')
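+
+C On PowerPC a subtraction borrow is an inverted carry: CA = 1 means "no
+C borrow".  Hence CLRCB clears CA for addition (addic r0, r0, 0) but
+C forces CA = 1 for subtraction (addic r0, r1, -1 always carries out, r1
+C being the nonzero stack pointer); SETCBR recreates the carry/borrow
+C from the cy argument, and GENRVAL converts the final
+C subfe r3, r0, r0 (0 or -1) into the 0/1 return value.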
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	SETCBR(r7)
+	b	L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+	CLRCB
+L(ent):	stwu	r1, -32(r1)
+	rlwinm.	r0, r6, 0,30,31	C r0 = n & 3, set cr0
+	cmpwi	cr6, r0, 2
+	stw	r28, 8(r1)
+	addi	r6, r6, 3	C compute count...
+	stw	r29, 12(r1)
+	srwi	r6, r6, 2	C ...for ctr
+	stw	r30, 16(r1)
+	mtctr	r6		C copy count into ctr
+	stw	r31, 20(r1)
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	lwz	r8, 0(r4)	C load s1 limb
+	lwz	r9, 0(r5)	C load s2 limb
+	lwz	r10, 4(r4)	C load s1 limb
+	lwz	r11, 4(r5)	C load s2 limb
+	lwz	r12, 8(r4)	C load s1 limb
+	addi	r4, r4, 12
+	lwz	r0, 8(r5)	C load s2 limb
+	addi	r5, r5, 12
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	stw	r29, 0(r3)
+	stw	r30, 4(r3)
+	stw	r31, 8(r3)
+	addi	r3, r3, 12
+	bdnz	L(go)
+	b	L(ret)
+
+L(b01):	lwz	r12, 0(r4)	C load s1 limb
+	addi	r4, r4, 4
+	lwz	r0, 0(r5)	C load s2 limb
+	addi	r5, r5, 4
+	ADDSUBC	r31, r0, r12	C add
+	stw	r31, 0(r3)
+	addi	r3, r3, 4
+	bdnz	L(go)
+	b	L(ret)
+
+L(b10):	lwz	r10, 0(r4)	C load s1 limb
+	lwz	r11, 0(r5)	C load s2 limb
+	lwz	r12, 4(r4)	C load s1 limb
+	addi	r4, r4, 8
+	lwz	r0, 4(r5)	C load s2 limb
+	addi	r5, r5, 8
+	ADDSUBC	r30, r11, r10	C add
+	ADDSUBC	r31, r0, r12	C add
+	stw	r30, 0(r3)
+	stw	r31, 4(r3)
+	addi	r3, r3, 8
+	bdnz	L(go)
+	b	L(ret)
+
+L(b00):	C INITCY		C clear/set cy
+L(go):	lwz	r6, 0(r4)	C load s1 limb
+	lwz	r7, 0(r5)	C load s2 limb
+	lwz	r8, 4(r4)	C load s1 limb
+	lwz	r9, 4(r5)	C load s2 limb
+	lwz	r10, 8(r4)	C load s1 limb
+	lwz	r11, 8(r5)	C load s2 limb
+	lwz	r12, 12(r4)	C load s1 limb
+	lwz	r0, 12(r5)	C load s2 limb
+	bdz	L(end)
+
+	addi	r4, r4, 16
+	addi	r5, r5, 16
+
+	ALIGN(16)
+L(top):	ADDSUBC	r28, r7, r6
+	lwz	r6, 0(r4)	C load s1 limb
+	lwz	r7, 0(r5)	C load s2 limb
+	ADDSUBC	r29, r9, r8
+	lwz	r8, 4(r4)	C load s1 limb
+	lwz	r9, 4(r5)	C load s2 limb
+	ADDSUBC	r30, r11, r10
+	lwz	r10, 8(r4)	C load s1 limb
+	lwz	r11, 8(r5)	C load s2 limb
+	ADDSUBC	r31, r0, r12
+	lwz	r12, 12(r4)	C load s1 limb
+	lwz	r0, 12(r5)	C load s2 limb
+	stw	r28, 0(r3)
+	addi	r4, r4, 16
+	stw	r29, 4(r3)
+	addi	r5, r5, 16
+	stw	r30, 8(r3)
+	stw	r31, 12(r3)
+	addi	r3, r3, 16
+	bdnz	L(top)		C decrement ctr and loop back
+
+L(end):	ADDSUBC	r28, r7, r6
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	stw	r28, 0(r3)
+	stw	r29, 4(r3)
+	stw	r30, 8(r3)
+	stw	r31, 12(r3)
+
+L(ret):
+	lwz	r28, 8(r1)
+	lwz	r29, 12(r1)
+	subfe	r3, r0, r0	C -cy
+	lwz	r30, 16(r1)
+	GENRVAL
+	lwz	r31, 20(r1)
+	addi	r1, r1, 32
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/p3/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/p3/gmp-mparam.h
new file mode 100644
index 0000000..3382695
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/p3/gmp-mparam.h
@@ -0,0 +1,155 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 450 MHz POWER3 */
+
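+/* Each threshold below is, roughly, the operand size in limbs at which
+   the named algorithm starts to pay off; e.g. MUL_TOOM22_THRESHOLD 10
+   means mpn_mul switches from the schoolbook basecase to Toom-22 at 10
+   limbs, and MP_SIZE_T_MAX marks an algorithm that is never used.  */
+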
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                10
+#define MUL_TOOM33_THRESHOLD                38
+#define MUL_TOOM44_THRESHOLD                58
+#define MUL_TOOM6H_THRESHOLD               129
+#define MUL_TOOM8H_THRESHOLD               212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      63
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      59
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      64
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 14
+#define SQR_TOOM3_THRESHOLD                 53
+#define SQR_TOOM4_THRESHOLD                 76
+#define SQR_TOOM6_THRESHOLD                106
+#define SQR_TOOM8_THRESHOLD                284
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    220, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {      9, 5}, {     19, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     23, 9}, {      7, 8}, {     15, 7}, \
+    {     33, 8}, {     23, 9}, {     15, 8}, {     35, 9}, \
+    {     23,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     71, 8}, {    143, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 9}, {    143,10}, {     79, 9}, \
+    {    159, 8}, {    319, 9}, {    175, 8}, {    351,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    351, 9}, {    703, 8}, \
+    {   1407,11}, {    191,10}, {    415,11}, {    223,10}, \
+    {    447, 9}, {    895,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 82
+#define MUL_FFT_THRESHOLD                 2688
+
+#define SQR_FFT_MODF_THRESHOLD             176  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    176, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     16, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     23, 9}, \
+    {      7, 8}, {     15, 7}, {     31, 8}, {     23, 9}, \
+    {     15, 8}, {     39, 9}, {     23,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     71, 8}, {    143, 7}, {    287, 6}, \
+    {    575, 9}, {     79, 8}, {    159,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    143, 8}, {    287, 7}, {    575,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    175,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    111, 9}, \
+    {    223,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159, 9}, \
+    {    319,10}, {    175,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    223,12}, {     63,11}, {    127,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    351, 9}, \
+    {    703, 8}, {   1407,11}, {    191,10}, {    383,11}, \
+    {    223,10}, {    447, 9}, {    895,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD                 1728
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD             5240
+
+#define DC_DIV_QR_THRESHOLD                 32
+#define DC_DIVAPPR_Q_THRESHOLD             123
+#define DC_BDIV_QR_THRESHOLD                34
+#define DC_BDIV_Q_THRESHOLD                 84
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               129
+#define INV_APPR_THRESHOLD                 124
+
+#define BINV_NEWTON_THRESHOLD              148
+#define REDC_1_TO_REDC_N_THRESHOLD          38
+
+#define MU_DIV_QR_THRESHOLD                748
+#define MU_DIVAPPR_Q_THRESHOLD             748
+#define MUPI_DIV_QR_THRESHOLD               59
+#define MU_BDIV_QR_THRESHOLD               562
+#define MU_BDIV_Q_THRESHOLD                654
+
+#define MATRIX22_STRASSEN_THRESHOLD         11
+#define HGCD_THRESHOLD                      76
+#define GCD_DC_THRESHOLD                   205
+#define GCDEXT_DC_THRESHOLD                174
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               181
+#define SET_STR_PRECOMPUTE_THRESHOLD       525
diff --git a/third_party/gmp/mpn/powerpc32/p4/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/p4/gmp-mparam.h
new file mode 100644
index 0000000..7ac59f5
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/p4/gmp-mparam.h
@@ -0,0 +1,209 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2011, 2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* 1800 MHz PowerPC-970 */
+/* FFT tuning limit = 10000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.0 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        42
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     14
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           45
+
+#define DIV_1_VS_MUL_1_PERCENT             225
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               130
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     107
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     108
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      92
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     100
+
+#define SQR_BASECASE_THRESHOLD               5
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                 85
+#define SQR_TOOM4_THRESHOLD                160
+#define SQR_TOOM6_THRESHOLD                197
+#define SQR_TOOM8_THRESHOLD                357
+
+#define MULMID_TOOM42_THRESHOLD             32
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    444, 5}, {     17, 6}, {      9, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     24, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191, 8}, {    383,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287, 8}, {    575, 9}, \
+    {    303,10}, {    159, 9}, {    319,11}, {     95,10}, \
+    {    191, 9}, {    383,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    335, 9}, {    671, 8}, {   1343,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671, 9}, {   1343,11}, {    351,10}, \
+    {    703, 9}, {   1407,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    703,10}, {   1407,11}, {    735,12}, \
+    {    383,11}, {    767,10}, {   1535,11}, {    831,12}, \
+    {    447,10}, {   1791,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    831,11}, {   1727,10}, {   3455,11}, {   1791,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,11}, {   3455,12}, {   1791,14}, {    511,13}, \
+    {   1151,12}, {   2431,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 157
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     28, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    159, 8}, \
+    {    319,10}, {     95, 9}, {    191, 8}, {    383, 9}, \
+    {    207,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303, 8}, {    607,10}, {    159, 9}, \
+    {    319,10}, {    175,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,10}, {    303, 9}, \
+    {    607,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,12}, {    127,11}, \
+    {    255,10}, {    543, 9}, {   1087,11}, {    287,10}, \
+    {    607, 9}, {   1215,11}, {    319,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,11}, {    479,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    703,10}, {   1407,11}, {    735,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 150
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  55
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             4
+#define SQRLO_DC_THRESHOLD                 169
+#define SQRLO_SQR_THRESHOLD               9335
+
+#define DC_DIV_QR_THRESHOLD                 50
+#define DC_DIVAPPR_Q_THRESHOLD             196
+#define DC_BDIV_QR_THRESHOLD                51
+#define DC_BDIV_Q_THRESHOLD                166
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               226
+#define INV_APPR_THRESHOLD                 202
+
+#define BINV_NEWTON_THRESHOLD              228
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1187
+#define MU_DIVAPPR_Q_THRESHOLD            1308
+#define MUPI_DIV_QR_THRESHOLD              114
+#define MU_BDIV_QR_THRESHOLD               998
+#define MU_BDIV_Q_THRESHOLD               1142
+
+#define POWM_SEC_TABLE  3,28,78,480,1099
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1002
+
+#define FAC_DSC_THRESHOLD                  179
+#define FAC_ODD_THRESHOLD                   28
+
+#define MATRIX22_STRASSEN_THRESHOLD          9
+#define HGCD_THRESHOLD                      93
+#define HGCD_APPR_THRESHOLD                109
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   379
+#define GCDEXT_DC_THRESHOLD                273
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/powerpc32/p5/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/p5/gmp-mparam.h
new file mode 100644
index 0000000..faa1e81
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/p5/gmp-mparam.h
@@ -0,0 +1,156 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1650 MHz POWER5 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     18
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           61
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD               130
+#define MUL_TOOM6H_THRESHOLD               189
+#define MUL_TOOM8H_THRESHOLD               309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      83
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+
+#define SQR_BASECASE_THRESHOLD               6
+#define SQR_TOOM2_THRESHOLD                 40
+#define SQR_TOOM3_THRESHOLD                 77
+#define SQR_TOOM4_THRESHOLD                124
+#define SQR_TOOM6_THRESHOLD                140
+#define SQR_TOOM8_THRESHOLD                238
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define POWM_SEC_TABLE  4,29,252,840,2080
+
+#define MUL_FFT_MODF_THRESHOLD             412  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    412, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    415, 9}, {    831,11}, {    223,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 71
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     47,10}, {     31, 9}, \
+    {     71,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 76
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  68
+#define MULLO_MUL_N_THRESHOLD             9236
+
+#define DC_DIV_QR_THRESHOLD                 69
+#define DC_DIVAPPR_Q_THRESHOLD             220
+#define DC_BDIV_QR_THRESHOLD                75
+#define DC_BDIV_Q_THRESHOLD                188
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               230
+#define INV_APPR_THRESHOLD                 230
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          87
+
+#define MU_DIV_QR_THRESHOLD               1210
+#define MU_DIVAPPR_Q_THRESHOLD            1308
+#define MUPI_DIV_QR_THRESHOLD              106
+#define MU_BDIV_QR_THRESHOLD              1017
+#define MU_BDIV_Q_THRESHOLD               1210
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD_THRESHOLD                     110
+#define HGCD_APPR_THRESHOLD                138
+#define HGCD_REDUCE_THRESHOLD             2578
+#define GCD_DC_THRESHOLD                   408
+#define GCDEXT_DC_THRESHOLD                298
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               527
+#define SET_STR_PRECOMPUTE_THRESHOLD      1090
diff --git a/third_party/gmp/mpn/powerpc32/p6/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/p6/gmp-mparam.h
new file mode 100644
index 0000000..c9504b6
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/p6/gmp-mparam.h
@@ -0,0 +1,165 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004, 2008-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3500 MHz POWER6 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD     MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                19
+#define MUL_TOOM33_THRESHOLD                55
+#define MUL_TOOM44_THRESHOLD                88
+#define MUL_TOOM6H_THRESHOLD               137
+#define MUL_TOOM8H_THRESHOLD               181
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      57
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      56
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      57
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      56
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                 56
+#define SQR_TOOM4_THRESHOLD                130
+#define SQR_TOOM6_THRESHOLD                189
+#define SQR_TOOM8_THRESHOLD                296
+
+#define MULMID_TOOM42_THRESHOLD             26
+
+#define MULMOD_BNM1_THRESHOLD                7
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define POWM_SEC_TABLE  2,26,127,453,1068
+
+#define MUL_FFT_MODF_THRESHOLD             212  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    212, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     16, 7}, {      9, 6}, \
+    {     19, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 9}, {      7, 8}, {     15, 7}, \
+    {     31, 8}, {     19, 7}, {     39, 8}, {     23, 9}, \
+    {     15, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     71, 8}, {    143, 7}, {    287, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 7}, {    511, 9}, {    143, 8}, \
+    {    287,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    175, 8}, {    351,10}, {     95, 9}, {    191, 8}, \
+    {    383, 9}, {    207,10}, {    111,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD                 1728
+
+#define SQR_FFT_MODF_THRESHOLD             184  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    184, 5}, {      6, 4}, {     13, 5}, {     13, 6}, \
+    {      7, 5}, {     15, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     23, 9}, {      7, 8}, {     23, 9}, \
+    {     15, 8}, {     39, 9}, {     23,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     63, 8}, {    127, 7}, \
+    {    255, 9}, {     71, 8}, {    143, 7}, {    287, 6}, \
+    {    575, 9}, {     79,10}, {     47,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    143, 8}, \
+    {    287, 7}, {    575,10}, {     79, 9}, {    159, 8}, \
+    {    319, 9}, {    175, 8}, {    351,10}, {     95, 9}, \
+    {    191, 8}, {    383, 9}, {    207,10}, {    111, 9}, \
+    {    223,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159, 9}, \
+    {    319,10}, {    175, 9}, {    351,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207, 9}, {    415,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    351, 9}, {    703, 8}, {   1407,11}, {    191,10}, \
+    {    415,11}, {    223,10}, {    447, 9}, {    895,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 92
+#define SQR_FFT_THRESHOLD                 1600
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  57
+#define MULLO_MUL_N_THRESHOLD             3176
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             187
+#define DC_BDIV_QR_THRESHOLD                64
+#define DC_BDIV_Q_THRESHOLD                146
+
+#define INV_MULMOD_BNM1_THRESHOLD           68
+#define INV_NEWTON_THRESHOLD               182
+#define INV_APPR_THRESHOLD                 182
+
+#define BINV_NEWTON_THRESHOLD              186
+#define REDC_1_TO_REDC_N_THRESHOLD          60
+
+#define MU_DIV_QR_THRESHOLD                924
+#define MU_DIVAPPR_Q_THRESHOLD             807
+#define MUPI_DIV_QR_THRESHOLD               73
+#define MU_BDIV_QR_THRESHOLD               667
+#define MU_BDIV_Q_THRESHOLD                823
+
+#define MATRIX22_STRASSEN_THRESHOLD          8
+#define HGCD_THRESHOLD                      61
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD              974
+#define GCD_DC_THRESHOLD                   195
+#define GCDEXT_DC_THRESHOLD                134
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                 9
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               190
+#define SET_STR_PRECOMPUTE_THRESHOLD       411
diff --git a/third_party/gmp/mpn/powerpc32/p7/gmp-mparam.h b/third_party/gmp/mpn/powerpc32/p7/gmp-mparam.h
new file mode 100644
index 0000000..ad48dac
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/p7/gmp-mparam.h
@@ -0,0 +1,170 @@
+/* PowerPC-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 4150 MHz POWER8/T4 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2017-02-18, gcc 6.1 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 2
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD            3
+#define DIV_QR_2_PI2_THRESHOLD              15
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           39
+
+#define DIV_1_VS_MUL_1_PERCENT             343
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               202
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     137
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     140
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     128
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     145
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     121
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 26
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                236
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             34
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    444, 5}, {     21, 6}, {     12, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     17, 6}, \
+    {     35, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351,11}, {    191,10}, {    415, 9}, \
+    {    831,11}, {    223,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 70
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             332  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    332, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     47,10}, {     31, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 9}, {    135,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303, 8}, {    607,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    415,11}, {    223,10}, {    447,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 75
+#define SQR_FFT_THRESHOLD                 3520
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  36
+#define MULLO_MUL_N_THRESHOLD             8648
+#define SQRLO_BASECASE_THRESHOLD             5
+#define SQRLO_DC_THRESHOLD                 193
+#define SQRLO_SQR_THRESHOLD               6675
+
+#define DC_DIV_QR_THRESHOLD                 33
+#define DC_DIVAPPR_Q_THRESHOLD             134
+#define DC_BDIV_QR_THRESHOLD                51
+#define DC_BDIV_Q_THRESHOLD                134
+
+#define INV_MULMOD_BNM1_THRESHOLD           66
+#define INV_NEWTON_THRESHOLD               132
+#define INV_APPR_THRESHOLD                 131
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1334
+#define MU_DIVAPPR_Q_THRESHOLD            1334
+#define MUPI_DIV_QR_THRESHOLD               62
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  3,25,114,480,1486
+
+#define GET_STR_DC_THRESHOLD                 8
+#define GET_STR_PRECOMPUTE_THRESHOLD        14
+#define SET_STR_DC_THRESHOLD               644
+#define SET_STR_PRECOMPUTE_THRESHOLD      1365
+
+#define FAC_DSC_THRESHOLD                  107
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                      95
+#define HGCD_APPR_THRESHOLD                121
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   456
+#define GCDEXT_DC_THRESHOLD                386
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/powerpc32/powerpc-defs.m4 b/third_party/gmp/mpn/powerpc32/powerpc-defs.m4
new file mode 100644
index 0000000..8a1451c
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/powerpc-defs.m4
@@ -0,0 +1,121 @@
+divert(-1)
+
+dnl  m4 macros for PowerPC assembler (32 and 64 bit).
+
+dnl  Copyright 2000, 2002, 2003, 2017, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  This is the same as the default in mpn/asm-defs.m4, but with ALIGN(4)
+dnl  not 8.
+dnl
+dnl  4-byte alignment is normally enough, certainly it's what gcc gives.  We
+dnl  don't want bigger alignment within PROLOGUE since it can introduce
+dnl  padding into multiple-entrypoint routines, and with gas such padding is
+dnl  zero words, which are not valid instructions.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+`	TEXT
+	ALIGN(4)
+	GLOBL	`$1' GLOBL_ATTR
+	TYPE(`$1',`function')
+`$1'LABEL_SUFFIX')
+
+
+dnl  Usage: r0 ... r31, cr0 ... cr7
+dnl
+dnl  Registers names, either left as "r0" etc or mapped to plain 0 etc,
+dnl  according to the result of the GMP_ASM_POWERPC_REGISTERS configure
+dnl  test.
+
+ifelse(WANT_R_REGISTERS,no,`
+forloop(i,0,31,`deflit(`r'i,i)')
+forloop(i,0,31,`deflit(`v'i,i)')
+forloop(i,0,31,`deflit(`f'i,i)')
+forloop(i,0,7, `deflit(`cr'i,i)')
+')
+
+
+dnl  Usage: ASSERT(cond,instructions)
+dnl
+dnl  If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl  flags condition to then be satisfied.  For example,
+dnl
+dnl         ASSERT(eq, `cmpwi r6, 123')
+dnl
+dnl  The instructions can be omitted to just assert a flags condition with
+dnl  no extra calculation.  For example,
+dnl
+dnl         ASSERT(ne)
+dnl
+dnl  The condition can be omitted to just output the given instructions when
+dnl  assertion checking is wanted.  For example,
+dnl
+dnl         ASSERT(, `mr r11, r0')
+dnl
+dnl  Using a zero word for an illegal instruction is probably not ideal,
+dnl  since it marks the beginning of a traceback table in the 64-bit ABI.
+dnl  But assertions are only for development, so it doesn't matter too much.
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+	`C ASSERT
+	$2
+ifelse(`$1',,,
+`	b$1	L(ASSERT_ok`'ASSERT_counter)
+	W32	0	C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+define(`ASSERT_counter',incr(ASSERT_counter))
+')')')
+
+define(ASSERT_counter,1)
+
+dnl  Manually assemble some new instructions
+dnl
+
+define(`maddld',m4_assert_numargs(4)`dnl
+.long eval(0x10000033+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,6))')
+
+define(`maddhdu',m4_assert_numargs(4)`dnl
+.long eval(0x10000031+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,6))')
+
+define(`popcntd',m4_assert_numargs(2)`dnl
+.long eval(0x7c0003f4+m4_lshift($2,21)+m4_lshift($1,16))')
+
+define(`divdeu',m4_assert_numargs(3)`dnl
+.long eval(0x7c000312+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11))')
+
+define(`addex',m4_assert_numargs(4)`dnl
+.long eval(0x7c000154+m4_lshift($1,21)+m4_lshift($2,16)+m4_lshift($3,11)+m4_lshift($4,9))')
+
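+dnl  For instance, maddld(9, 4, 5, 6) -- with the registers given as plain
+dnl  numbers -- expands to a .long holding the word encoding of
+dnl  "maddld r9, r4, r5, r6" (r9 = low 64 bits of r4*r5 + r6), so the code
+dnl  can use these instructions under assemblers that predate the mnemonics.
+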
+divert
diff --git a/third_party/gmp/mpn/powerpc32/rshift.asm b/third_party/gmp/mpn/powerpc32/rshift.asm
new file mode 100644
index 0000000..d86cdcb
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/rshift.asm
@@ -0,0 +1,166 @@
+dnl  PowerPC-32 mpn_rshift -- Shift a number right.
+
+dnl  Copyright 1995, 1998, 2000, 2002-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            3.0
+C 75x (G3):        3.0
+C 7400,7410 (G4):  3.0
+C 7445,7455 (G4+): 2.5
+C 7447,7457 (G4+): 2.25
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C cnt	r6
+
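+C  The return value is the set of bits shifted out of up[0], left
+C  justified, i.e. up[0] << (32-cnt); hence the slw into r3 below.
+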
+ASM_START()
+PROLOGUE(mpn_rshift)
+	cmpwi	cr0, r5, 30	C more than 30 limbs?
+	addi	r7, r3, -4	C dst-4
+	bgt	L(BIG)		C branch if more than 30 limbs
+
+	mtctr	r5		C copy size into CTR
+	subfic	r8, r6, 32
+	lwz	r11, 0(r4)	C load first s1 limb
+	slw	r3, r11, r8	C compute function return value
+	bdz	L(end1)
+
+L(oop):	lwzu	r10, 4(r4)
+	srw	r9, r11, r6
+	slw	r12, r10, r8
+	or	r9, r9, r12
+	stwu	r9, 4(r7)
+	bdz	L(end2)
+	lwzu	r11, 4(r4)
+	srw	r9, r10, r6
+	slw	r12, r11, r8
+	or	r9, r9, r12
+	stwu	r9, 4(r7)
+	bdnz	L(oop)
+
+L(end1):
+	srw	r0, r11, r6
+	stw	r0, 4(r7)
+	blr
+L(end2):
+	srw	r0, r10, r6
+	stw	r0, 4(r7)
+	blr
+
+L(BIG):
+	stwu	r1, -48(r1)
+	stmw	r24, 8(r1)	C save registers we are supposed to preserve
+	lwz	r9, 0(r4)
+	subfic	r8, r6, 32
+	slw	r3, r9, r8	C compute function return value
+	srw	r0, r9, r6
+	addi	r5, r5, -1
+
+	andi.	r10, r5, 3	C count for spill loop
+	beq	L(e)
+	mtctr	r10
+	lwzu	r28, 4(r4)
+	bdz	L(xe0)
+
+L(loop0):
+	srw	r12, r28, r6
+	slw	r24, r28, r8
+	lwzu	r28, 4(r4)
+	or	r24, r0, r24
+	stwu	r24, 4(r7)
+	mr	r0, r12
+	bdnz	L(loop0)	C taken at most once!
+
+L(xe0):	srw	r12, r28, r6
+	slw	r24, r28, r8
+	or	r24, r0, r24
+	stwu	r24, 4(r7)
+	mr	r0, r12
+
+L(e):	srwi	r5, r5, 2	C count for unrolled loop
+	addi	r5, r5, -1
+	mtctr	r5
+	lwz	r28, 4(r4)
+	lwz	r29, 8(r4)
+	lwz	r30, 12(r4)
+	lwzu	r31, 16(r4)
+
+L(loopU):
+	srw	r9, r28, r6
+	slw	r24, r28, r8
+	lwz	r28, 4(r4)
+	srw	r10, r29, r6
+	slw	r25, r29, r8
+	lwz	r29, 8(r4)
+	srw	r11, r30, r6
+	slw	r26, r30, r8
+	lwz	r30, 12(r4)
+	srw	r12, r31, r6
+	slw	r27, r31, r8
+	lwzu	r31, 16(r4)
+	or	r24, r0, r24
+	stw	r24, 4(r7)
+	or	r25, r9, r25
+	stw	r25, 8(r7)
+	or	r26, r10, r26
+	stw	r26, 12(r7)
+	or	r27, r11, r27
+	stwu	r27, 16(r7)
+	mr	r0, r12
+	bdnz	L(loopU)
+
+	srw	r9, r28, r6
+	slw	r24, r28, r8
+	srw	r10, r29, r6
+	slw	r25, r29, r8
+	srw	r11, r30, r6
+	slw	r26, r30, r8
+	srw	r12, r31, r6
+	slw	r27, r31, r8
+	or	r24, r0, r24
+	stw	r24, 4(r7)
+	or	r25, r9, r25
+	stw	r25, 8(r7)
+	or	r26, r10, r26
+	stw	r26, 12(r7)
+	or	r27, r11, r27
+	stw	r27, 16(r7)
+
+	stw	r12, 20(r7)
+	lmw	r24, 8(r1)	C restore registers
+	addi	r1, r1, 48
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/sec_tabselect.asm b/third_party/gmp/mpn/powerpc32/sec_tabselect.asm
new file mode 100644
index 0000000..d50718e
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/sec_tabselect.asm
@@ -0,0 +1,143 @@
+dnl  PowerPC-32 mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C 603e:			 ?
+C 604e:			 ?
+C 75x (G3):		 ?
+C 7400,7410 (G4):	 2.5
+C 744x,745x (G4+):	 2.0
+C power4/ppc970:	 2.0
+C power5:		 ?
+
+define(`rp',     `r3')
+define(`tp',     `r4')
+define(`n',      `r5')
+define(`nents',  `r6')
+define(`which',  `r7')
+
+define(`i',      `r8')
+define(`j',      `r9')
+define(`stride', `r12')
+define(`mask',   `r11')
+
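+C  All nents table entries are read and ANDed with a mask that is all ones
+C  only for entry `which' (the addic/subfe pair yields mask = -1 exactly
+C  when the decremented copy of `which' reaches zero), so the sequence of
+C  memory accesses is independent of the secret index.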
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	stwu	r1, -32(r1)
+	addic.	j, n, -4		C outer loop induction variable
+	stmw	r27, 8(r1)
+	slwi	stride, n, 2
+
+	blt	cr0, L(outer_end)
+L(outer_top):
+	mtctr	nents
+	mr	r10, tp
+	li	r28, 0
+	li	r29, 0
+	li	r30, 0
+	li	r31, 0
+	addic.	j, j, -4		C outer loop induction variable
+	mr	i, which
+
+	ALIGN(16)
+L(top):	addic	i, i, -1		C set carry iff i != 0
+	subfe	mask, mask, mask
+	lwz	r0, 0(tp)
+	lwz	r27, 4(tp)
+	and	r0, r0, mask
+	and	r27, r27, mask
+	or	r28, r28, r0
+	or	r29, r29, r27
+	lwz	r0, 8(tp)
+	lwz	r27, 12(tp)
+	and	r0, r0, mask
+	and	r27, r27, mask
+	or	r30, r30, r0
+	or	r31, r31, r27
+	add	tp, tp, stride
+	bdnz	L(top)
+
+	stw	r28, 0(rp)
+	stw	r29, 4(rp)
+	stw	r30, 8(rp)
+	stw	r31, 12(rp)
+	addi	tp, r10, 16
+	addi	rp, rp, 16
+	bge	cr0, L(outer_top)
+L(outer_end):
+
+	andi.	r0, n, 2
+	beq	cr0, L(b0x)
+L(b1x):	mtctr	nents
+	mr	r10, tp
+	li	r28, 0
+	li	r29, 0
+	mr	i, which
+	ALIGN(16)
+L(tp2):	addic	i, i, -1
+	subfe	mask, mask, mask
+	lwz	r0, 0(tp)
+	lwz	r27, 4(tp)
+	and	r0, r0, mask
+	and	r27, r27, mask
+	or	r28, r28, r0
+	or	r29, r29, r27
+	add	tp, tp, stride
+	bdnz	L(tp2)
+	stw	r28, 0(rp)
+	stw	r29, 4(rp)
+	addi	tp, r10, 8
+	addi	rp, rp, 8
+
+L(b0x):	andi.	r0, n, 1
+	beq	cr0, L(b00)
+L(b01):	mtctr	nents
+	mr	r10, tp
+	li	r28, 0
+	mr	i, which
+	ALIGN(16)
+L(tp1):	addic	i, i, -1
+	subfe	mask, mask, mask
+	lwz	r0, 0(tp)
+	and	r0, r0, mask
+	or	r28, r28, r0
+	add	tp, tp, stride
+	bdnz	L(tp1)
+	stw	r28, 0(rp)
+
+L(b00):	lmw	r27, 8(r1)
+	addi	r1, r1, 32
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/powerpc32/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..f7aba33
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/sqr_diag_addlsh1.asm
@@ -0,0 +1,80 @@
+dnl  PowerPC-32 mpn_sqr_diag_addlsh1.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e			 ?
+C 604e			 ?
+C 75x (G3)		 ?
+C 7400,7410 (G4)	 ?
+C 744x,745x (G4+)	 6
+C power4/ppc970		 ?
+C power5		 ?
+
+C This has been feebly optimised for 7447 but not for any other CPU.
+
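+C  This is the final pass of squaring: it doubles the off-diagonal product
+C  sum in tp by shifting it left one bit and adds in the diagonal squares
+C  up[i]^2, storing the combined result at rp.
+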
+define(`rp',	r3)
+define(`tp',	r4)
+define(`up',	r5)
+define(`n',	r6)
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	addi	n, n, -1
+	addi	tp, tp, -4
+	mtctr	n
+	lwz	r0, 0(up)
+	li	r10, 0
+	mullw	r7, r0, r0
+	stw	r7, 0(rp)
+	mulhwu	r6, r0, r0
+	addic	r31, r31, 0	C clear CA (carry)
+
+	ALIGN(16)
+L(top):	lwzu	r0, 4(up)
+	mullw	r7, r0, r0
+	lwz	r8, 4(tp)
+	lwzu	r9, 8(tp)
+	rlwimi	r10, r8, 1,0,30
+	srwi	r11, r8, 31
+	rlwimi	r11, r9, 1,0,30
+	adde	r10, r10, r6
+	adde	r11, r11, r7
+	stw	r10, 4(rp)
+	srwi	r10, r9, 31
+	mulhwu	r6, r0, r0
+	stwu	r11, 8(rp)
+	bdnz	L(top)
+
+	adde	r10, r10, r6
+	stw	r10, 4(rp)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/sublsh1_n.asm b/third_party/gmp/mpn/powerpc32/sublsh1_n.asm
new file mode 100644
index 0000000..6dc6460
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/sublsh1_n.asm
@@ -0,0 +1,101 @@
+dnl  PowerPC-32 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl  Copyright 2003, 2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            4.0
+C 75x (G3):        5.0
+C 7400,7410 (G4):  5.0
+C 744x,745x (G4+): 5.0
+C power4/ppc970:   4.25
+C power5:          5.0
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+define(`rp',`r3')
+define(`up',`r4')
+define(`vp',`r5')
+
+define(`s0',`r6')
+define(`s1',`r7')
+define(`u0',`r8')
+define(`v0',`r10')
+define(`v1',`r11')
+
+ASM_START()
+PROLOGUE(mpn_sublsh1_n)
+	mtctr	r6		C copy n into ctr
+
+	lwz	v0, 0(vp)	C load v limb
+	lwz	u0, 0(up)	C load u limb
+	addic	up, up, -4	C update up; set cy
+	addi	rp, rp, -4	C update rp
+	slwi	s1, v0, 1
+	bdz	L(end)		C If done, skip loop
+
+L(loop):
+	lwz	v1, 4(vp)	C load v limb
+	subfe	s1, s1, u0	C subtract limbs with cy, set cy
+	srwi	s0, v0, 31	C shift down previous v limb
+	stw	s1, 4(rp)	C store result limb
+	lwzu	u0, 8(up)	C load u limb and update up
+	rlwimi	s0, v1, 1, 0,30	C left shift v limb and merge with prev v limb
+
+	bdz	L(exit)		C decrement ctr and exit if done
+
+	lwzu	v0, 8(vp)	C load v limb and update vp
+	subfe	s0, s0, u0	C subtract limbs with cy, set cy
+	srwi	s1, v1, 31	C shift down previous v limb
+	stwu	s0, 8(rp)	C store result limb and update rp
+	lwz	u0, 4(up)	C load u limb
+	rlwimi	s1, v0, 1, 0,30	C left shift v limb and merge with prev v limb
+
+	bdnz	L(loop)		C decrement ctr and loop back
+
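+C  The return value is the bit shifted out of the top v limb plus the
+C  final borrow, a value in {0, 1, 2}, reconstructed below via subfze/neg.
+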
+L(end):	subfe	r7, s1, u0
+	srwi	r4, v0, 31
+	stw	r7, 4(rp)	C store last result limb
+	subfze	r3, r4
+	neg	r3, r3
+	blr
+L(exit):
+	subfe	r7, s0, u0
+	srwi	r4, v1, 31
+	stw	r7, 8(rp)	C store last result limb
+	subfze	r3, r4
+	neg	r3, r3
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/submul_1.asm b/third_party/gmp/mpn/powerpc32/submul_1.asm
new file mode 100644
index 0000000..8ef37b0
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/submul_1.asm
@@ -0,0 +1,151 @@
+dnl  PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl  the result from a second limb vector.
+
+dnl  Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            7.5
+C 75x (G3):        9.3-15
+C 7400,7410 (G4):  9.3-15
+C 744x,745x (G4+): 10.5
+C power4/ppc970:   6.75
+C power5:          6.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C vl	r6
+
+C This is optimized for the PPC604.  See addmul_1.asm for additional comments.
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	cmpwi	cr0,r5,9	C more than 9 limbs?
+	bgt	cr0,L(big)	C branch if more than 9 limbs
+
+	mtctr	r5
+	lwz	r0,0(r4)
+	mullw	r7,r0,r6
+	mulhwu	r10,r0,r6
+	lwz	r9,0(r3)
+	subfc	r8,r7,r9
+	addc	r7,r7,r8	C invert cy (r7 is junk)
+	addi	r3,r3,-4
+	bdz	L(end)
+L(loop):
+	lwzu	r0,4(r4)
+	stwu	r8,4(r3)
+	mullw	r8,r0,r6
+	adde	r7,r8,r10
+	mulhwu	r10,r0,r6
+	lwz	r9,4(r3)
+	addze	r10,r10
+	subfc	r8,r7,r9
+	addc	r7,r7,r8	C invert cy (r7 is junk)
+	bdnz	L(loop)
+L(end):	stw	r8,4(r3)
+	addze	r3,r10
+	blr
+
+L(big):	stwu	r1,-16(r1)
+	addi	r5,r5,-1
+	stw	r30,8(r1)
+	srwi	r0,r5,2
+	stw	r31,12(r1)
+	mtctr	r0
+
+	lwz	r7,0(r4)
+	mullw	r8,r7,r6
+	mulhwu	r0,r7,r6
+	lwz	r7,0(r3)
+	subfc	r7,r8,r7
+	addc	r8,r8,r7
+	stw	r7,0(r3)
+
+L(loopU):
+	lwz	r7,4(r4)
+	lwz	r12,8(r4)
+	lwz	r30,12(r4)
+	lwzu	r31,16(r4)
+	mullw	r8,r7,r6
+	mullw	r9,r12,r6
+	mullw	r10,r30,r6
+	mullw	r11,r31,r6
+	adde	r8,r8,r0	C add cy_limb
+	mulhwu	r0,r7,r6
+	lwz	r7,4(r3)
+	adde	r9,r9,r0
+	mulhwu	r0,r12,r6
+	lwz	r12,8(r3)
+	adde	r10,r10,r0
+	mulhwu	r0,r30,r6
+	lwz	r30,12(r3)
+	adde	r11,r11,r0
+	mulhwu	r0,r31,r6
+	lwz	r31,16(r3)
+	addze	r0,r0		C new cy_limb
+	subfc	r7,r8,r7
+	stw	r7,4(r3)
+	subfe	r12,r9,r12
+	stw	r12,8(r3)
+	subfe	r30,r10,r30
+	stw	r30,12(r3)
+	subfe	r31,r11,r31
+	stwu	r31,16(r3)
+	subfe	r11,r11,r11	C invert ...
+	addic	r11,r11,1	C ... carry
+	bdnz	L(loopU)
+
+	andi.	r31,r5,3
+	mtctr	r31
+	beq	cr0,L(endx)
+
+L(loopE):
+	lwzu	r7,4(r4)
+	mullw	r8,r7,r6
+	adde	r8,r8,r0	C add cy_limb
+	mulhwu	r0,r7,r6
+	lwz	r7,4(r3)
+	addze	r0,r0		C new cy_limb
+	subfc	r7,r8,r7
+	addc	r8,r8,r7
+	stwu	r7,4(r3)
+	bdnz	L(loopE)
+L(endx):
+	addze	r3,r0
+	lwz	r30,8(r1)
+	lwz	r31,12(r1)
+	addi	r1,r1,16
+	blr
+EPILOGUE(mpn_submul_1)
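
For cross-checking the two unrolled loops above, a C model of mpn_submul_1's
contract, again assuming 32-bit limbs (the helper name is illustrative):

    #include <stdint.h>

    /* {rp,n} -= {up,n} * vl; returns the high limb that could not be
       subtracted (the borrow limb). */
    uint32_t submul_1_model(uint32_t *rp, const uint32_t *up, long n,
                            uint32_t vl)
    {
        uint32_t cy = 0;
        for (long i = 0; i < n; i++) {
            uint64_t prod = (uint64_t)up[i] * vl + cy;  /* mullw + mulhwu */
            uint32_t lo = (uint32_t)prod;
            cy = (uint32_t)(prod >> 32);
            uint32_t r = rp[i];
            rp[i] = r - lo;
            cy += r < lo;                               /* propagate borrow */
        }
        return cy;
    }
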
diff --git a/third_party/gmp/mpn/powerpc32/umul.asm b/third_party/gmp/mpn/powerpc32/umul.asm
new file mode 100644
index 0000000..a5811e1
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/umul.asm
@@ -0,0 +1,50 @@
+dnl  PowerPC-32 umul_ppmm -- support for longlong.h
+
+dnl  Copyright 2000, 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+
+	C r3	lowptr
+	C r4	m1
+	C r5	m2
+
+	mullw	r0, r4, r5
+	mulhwu	r9, r4, r5
+	stw	r0, 0(r3)
+	mr	r3, r9
+	blr
+
+EPILOGUE(mpn_umul_ppmm)
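
The whole function is four instructions; its C equivalent, as a sketch with
32-bit limbs (function name ours):

    #include <stdint.h>

    /* Full 32x32->64 multiply: store the low half through lowptr (mullw),
       return the high half (mulhwu). */
    uint32_t umul_ppmm_model(uint32_t *lowptr, uint32_t m1, uint32_t m2)
    {
        uint64_t p = (uint64_t)m1 * m2;
        *lowptr = (uint32_t)p;
        return (uint32_t)(p >> 32);
    }
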
diff --git a/third_party/gmp/mpn/powerpc32/vmx/copyd.asm b/third_party/gmp/mpn/powerpc32/vmx/copyd.asm
new file mode 100644
index 0000000..dee7266
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/vmx/copyd.asm
@@ -0,0 +1,203 @@
+dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                16-byte coaligned      unaligned
+C                   cycles/limb        cycles/limb
+C 7400,7410 (G4):       0.5                0.64
+C 744x,745x (G4+):      0.75               0.82
+C 970 (G5):             0.78               1.02		(64-bit limbs)
+
+C STATUS
+C  * Works for all sizes and alignments.
+
+C TODO
+C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
+C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
+C    c/l for 970.
+C  * Consider using VMX instructions also for head and tail, by using some
+C    read-modify-write tricks.
+C  * The VMX code is used from the smallest sizes it handles, but measurements
+C    show a large speed bump at the cutoff points.  Small copying (perhaps
+C    using some read-modify-write technique) should be optimized.
+C  * Make an mpn_com based on this code.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+
+ifelse(GMP_LIMB_BITS,32,`
+	define(`LIMB32',`	$1')
+	define(`LIMB64',`')
+',`
+	define(`LIMB32',`')
+	define(`LIMB64',`	$1')
+')
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`n',	`r5')
+
+define(`us',	`v4')
+
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+
+LIMB32(`slwi.	r0, n, 2	')
+LIMB64(`sldi.	r0, n, 3	')
+	add	rp, rp, r0
+	add	up, up, r0
+
+LIMB32(`cmpi	cr7, n, 11	')
+LIMB64(`cmpdi	cr7, n, 5	')
+	bge	cr7, L(big)
+
+	beqlr	cr0
+
+C Handle small cases with plain operations
+	mtctr	n
+L(topS):
+LIMB32(`lwz	r0, -4(up)	')
+LIMB64(`ld	r0, -8(up)	')
+	addi	up, up, -GMP_LIMB_BYTES
+LIMB32(`stw	r0, -4(rp)	')
+LIMB64(`std	r0, -8(rp)	')
+	addi	rp, rp, -GMP_LIMB_BYTES
+	bdnz	L(topS)
+	blr
+
+C Handle large cases with VMX operations
+L(big):
+	addi	rp, rp, -16
+	addi	up, up, -16
+	mfspr	r12, 256
+	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
+	mtspr	256, r0
+
+LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
+LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
+	beq	L(rp_aligned)
+
+	subf	n, r7, n
+L(top0):
+LIMB32(`lwz	r0, 12(up)	')
+LIMB64(`ld	r0, 8(up)	')
+	addi	up, up, -GMP_LIMB_BYTES
+LIMB32(`addic.	r7, r7, -1	')
+LIMB32(`stw	r0, 12(rp)	')
+LIMB64(`std	r0, 8(rp)	')
+	addi	rp, rp, -GMP_LIMB_BYTES
+LIMB32(`bne	L(top0)		')
+
+L(rp_aligned):
+
+LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
+LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2
+
+LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
+LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
+	mtctr	r7			C copy n to count register
+
+	li	r10, -16
+
+	beq	L(up_aligned)
+
+	lvsl	us, 0, up
+
+	addi	up, up, 16
+LIMB32(`andi.	r0, n, 0x4	')
+LIMB64(`andi.	r0, n, 0x2	')
+	beq	L(1)
+	lvx	v0, 0, up
+	lvx	v2, r10, up
+	vperm	v3, v2, v0, us
+	stvx	v3, 0, rp
+	addi	up, up, -32
+	addi	rp, rp, -16
+	b	L(lpu)
+L(1):	lvx	v2, 0, up
+	addi	up, up, -16
+	b	L(lpu)
+
+	ALIGN(32)
+L(lpu):	lvx	v0, 0, up
+	vperm	v3, v0, v2, us
+	stvx	v3, 0, rp
+	lvx	v2, r10, up
+	addi	up, up, -32
+	vperm	v3, v2, v0, us
+	stvx	v3, r10, rp
+	addi	rp, rp, -32
+	bdnz	L(lpu)
+
+	b	L(tail)
+
+L(up_aligned):
+
+LIMB32(`andi.	r0, n, 0x4	')
+LIMB64(`andi.	r0, n, 0x2	')
+	beq	L(lpa)
+	lvx	v0, 0,   up
+	stvx	v0, 0,   rp
+	addi	up, up, -16
+	addi	rp, rp, -16
+	b	L(lpa)
+
+	ALIGN(32)
+L(lpa):	lvx	v0, 0,   up
+	lvx	v1, r10, up
+	addi	up, up, -32
+	nop
+	stvx	v0, 0,   rp
+	stvx	v1, r10, rp
+	addi	rp, rp, -32
+	bdnz	L(lpa)
+
+L(tail):
+LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
+LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
+	beq	L(ret)
+LIMB32(`li	r10, 12		')
+L(top2):
+LIMB32(`lwzx	r0, r10, up	')
+LIMB64(`ld	r0, 8(up)	')
+LIMB32(`addic.	r7, r7, -1	')
+LIMB32(`stwx	r0, r10, rp	')
+LIMB64(`std	r0, 8(rp)	')
+LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
+LIMB32(`bne	L(top2)		')
+
+L(ret):	mtspr	256, r12
+	blr
+EPILOGUE()
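
Behaviourally, all of the VMX alignment machinery above reduces to a
decreasing-order limb copy; a C model (illustrative name, 32-bit limbs):

    #include <stdint.h>

    /* Copy n limbs from up to rp, highest address first, so the copy is
       safe when the regions overlap with rp > up. */
    void copyd_model(uint32_t *rp, const uint32_t *up, long n)
    {
        for (long i = n - 1; i >= 0; i--)
            rp[i] = up[i];
    }

mpn_copyi, next in this patch, is the increasing-order counterpart (safe for
overlap with rp < up), so no separate model is shown for it.
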
diff --git a/third_party/gmp/mpn/powerpc32/vmx/copyi.asm b/third_party/gmp/mpn/powerpc32/vmx/copyi.asm
new file mode 100644
index 0000000..992b468
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/vmx/copyi.asm
@@ -0,0 +1,198 @@
+dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                16-byte coaligned      unaligned
+C                   cycles/limb        cycles/limb
+C 7400,7410 (G4):       0.5                0.64
+C 744x,745x (G4+):      0.75               0.82
+C 970 (G5):             0.78               1.02		(64-bit limbs)
+
+C STATUS
+C  * Works for all sizes and alignments.
+
+C TODO
+C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
+C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
+C    c/l for 970.
+C  * Consider using VMX instructions also for head and tail, by using some
+C    read-modify-write tricks.
+C  * The VMX code is used from the smallest sizes it handles, but measurements
+C    show a large speed bump at the cutoff points.  Small copying (perhaps
+C    using some read-modify-write technique) should be optimized.
+C  * Make an mpn_com based on this code.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+
+ifelse(GMP_LIMB_BITS,32,`
+	define(`LIMB32',`	$1')
+	define(`LIMB64',`')
+',`
+	define(`LIMB32',`')
+	define(`LIMB64',`	$1')
+')
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`n',	`r5')
+
+define(`us',	`v4')
+
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+
+LIMB32(`cmpi	cr7, n, 11	')
+LIMB64(`cmpdi	cr7, n, 5	')
+	bge	cr7, L(big)
+
+	or.	r0, n, n
+	beqlr	cr0
+
+C Handle small cases with plain operations
+	mtctr	n
+L(topS):
+LIMB32(`lwz	r0, 0(up)	')
+LIMB64(`ld	r0, 0(up)	')
+	addi	up, up, GMP_LIMB_BYTES
+LIMB32(`stw	r0, 0(rp)	')
+LIMB64(`std	r0, 0(rp)	')
+	addi	rp, rp, GMP_LIMB_BYTES
+	bdnz	L(topS)
+	blr
+
+C Handle large cases with VMX operations
+L(big):
+	mfspr	r12, 256
+	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
+	mtspr	256, r0
+
+LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
+LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
+	beq	L(rp_aligned)
+
+	subfic	r7, r7, LIMBS_PER_VR
+	subf	n, r7, n
+L(top0):
+LIMB32(`lwz	r0, 0(up)	')
+LIMB64(`ld	r0, 0(up)	')
+	addi	up, up, GMP_LIMB_BYTES
+LIMB32(`addic.	r7, r7, -1	')
+LIMB32(`stw	r0, 0(rp)	')
+LIMB64(`std	r0, 0(rp)	')
+	addi	rp, rp, GMP_LIMB_BYTES
+LIMB32(`bne	L(top0)		')
+
+L(rp_aligned):
+
+LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
+LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2
+
+LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
+LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
+	mtctr	r7			C copy n to count register
+
+	li	r10, 16
+
+	beq	L(up_aligned)
+
+	lvsl	us, 0, up
+
+LIMB32(`andi.	r0, n, 0x4	')
+LIMB64(`andi.	r0, n, 0x2	')
+	beq	L(1)
+	lvx	v0, 0, up
+	lvx	v2, r10, up
+	vperm	v3, v0, v2, us
+	stvx	v3, 0, rp
+	addi	up, up, 32
+	addi	rp, rp, 16
+	b	L(lpu)
+L(1):	lvx	v2, 0, up
+	addi	up, up, 16
+	b	L(lpu)
+
+	ALIGN(32)
+L(lpu):	lvx	v0, 0, up
+	vperm	v3, v2, v0, us
+	stvx	v3, 0, rp
+	lvx	v2, r10, up
+	addi	up, up, 32
+	vperm	v3, v0, v2, us
+	stvx	v3, r10, rp
+	addi	rp, rp, 32
+	bdnz	L(lpu)
+
+	addi	up, up, -16
+	b	L(tail)
+
+L(up_aligned):
+
+LIMB32(`andi.	r0, n, 0x4	')
+LIMB64(`andi.	r0, n, 0x2	')
+	beq	L(lpa)
+	lvx	v0, 0,   up
+	stvx	v0, 0,   rp
+	addi	up, up, 16
+	addi	rp, rp, 16
+	b	L(lpa)
+
+	ALIGN(32)
+L(lpa):	lvx	v0, 0,   up
+	lvx	v1, r10, up
+	addi	up, up, 32
+	nop
+	stvx	v0, 0,   rp
+	stvx	v1, r10, rp
+	addi	rp, rp, 32
+	bdnz	L(lpa)
+
+L(tail):
+LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
+LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
+	beq	L(ret)
+LIMB32(`li	r10, 0		')
+L(top2):
+LIMB32(`lwzx	r0, r10, up	')
+LIMB64(`ld	r0, 0(up)	')
+LIMB32(`addic.	r7, r7, -1	')
+LIMB32(`stwx	r0, r10, rp	')
+LIMB64(`std	r0, 0(rp)	')
+LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
+LIMB32(`bne	L(top2)		')
+
+L(ret):	mtspr	256, r12
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc32/vmx/logops_n.asm b/third_party/gmp/mpn/powerpc32/vmx/logops_n.asm
new file mode 100644
index 0000000..d656d3b
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/vmx/logops_n.asm
@@ -0,0 +1,310 @@
+dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
+dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
+dnl  logical operations.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C               and,ior,andn,nior,xor    iorn,xnor         nand
+C                   cycles/limb         cycles/limb    cycles/limb
+C 7400,7410 (G4):       1.39                 ?              ?
+C 744x,745x (G4+):      1.14                1.39           1.39
+C 970:                  1.7                 2.0            2.0
+
+C STATUS
+C  * Works for all sizes and alignment for 32-bit limbs.
+C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
+C  * Current performance makes this pointless for 970.
+
+C TODO
+C  * Might want to make variants when just one of the source operands needs
+C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
+C  * Idea: If the source operands are equally aligned, we could do the logops
+C    first, then vperm before storing!  That means we never need more than one
+C    vperm, ever!
+C  * Perhaps align `rp' after initial alignment loop?
+C  * Instead of having scalar code in the beginning and end, consider using
+C    read-modify-write vector code.
+C  * Software pipeline?  Hopefully not too important, this is hairy enough
+C    already.
+C  * At least be more clever about operand loading, i.e., load v operands before
+C    u operands, since v operands are sometimes negated.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+define(`vnegb', `')		C default neg-before to null
+define(`vnega', `')		C default neg-after to null
+
+ifdef(`OPERATION_and_n',
+`	define(`func',	`mpn_and_n')
+	define(`logopS',`and	$1,$2,$3')
+	define(`logop',	`vand	$1,$2,$3')')
+ifdef(`OPERATION_andn_n',
+`	define(`func',	`mpn_andn_n')
+	define(`logopS',`andc	$1,$2,$3')
+	define(`logop',	`vandc	$1,$2,$3')')
+ifdef(`OPERATION_nand_n',
+`	define(`func',	`mpn_nand_n')
+	define(`logopS',`nand	$1,$2,$3')
+	define(`logop',	`vand	$1,$2,$3')
+	define(`vnega',	`vnor	$1,$2,$2')')
+ifdef(`OPERATION_ior_n',
+`	define(`func',	`mpn_ior_n')
+	define(`logopS',`or	$1,$2,$3')
+	define(`logop',	`vor	$1,$2,$3')')
+ifdef(`OPERATION_iorn_n',
+`	define(`func',	`mpn_iorn_n')
+	define(`logopS',`orc	$1,$2,$3')
+	define(`vnegb',	`vnor	$1,$2,$2')
+	define(`logop',	`vor	$1,$2,$3')')
+ifdef(`OPERATION_nior_n',
+`	define(`func',	`mpn_nior_n')
+	define(`logopS',`nor	$1,$2,$3')
+	define(`logop',	`vnor	$1,$2,$3')')
+ifdef(`OPERATION_xor_n',
+`	define(`func',	`mpn_xor_n')
+	define(`logopS',`xor	$1,$2,$3')
+	define(`logop',	`vxor	$1,$2,$3')')
+ifdef(`OPERATION_xnor_n',
+`	define(`func',`mpn_xnor_n')
+	define(`logopS',`eqv	$1,$2,$3')
+	define(`vnegb',	`vnor	$1,$2,$2')
+	define(`logop',	`vxor	$1,$2,$3')')
+
+ifelse(GMP_LIMB_BITS,`32',`
+	define(`LIMB32',`	$1')
+	define(`LIMB64',`')
+',`
+	define(`LIMB32',`')
+	define(`LIMB64',`	$1')
+')
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`vp',	`r5')
+define(`n',	`r6')
+
+define(`us',	`v8')
+define(`vs',	`v9')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+
+LIMB32(`cmpwi	cr0, n, 8	')
+LIMB64(`cmpdi	cr0, n, 4	')
+	bge	L(big)
+
+	mtctr	n
+
+LIMB32(`lwz	r8, 0(up)	')
+LIMB32(`lwz	r9, 0(vp)	')
+LIMB32(`logopS(	r0, r8, r9)	')
+LIMB32(`stw	r0, 0(rp)	')
+LIMB32(`bdz	L(endS)		')
+
+L(topS):
+LIMB32(`lwzu	r8, 4(up)	')
+LIMB64(`ld	r8, 0(up)	')
+LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
+LIMB32(`lwzu	r9, 4(vp)	')
+LIMB64(`ld	r9, 0(vp)	')
+LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
+	logopS(	r0, r8, r9)
+LIMB32(`stwu	r0, 4(rp)	')
+LIMB64(`std	r0, 0(rp)	')
+LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
+	bdnz	L(topS)
+L(endS):
+	blr
+
+L(big):	mfspr	r12, 256
+	oris	r0, r12, 0xfffc		C Set VRSAVE bit 0-13 FIXME
+	mtspr	256, r0
+
+C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
+C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
+
+LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
+LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
+	beq	L(aligned)
+
+	subfic	r7, r0, LIMBS_PER_VR
+LIMB32(`li	r10, 0		')
+	subf	n, r7, n
+L(top0):
+LIMB32(`lwz	r8, 0(up)	')
+LIMB64(`ld	r8, 0(up)	')
+	addi	up, up, GMP_LIMB_BYTES
+LIMB32(`lwz	r9, 0(vp)	')
+LIMB64(`ld	r9, 0(vp)	')
+	addi	vp, vp, GMP_LIMB_BYTES
+LIMB32(`addic.	r7, r7, -1	')
+	logopS(	r0, r8, r9)
+LIMB32(`stwx	r0, r10, rp	')
+LIMB64(`std	r0, 0(rp)	')
+LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
+LIMB32(`bne	L(top0)		')
+
+	addi	rp, rp, 16		C update rp, but preserve its alignment
+
+L(aligned):
+LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
+LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
+	mtctr	r7			C copy n to count register
+
+	li	r10, 16
+	lvsl	us, 0, up
+	lvsl	vs, 0, vp
+
+	lvx	v2, 0, up
+	lvx	v3, 0, vp
+	bdnz	L(gt1)
+	lvx	v0, r10, up
+	lvx	v1, r10, vp
+	vperm	v4, v2, v0, us
+	vperm	v5, v3, v1, vs
+	vnegb(	v5, v5)
+	logop(	v6, v4, v5)
+	vnega(	v6, v6)
+	stvx	v6, 0, rp
+	addi	up, up, 16
+	addi	vp, vp, 16
+	addi	rp, rp, 4
+	b	L(tail)
+
+L(gt1):	addi	up, up, 16
+	addi	vp, vp, 16
+
+L(top):	lvx	v0, 0, up
+	lvx	v1, 0, vp
+	vperm	v4, v2, v0, us
+	vperm	v5, v3, v1, vs
+	vnegb(	v5, v5)
+	logop(	v6, v4, v5)
+	vnega(	v6, v6)
+	stvx	v6, 0, rp
+	bdz	L(end)
+	lvx	v2, r10, up
+	lvx	v3, r10, vp
+	vperm	v4, v0, v2, us
+	vperm	v5, v1, v3, vs
+	vnegb(	v5, v5)
+	logop(	v6, v4, v5)
+	vnega(	v6, v6)
+	stvx	v6, r10, rp
+	addi	up, up, 32
+	addi	vp, vp, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+	andi.	r0, up, 15
+	vxor	v0, v0, v0
+	beq	1f
+	lvx	v0, 0, up
+1:	andi.	r0, vp, 15
+	vxor	v1, v1, v1
+	beq	1f
+	lvx	v1, 0, vp
+1:	vperm	v4, v2, v0, us
+	vperm	v5, v3, v1, vs
+	vnegb(	v5, v5)
+	logop(	v6, v4, v5)
+	vnega(	v6, v6)
+	stvx	v6, 0, rp
+	addi	rp, rp, 4
+	b	L(tail)
+
+L(end):	andi.	r0, up, 15
+	vxor	v2, v2, v2
+	beq	1f
+	lvx	v2, r10, up
+1:	andi.	r0, vp, 15
+	vxor	v3, v3, v3
+	beq	1f
+	lvx	v3, r10, vp
+1:	vperm	v4, v0, v2, us
+	vperm	v5, v1, v3, vs
+	vnegb(	v5, v5)
+	logop(	v6, v4, v5)
+	vnega(	v6, v6)
+	stvx	v6, r10, rp
+
+	addi	up, up, 16
+	addi	vp, vp, 16
+	addi	rp, rp, 20
+
+L(tail):
+LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
+LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
+	beq	L(ret)
+	addi	rp, rp, 15
+LIMB32(`rlwinm	rp, rp, 0,0,27	')
+LIMB64(`rldicr	rp, rp, 0,59	')
+	li	r10, 0
+L(top2):
+LIMB32(`lwzx	r8, r10, up	')
+LIMB64(`ldx	r8, r10, up	')
+LIMB32(`lwzx	r9, r10, vp	')
+LIMB64(`ldx	r9, r10, vp	')
+LIMB32(`addic.	r7, r7, -1	')
+	logopS(	r0, r8, r9)
+LIMB32(`stwx	r0, r10, rp	')
+LIMB64(`std	r0, 0(rp)	')
+LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
+LIMB32(`bne	L(top2)		')
+
+L(ret):	mtspr	256, r12
+	blr
+EPILOGUE()
+
+C This works for 64-bit PowerPC, since a limb ptr can only be aligned
+C in 2 relevant ways, which means we can always find a pair of aligned
+C pointers among rp, up, and vp.
+C process words until rp is 16-byte aligned
+C if (((up | vp) & 15) == 0)
+C   process with VMX without any vperm
+C else if ((up & 15) != 0 && (vp & 15) != 0)
+C   process with VMX using vperm on store data
+C else if ((up & 15) != 0)
+C   process with VMX using vperm on up data
+C else
+C   process with VMX using vperm on vp data
+C
+C	rlwinm,	r0, up, 0,28,31
+C	rlwinm	r0, vp, 0,28,31
+C	cmpwi	cr7, r0, 0
+C	cror	cr6, cr0, cr7
+C	crand	cr0, cr0, cr7
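
Since this one source file provides eight entry points selected through the
OPERATION_* defines above, a compact C model of the limb-wise semantics may
help (names illustrative, 32-bit limbs assumed):

    #include <stdint.h>

    /* Limb-wise models of the eight ops.  iorn and xnor complement the v
       operand first (vnegb above); nand complements the result (vnega). */
    #define LOGOP_MODEL(name, expr)                                        \
        void name(uint32_t *rp, const uint32_t *up, const uint32_t *vp,    \
                  long n)                                                  \
        { for (long i = 0; i < n; i++) rp[i] = (expr); }

    LOGOP_MODEL(and_n_model,    up[i] &  vp[i])
    LOGOP_MODEL(andn_n_model,   up[i] & ~vp[i])
    LOGOP_MODEL(nand_n_model, ~(up[i] &  vp[i]))
    LOGOP_MODEL(ior_n_model,    up[i] |  vp[i])
    LOGOP_MODEL(iorn_n_model,   up[i] | ~vp[i])
    LOGOP_MODEL(nior_n_model, ~(up[i] |  vp[i]))
    LOGOP_MODEL(xor_n_model,    up[i] ^  vp[i])
    LOGOP_MODEL(xnor_n_model, ~(up[i] ^  vp[i]))
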
diff --git a/third_party/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm b/third_party/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm
new file mode 100644
index 0000000..2bb11cd
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/vmx/mod_34lsub1.asm
@@ -0,0 +1,388 @@
+dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
+
+dnl  Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C                cycles/limb
+C 603e:              -
+C 604e:              -
+C 75x (G3):          -
+C 7400,7410 (G4):    1          simple load-use scheduling results in 0.75
+C 744x,745x (G4+):   0.75
+C ppc970:            0.75
+C power4:            -
+C power5:            -
+
+C TODO
+C  * Either start using the low-end masking constants, or remove them.
+C  * Merge multiple feed-in cases into a parameterized code block.
+C  * Reduce register usage.  It should be possible to almost halve it.
+
+define(`up', `r3')
+define(`n', `r4')
+
+define(`a0', `v3')
+define(`a1', `v4')
+define(`a2', `v5')
+define(`c0', `v6')
+define(`c1', `v7')
+define(`c2', `v8')
+define(`z',  `v9')
+define(`x0', `v10')
+define(`x1', `v11')
+define(`x2', `v12')
+define(`x3', `v13')
+define(`pv', `v14')
+define(`y0', `v0')
+define(`y1', `v1')
+define(`y2', `v2')
+define(`y3', `v15')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	cmpwi	cr0, n, 20		C tuned cutoff point
+	bge	L(large)
+
+	li	r9, 0			C result accumulator
+	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
+	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
+	beq	L(small_tail)
+	mtctr	r10
+	lwz	r6, 0(up)
+	lwz	r7, 4(up)
+	lwzu	r8, 8(up)
+	subf	n, r10, n
+	subf	n, r10, n
+	subf	n, r10, n
+	bdz	L(small_end)
+
+	ALIGN(16)
+L(los):	rlwinm	r0, r6, 0,8,31
+	add	r9, r9, r0		C add 24b from u0
+	srwi	r0, r6, 24
+	lwz	r6, 4(up)
+	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
+	add	r9, r9, r0		C add 8b from u0 and 16b from u1
+	srwi	r0, r7, 16
+	lwz	r7, 8(up)
+	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
+	add	r9, r9, r0		C add 16b from u1 and 8b from u2
+	srwi	r0, r8, 8		C --222222
+	lwzu	r8, 12(up)
+	add	r9, r9, r0		C add 24b from u2
+	bdnz	L(los)
+L(small_end):
+	rlwinm	r0, r6, 0,8,31
+	add	r9, r9, r0		C add 24b from u0
+	srwi	r0, r6, 24
+	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
+	add	r9, r9, r0		C add 8b from u0 and 16b from u1
+	srwi	r0, r7, 16
+	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
+	add	r9, r9, r0		C add 16b from u1 and 8b from u2
+	srwi	r0, r8, 8		C --222222
+	add	r9, r9, r0		C add 24b from u2
+
+	addi	up, up, 4
+	rlwinm	r0, r9, 0,8,31
+	srwi	r9, r9, 24
+	add	r9, r9, r0
+
+L(small_tail):
+	cmpi	cr0, n, 1
+	blt	L(ret)
+
+	lwz	r6, 0(up)
+	rlwinm	r0, r6, 0,8,31
+	srwi	r6, r6, 24
+	add	r9, r9, r0
+	add	r9, r9, r6
+
+	beq	L(ret)
+
+	lwz	r6, 4(up)
+	rlwinm	r0, r6, 8,8,23
+	srwi	r6, r6, 16
+	add	r9, r9, r0
+	add	r9, r9, r6
+
+L(ret):	mr	r3, r9
+	blr
+
+
+L(large):
+	stwu	r1, -32(r1)
+	mfspr	r10, 256
+	oris	r0, r10, 0xffff		C Set VRSAVE bit 0-15
+	mtspr	256, r0
+
+	andi.	r7, up, 15
+	vxor	a0, v0, v0
+	lis	r9, 0xaaaa
+	vxor	a1, v0, v0
+	ori	r9, r9, 0xaaab
+	vxor	a2, v0, v0
+	li	r5, 16
+	vxor	c0, v0, v0
+	li	r6, 32
+	vxor	c1, v0, v0
+	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
+	vxor	c2, v0, v0
+	vxor	z, v0, v0
+
+	beq	L(aligned16)
+
+	cmpwi	cr7, r7, 8
+	bge	cr7, L(na4)
+
+	lvx	a2, 0, up
+	addi	up, up, 16
+	vsldoi	a2, a2, z, 4
+	vsldoi	a2, z, a2, 12
+
+	addi	n, n, 9
+	mulhwu	r0, n, r9
+	srwi	r0, r0, 3		C r0 = floor(n/12)
+	mtctr	r0
+
+	mulli	r8, r0, 12
+	subf	n, r8, n
+	b	L(2)
+
+L(na4):	bne	cr7, L(na8)
+
+	lvx	a1, 0, up
+	addi	up, up, -16
+	vsldoi	a1, a1, z, 8
+	vsldoi	a1, z, a1, 8
+
+	addi	n, n, 6
+	mulhwu	r0, n, r9
+	srwi	r0, r0, 3		C r0 = floor(n/12)
+	mtctr	r0
+
+	mulli	r8, r0, 12
+	subf	n, r8, n
+	b	L(1)
+
+L(na8):
+	lvx	a0, 0, up
+	vsldoi	a0, a0, z, 12
+	vsldoi	a0, z, a0, 4
+
+	addi	n, n, 3
+	mulhwu	r0, n, r9
+	srwi	r0, r0, 3		C r0 = floor(n/12)
+	mtctr	r0
+
+	mulli	r8, r0, 12
+	subf	n, r8, n
+	b	L(0)
+
+L(aligned16):
+	mulhwu	r0, n, r9
+	srwi	r0, r0, 3		C r0 = floor(n/12)
+	mtctr	r0
+
+	mulli	r8, r0, 12
+	subf	n, r8, n
+
+	lvx	a0, 0, up
+L(0):	lvx	a1, r5, up
+L(1):	lvx	a2, r6, up
+	addi	up, up, 48
+L(2):	bdz	L(end)
+	li	r12, 256
+	li	r9, 288
+	ALIGN(32)
+L(top):
+	lvx	v0, 0, up
+	vaddcuw	v10, a0, v0
+	vadduwm	a0, a0, v0
+	vadduwm	c0, c0, v10
+
+	lvx	v1, r5, up
+	vaddcuw	v10, a1, v1
+	vadduwm	a1, a1, v1
+	vadduwm	c1, c1, v10
+
+	lvx	v2, r6, up
+	dcbt	up, r12
+	dcbt	up, r9
+	addi	up, up, 48
+	vaddcuw	v10, a2, v2
+	vadduwm	a2, a2, v2
+	vadduwm	c2, c2, v10
+	bdnz	L(top)
+
+L(end):
+C n = 0...11
+	cmpwi	cr0, n, 0
+	beq	L(sum)
+	cmpwi	cr0, n, 4
+	ble	L(tail.1..4)
+	cmpwi	cr0, n, 8
+	ble	L(tail.5..8)
+
+L(tail.9..11):
+	lvx	v0, 0, up
+	vaddcuw	v10, a0, v0
+	vadduwm	a0, a0, v0
+	vadduwm	c0, c0, v10
+
+	lvx	v1, r5, up
+	vaddcuw	v10, a1, v1
+	vadduwm	a1, a1, v1
+	vadduwm	c1, c1, v10
+
+	lvx	v2, r6, up
+
+	addi	r8, r11, 96
+	rlwinm	r3, n ,4,26,27
+	lvx	v11, r3, r8
+	vand	v2, v2, v11
+
+	vaddcuw	v10, a2, v2
+	vadduwm	a2, a2, v2
+	vadduwm	c2, c2, v10
+	b	L(sum)
+
+L(tail.5..8):
+	lvx	v0, 0, up
+	vaddcuw	v10, a0, v0
+	vadduwm	a0, a0, v0
+	vadduwm	c0, c0, v10
+
+	lvx	v1, r5, up
+
+	addi	r8, r11, 96
+	rlwinm	r3, n ,4,26,27
+	lvx	v11, r3, r8
+	vand	v1, v1, v11
+
+	vaddcuw	v10, a1, v1
+	vadduwm	a1, a1, v1
+	vadduwm	c1, c1, v10
+	b	L(sum)
+
+L(tail.1..4):
+	lvx	v0, 0, up
+
+	addi	r8, r11, 96
+	rlwinm	r3, n ,4,26,27
+	lvx	v11, r3, r8
+	vand	v0, v0, v11
+
+	vaddcuw	v10, a0, v0
+	vadduwm	a0, a0, v0
+	vadduwm	c0, c0, v10
+
+L(sum):	lvx	pv, 0, r11
+	vperm	x0, a0, z, pv		C extract 4 24-bit fields from a0
+	vperm	y0, c2, z, pv		C extract 4 24-bit fields from c2
+	lvx	pv, r5, r11
+	vperm	x1, a1, z, pv		C extract 4 24-bit fields from a1
+	vperm	y1, c0, z, pv		C extract 4 24-bit fields from c0
+	lvx	pv, r6, r11
+	vperm	x2, a2, z, pv		C extract 4 24-bit fields from a2
+	vperm	y2, c1, z, pv		C extract 4 24-bit fields from c1
+	li	r10,  48
+	lvx	pv, r10, r11
+	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
+	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
+	li	r10,  64
+	lvx	pv, r10, r11
+	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
+	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
+	li	r10,  80
+	lvx	pv, r10, r11
+	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
+	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields
+
+C We now have 4 128-bit accumulators to sum
+	vadduwm	x0, x0, x1
+	vadduwm	x2, x2, x3
+	vadduwm	x0, x0, x2
+
+	vadduwm	y0, y0, y1
+	vadduwm	y2, y2, y3
+	vadduwm	y0, y0, y2
+
+	vadduwm	x0, x0, y0
+
+C Reduce 32-bit fields
+	vsumsws	x0, x0, z
+
+	li	r7, 16
+	stvx	x0, r7, r1
+	lwz	r3, 28(r1)
+
+	mtspr	256, r10
+	addi	r1, r1, 32
+	blr
+EPILOGUE()
+
+C load	|      v0       |      v1       |      v2       |
+C acc	|      a0       |      a1       |      a2       |
+C carry	|      c0       |      c1       |      c2       |
+C	| 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |  128
+C	|---|---|---|---|---|---|---|---|---|---|---|---|   32
+C	|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |   24
+C	|     |     |     |     |     |     |     |     |   48
+
+C       $---------------$---------------$---------------$---------------$
+C       |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
+C       |_______________________________________________________________|
+C   |           |           |           |           |           |           |
+C       <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
+
+
+DEF_OBJECT(cnsts,16)
+C Permutation vectors in the order they are used above
+C #      00   01   02   03    04   05   06   07    08   09   0a   0b    0c   0d   0e   0f
+ .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
+ .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
+ .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
+ .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
+ .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
+ .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
+C Masks for high end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+C .byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
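
The permutation constants above repack the three accumulators into 24-bit
digits; the underlying identity is that 2^24 == 1 (mod 2^24-1), so the
number may be summed 24 bits at a time.  A C model of the contract (a value
merely congruent to {up,n} mod 2^24-1, not necessarily fully reduced),
assuming 32-bit limbs, with an illustrative name:

    #include <stdint.h>

    uint32_t mod_34lsub1_model(const uint32_t *up, long n)
    {
        uint64_t acc = 0, cur = 0;
        int bits = 0;                 /* valid bits currently held in cur */
        for (long i = 0; i < n; i++) {
            cur |= (uint64_t)up[i] << bits;
            bits += 32;
            while (bits >= 24) {      /* drain complete 24-bit digits */
                acc += cur & 0xffffff;
                cur >>= 24;
                bits -= 24;
            }
        }
        acc += cur;                   /* final partial digit */
        while (acc >> 24)             /* fold carries back in */
            acc = (acc & 0xffffff) + (acc >> 24);
        return (uint32_t)acc;
    }
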
diff --git a/third_party/gmp/mpn/powerpc32/vmx/popcount.asm b/third_party/gmp/mpn/powerpc32/vmx/popcount.asm
new file mode 100644
index 0000000..943c92d
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc32/vmx/popcount.asm
@@ -0,0 +1,34 @@
+dnl  PowerPC-32/VMX mpn_popcount.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`powerpc64/vmx/popcount.asm')
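
This wrapper just forwards to the shared powerpc64 VMX implementation.  For
reference, mpn_popcount's contract in portable C (illustrative name):

    #include <stdint.h>

    /* Count the set bits across all n limbs of {up,n}. */
    unsigned long popcount_model(const uint32_t *up, long n)
    {
        unsigned long cnt = 0;
        for (long i = 0; i < n; i++) {
            uint32_t x = up[i];
            while (x != 0) {
                x &= x - 1;           /* clear the lowest set bit */
                cnt++;
            }
        }
        return cnt;
    }
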
diff --git a/third_party/gmp/mpn/powerpc64/README b/third_party/gmp/mpn/powerpc64/README
new file mode 100644
index 0000000..50dd399
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/README
@@ -0,0 +1,166 @@
+Copyright 1999-2001, 2003-2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+                    POWERPC-64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for 64-bit PowerPC chips.
+
+
+CODE ORGANIZATION
+
+	mpn/powerpc64          mode-neutral code
+	mpn/powerpc64/mode32   code for mode32
+	mpn/powerpc64/mode64   code for mode64
+
+
+The mode32 and mode64 sub-directories contain code which is for use in the
+respective chip mode, 32 or 64.  The top-level directory contains code that's
+unaffected by the mode.
+
+The "adde" instruction is the main difference between mode32 and mode64.  It
+operates on either a 32-bit or a 64-bit quantity according to the chip mode.
+Other instructions have an operand size in their opcode and hence don't vary.
+
+
+
+POWER3/PPC630 pipeline information:
+
+Decoding is 4-way + branch and issue is 8-way with some out-of-order
+capability.
+
+Functional units:
+LS1  - ld/st unit 1
+LS2  - ld/st unit 2
+FXU1 - integer unit 1, handles any simple integer instruction
+FXU2 - integer unit 2, handles any simple integer instruction
+FXU3 - integer unit 3, handles integer multiply and divide
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+Memory:		  Any two memory operations can issue, but memory subsystem
+		  can sustain just one store per cycle.  No need for data
+		  prefetch; the hardware has very sophisticated prefetch logic.
+Simple integer:	  2 operations (such as add, rl*)
+Integer multiply: 1 operation every 9th cycle worst case; exact timing depends
+		  on 2nd operand's most significant bit position (10 bits per
+		  cycle).  The multiply unit is not pipelined; only one multiply
+		  operation may be in progress at a time.
+Integer divide:	  ?
+Floating-point:	  Any 2 plain arithmetic instructions (such as fmul, fadd, and
+		  fmadd), latency 4 cycles.
+Floating-point divide:
+		  ?
+Floating-point square root:
+		  ?
+
+POWER3/PPC630 best possible times for the main loops:
+shift:	      1.5 cycles, limited by integer unit contention.
+	      With 63 special loops, one for each shift count, we could
+	      reduce the needed integer instructions to 2, which would
+	      reduce the best possible time to 1 cycle.
+add/sub:      1.5 cycles, limited by ld/st unit contention.
+mul:	      18 cycles (average) unless floating-point operations are used,
+	      but that would only help for multiplies of perhaps 10 or more
+	      limbs.
+addmul/submul: Same situation as for mul.
+
+
+POWER4/PPC970 and POWER5 pipeline information:
+
+This is a very odd pipeline: it is basically a VLIW masquerading as a plain
+architecture.  Its issue rules are not made public, and since it is so weird,
+it is very hard to figure out any useful information from experimentation.
+An example:
+
+  A well-aligned loop with nop's takes 3, 4, 6, 7, ... cycles.
+    3 cycles for  0,  1,  2,  3,  4,  5,  6,  7 nop's
+    4 cycles for  8,  9, 10, 11, 12, 13, 14, 15 nop's
+    6 cycles for 16, 17, 18, 19, 20, 21, 22, 23 nop's
+    7 cycles for 24, 25, 26, 27 nop's
+    8 cycles for 28, 29, 30, 31 nop's
+    ... continues regularly
+
+
+Functional units:
+LS1  - ld/st unit 1
+LS2  - ld/st unit 2
+FXU1 - integer unit 1, handles any integer instruction
+FXU2 - integer unit 2, handles any integer instruction
+FPU1 - floating-point unit 1
+FPU2 - floating-point unit 2
+
+While this is one integer unit less than POWER3/PPC630, the remaining units
+are more powerful; here they handle multiply and divide.
+
+Memory:		  2 ld/st.  Stores go to the L2 cache, which can sustain just
+		  one store per cycle.
+		  L1 load latency: to gregs 3-4 cycles, to fregs 5-6 cycles.
+		  Operations that modify the address register might be split
+		  to also use an integer issue slot.
+Simple integer:	  2 operations every cycle, latency 2.
+Integer multiply: 2 operations every 6th cycle, latency 7 cycles.
+Integer divide:	  ?
+Floating-point:	  Any 2 plain arithmetic instructions (such as fmul, fadd, and
+		  fmadd), latency 6 cycles.
+Floating-point divide:
+		  ?
+Floating-point square root:
+		  ?
+
+
+IDEAS
+
+*mul_1: Handling one limb using mulld/mulhdu and two limbs using floating-
+point operations should give performance of about 20 cycles for 3 limbs, or 7
+cycles/limb.
+
+We should probably split the single-limb operand into 32-bit chunks, and the
+multi-limb operand into 16-bit chunks, allowing us to accumulate well in fp
+registers.
+
+The problem is to get 32-bit or 16-bit words to the fp registers.  Only 64-bit
+fp memops copy bits without fiddling with them.  We might therefore need to
+load to integer registers with zero extension, store as 64 bits into temp
+space, and then load to fp regs.  Alternatively, load directly to fp space
+and add well-chosen constants to get cancellation.  (The other part is then
+given by a subsequent subtraction.)
+
+Possible code mix for load-via-intregs variant:
+
+lwz,std,lfd
+fmadd,fmadd,fmul,fmul
+fctidz,stfd,ld,fctidz,stfd,ld
+add,adde
+lwz,std,lfd
+fmadd,fmadd,fmul,fmul
+fctidz,stfd,ld,fctidz,stfd,ld
+add,adde
+srd,sld,add,adde,add,adde
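
To make the 32-bit-chunk idea concrete, this is the integer decomposition
such a scheme accumulates, written out in C for a single 64x64->128 product.
It sketches the arithmetic only, not the fp register scheduling discussed
above, and the function name is ours:

    #include <stdint.h>

    /* hi:lo = a * b, built from four 32x32->64 partial products. */
    void umul128_by_halves(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
        uint64_t al = a & 0xffffffffu, ah = a >> 32;
        uint64_t bl = b & 0xffffffffu, bh = b >> 32;
        uint64_t ll = al * bl;                     /* weight 2^0  */
        uint64_t lh = al * bh;                     /* weight 2^32 */
        uint64_t hl = ah * bl;                     /* weight 2^32 */
        uint64_t hh = ah * bh;                     /* weight 2^64 */
        uint64_t mid = (ll >> 32) + (lh & 0xffffffffu) + (hl & 0xffffffffu);
        *lo = (mid << 32) | (ll & 0xffffffffu);
        *hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
    }
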
diff --git a/third_party/gmp/mpn/powerpc64/aix.m4 b/third_party/gmp/mpn/powerpc64/aix.m4
new file mode 100644
index 0000000..04378b8
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/aix.m4
@@ -0,0 +1,99 @@
+divert(-1)
+dnl  m4 macros for AIX 64-bit assembly.
+
+dnl  Copyright 2000-2002, 2005, 2006, 2010, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`AIX')
+
+define(`ASM_START',
+	`.machine	"any"
+	.toc')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  Don't want ELF style .size in the epilogue.
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+	.globl	$1
+	.globl	.$1
+	.csect	[DS], 3
+$1:
+	.llong	.$1, TOC[tc0], 0
+	.csect	.$1[PR], 6
+.$1:')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`')
+
+define(`TOC_ENTRY', `')
+
+define(`LEA',
+m4_assert_numargs(2)
+`define(`TOC_ENTRY',
+`	.toc
+..$2:	.tc	$2[TC], $2')'
+	`ld	$1, ..$2(2)')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`	.globl	$1')
+
+define(`EXTERN_FUNC',
+m4_assert_numargs(1)
+`	.globl	.$1')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`	.csect	[RO], 3
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`CALL',
+	`bl	.$1
+	nop')
+
+define(`ASM_END', `TOC_ENTRY')
+
+undefine(`EXTRA_REGISTER')
+
+divert
diff --git a/third_party/gmp/mpn/powerpc64/com.asm b/third_party/gmp/mpn/powerpc64/com.asm
new file mode 100644
index 0000000..074b7ff
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/com.asm
@@ -0,0 +1,136 @@
+dnl  PowerPC-64 mpn_com.
+
+dnl  Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          1.25
+C POWER5                 ?
+C POWER6                 1.32
+C POWER7                 1.13
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`n',	`r5')
+
+ASM_START()
+PROLOGUE(mpn_com)
+
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	n, n, 0,32')
+
+	cmpdi	cr0, n, 4
+	blt	L(sml)
+
+	addi	r10, n, 4
+	srdi	r10, r10, 3
+	mtctr	r10
+
+	andi.	r0, n, 1
+	rlwinm	r11, n, 0,30,30
+	rlwinm	r12, n, 0,29,29
+	cmpdi	cr6, r11, 0
+	cmpdi	cr7, r12, 0
+
+	beq	cr0, L(xx0)
+L(xx1):	ld	r6, 0(up)
+	addi	up, up, 8
+	nor	r6, r6, r6
+	std	r6, 0(rp)
+	addi	rp, rp, 8
+
+L(xx0):	bne	cr6, L(x10)
+L(x00):	ld	r6, 0(r4)
+	ld	r7, 8(r4)
+	bne	cr7, L(100)
+L(000):	addi	rp, rp, -32
+	b	L(lo0)
+L(100):	addi	up, up, -32
+	b	L(lo4)
+L(x10):	ld	r8, 0(r4)
+	ld	r9, 8(r4)
+	bne	cr7, L(110)
+L(010):	addi	up, up, 16
+	addi	rp, rp, -16
+	b	L(lo2)
+L(110):	addi	up, up, -16
+	addi	rp, rp, -48
+	b	L(lo6)
+
+L(sml):	mtctr	n
+L(t):	ld	r6, 0(up)
+	addi	up, up, 8
+	nor	r6, r6, r6
+	std	r6, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(t)
+	blr
+
+	ALIGN(32)
+L(top):	nor	r6, r6, r6
+	nor	r7, r7, r7
+	std	r6, 0(rp)
+	std	r7, 8(rp)
+L(lo2):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	nor	r8, r8, r8
+	nor	r9, r9, r9
+	std	r8, 16(rp)
+	std	r9, 24(rp)
+L(lo0):	ld	r8, 16(up)
+	ld	r9, 24(up)
+	nor	r6, r6, r6
+	nor	r7, r7, r7
+	std	r6, 32(rp)
+	std	r7, 40(rp)
+L(lo6):	ld	r6, 32(up)
+	ld	r7, 40(up)
+	nor	r8, r8, r8
+	nor	r9, r9, r9
+	std	r8, 48(rp)
+	std	r9, 56(rp)
+	addi	rp, rp, 64
+L(lo4):	ld	r8, 48(up)
+	ld	r9, 56(up)
+	addi	up, up, 64
+	bdnz	L(top)
+
+L(end):	nor	r6, r6, r6
+	nor	r7, r7, r7
+	std	r6, 0(rp)
+	std	r7, 8(rp)
+	nor	r8, r8, r8
+	nor	r9, r9, r9
+	std	r8, 16(rp)
+	std	r9, 24(rp)
+	blr
+EPILOGUE()
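
Semantically the unrolled loop above is just a limb-wise one's complement;
in C (64-bit limbs here, name illustrative):

    #include <stdint.h>

    /* rp[i] = ~up[i] for all n limbs. */
    void com_model(uint64_t *rp, const uint64_t *up, long n)
    {
        for (long i = 0; i < n; i++)
            rp[i] = ~up[i];
    }
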
diff --git a/third_party/gmp/mpn/powerpc64/copyd.asm b/third_party/gmp/mpn/powerpc64/copyd.asm
new file mode 100644
index 0000000..c6ce930
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/copyd.asm
@@ -0,0 +1,84 @@
+dnl  PowerPC-64 mpn_copyd
+
+dnl  Copyright 2004, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          1
+C POWER4/PPC970          1
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.4
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	rldic.	r0, r5, 3, 59	C r0 = (r5 & 3) << 3; cr0 = (n == 4t)?
+	cmpldi	cr6, r0, 16	C cr6 = (n cmp 4t + 2)?
+
+ifdef(`HAVE_ABI_mode32',
+`	rldic	r6, r5, 3, 32',	C byte count corresponding to n
+`	rldicr	r6, r5, 3, 60')	C byte count corresponding to n
+
+	addi	r5, r5, 4	C compute...
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r5, r5, 62,34',	C ...branch count
+`	rldicl	r5, r5, 62, 2')	C ...branch count
+	mtctr	r5
+
+	add	r4, r4, r6
+	add	r3, r3, r6
+	sub	r4, r4, r0	C offset up
+	sub	r3, r3, r0	C offset rp
+
+	beq	cr0, L(L00)
+	blt	cr6, L(L01)
+	beq	cr6, L(L10)
+	b	L(L11)
+
+	ALIGN(16)
+L(oop):	ld	r6, 24(r4)
+	std	r6, 24(r3)
+L(L11):	ld	r6, 16(r4)
+	std	r6, 16(r3)
+L(L10):	ld	r6, 8(r4)
+	std	r6, 8(r3)
+L(L01):	ld	r6, 0(r4)
+	std	r6, 0(r3)
+L(L00):	addi	r4, r4, -32
+	addi	r3, r3, -32
+	bdnz	L(oop)
+
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/copyi.asm b/third_party/gmp/mpn/powerpc64/copyi.asm
new file mode 100644
index 0000000..9a86cb2
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/copyi.asm
@@ -0,0 +1,78 @@
+dnl  PowerPC-64 mpn_copyi.
+
+dnl  Copyright 2004, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          1
+C POWER4/PPC970          1
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.4
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	rldic.	r0, r5, 3, 59	C r0 = (r5 & 3) << 3; cr0 = (n == 4t)?
+	cmpldi	cr6, r0, 16	C cr6 = (n cmp 4t + 2)?
+
+	addi	r5, r5, 4	C compute...
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r5, r5, 62,34',	C ...branch count
+`	rldicl	r5, r5, 62, 2')	C ...branch count
+	mtctr	r5
+
+	add	r4, r4, r0	C offset up
+	add	r3, r3, r0	C offset rp
+
+	beq	cr0, L(L00)
+	blt	cr6, L(L01)
+	beq	cr6, L(L10)
+	b	L(L11)
+
+	ALIGN(16)
+L(oop):	ld	r6, -32(r4)
+	std	r6, -32(r3)
+L(L11):	ld	r6, -24(r4)
+	std	r6, -24(r3)
+L(L10):	ld	r6, -16(r4)
+	std	r6, -16(r3)
+L(L01):	ld	r6, -8(r4)
+	std	r6, -8(r3)
+L(L00):	addi	r4, r4, 32
+	addi	r3, r3, 32
+	bdnz	L(oop)
+
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/darwin.m4 b/third_party/gmp/mpn/powerpc64/darwin.m4
new file mode 100644
index 0000000..2c995e7
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/darwin.m4
@@ -0,0 +1,122 @@
+divert(-1)
+dnl  m4 macros for Mac OS 64-bit assembly.
+
+dnl  Copyright 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+define(`ASM_START',`')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+	.text
+	.globl	$1
+	.align	5
+$1:')
+
+define(`lea_list', `')
+
+dnl  LEAL -- Load Effective Address Local.  This is to be used for symbols
+dnl  defined in the same file.  It will not work for externally defined
+dnl  symbols.
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',
+`
+	mflr	r0			C save return address
+	bcl	20, 31, 1f
+1:	mflr	$1
+	addis	$1, $1, ha16($2-1b)
+	la	$1, lo16($2-1b)($1)
+	mtlr	r0			C restore return address
+',`
+	lis	$1, ha16($2)
+	la	$1, lo16($2)($1)
+')')
+
+dnl  LEA -- Load Effective Address.  This is to be used for symbols defined in
+dnl  another file.  It will not work for locally defined symbols.
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',
+`define(`lea_list',
+`	.non_lazy_symbol_pointer
+`L'$2`'$non_lazy_ptr:
+	.indirect_symbol $2
+	.quad	0
+')
+	mflr	r0			C save return address
+	bcl	20, 31, 1f
+1:	mflr	$1
+	addis	$1, $1, ha16(`L'$2`'$non_lazy_ptr-1b)
+	ld	$1, lo16(`L'$2`'$non_lazy_ptr-1b)($1)
+	mtlr	r0			C restore return address
+',`
+	lis	$1, ha16($2)
+	la	$1, lo16($2)($1)
+')')
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`EXTERN_FUNC',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`	.const
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1))
+
+define(`CALL',
+	`bl	GSYM_PREFIX`'$1')
+
+define(`EPILOGUE_cpu',
+`lea_list'
+`define(`lea_list', `')')
+
+define(`ASM_END', `dnl')
+
+define(`EXTRA_REGISTER', r2)
+
+divert
diff --git a/third_party/gmp/mpn/powerpc64/elf.m4 b/third_party/gmp/mpn/powerpc64/elf.m4
new file mode 100644
index 0000000..ddb5a8e
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/elf.m4
@@ -0,0 +1,123 @@
+divert(-1)
+dnl  m4 macros for powerpc64 GNU/Linux assembly.
+
+dnl  Copyright 2003, 2005, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`ASM_START',
+`ifdef(`ELFv2_ABI',
+`
+	.abiversion 2
+')')
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo[,toc])
+dnl          EPILOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs_range(1,2)
+`ifelse(`$2',toc,,
+`ifelse(`$2',,,`m4_error(`Unrecognised PROLOGUE parameter')')')dnl
+ifdef(`ELFv2_ABI',
+`
+	.globl	$1
+	.type	$1, @function
+	.section	".text"
+	.align	5
+$1:
+ifelse(`$2',toc,`
+0:	addis	2, 12, (.TOC.-0b)@ha
+	addi	2, 2, (.TOC.-0b)@l
+	.localentry $1, .-$1
+',)
+',`
+	.globl	$1
+	.globl	.$1
+	.section	".opd","aw"
+	.align	3
+$1:
+	.llong	.$1, .TOC.@tocbase, 0
+	.size	$1, 24
+	.type	.$1, @function
+	.section	".text"
+	.align	5
+.$1:
+')')
+
+define(`EPILOGUE_cpu',
+m4_assert_numargs(1)
+`ifdef(`ELFv2_ABI',`
+	.size	$1, .-$1
+',`
+	.size	.$1, .-.$1
+')')
+
+define(`TOC_ENTRY', `')
+
+define(`LEA',
+m4_assert_numargs(2)
+`define(`TOC_ENTRY',
+`	.section	".toc", "aw"
+..$2:	.tc	$2[TC], $2')'
+	`ld	$1, ..$2@toc(2)')
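+
+dnl  Usage sketch (hypothetical name): `LEA(r5, some_table)' expands to
+dnl  `ld r5, ..some_table@toc(2)' and defers the matching .toc entry to
+dnl  ASM_END via the redefined TOC_ENTRY.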
+
+define(`LEAL',
+m4_assert_numargs(2)
+`LEA($1,$2)')
+
+
+define(`EXTERN',
+m4_assert_numargs(1)
+`dnl')
+
+define(`EXTERN_FUNC',
+m4_assert_numargs(1)
+`dnl')
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+`
+	.section	.rodata
+	ALIGN(ifelse($#,1,2,$2))
+	.type	$1, @object
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	.size	$1, .-$1')
+
+define(`CALL',
+	`bl	GSYM_PREFIX`'$1
+	nop')
+
+define(`ASM_END', `TOC_ENTRY')
+
+undefine(`EXTRA_REGISTER')
+
+divert
diff --git a/third_party/gmp/mpn/powerpc64/logops_n.asm b/third_party/gmp/mpn/powerpc64/logops_n.asm
new file mode 100644
index 0000000..2fa6985
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/logops_n.asm
@@ -0,0 +1,151 @@
+dnl  PowerPC-64 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl  Copyright 2003-2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          1.75
+C POWER4/PPC970          2.10
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 1.75
+
+C   n	   POWER3/PPC630   POWER4/PPC970
+C     1	       15.00	       15.33
+C     2		7.50		7.99
+C     3		5.33		6.00
+C     4		4.50		4.74
+C     5		4.20		4.39
+C     6		3.50		3.99
+C     7		3.14		3.64
+C     8		3.00		3.36
+C     9		3.00		3.36
+C    10		2.70		3.25
+C    11		2.63		3.11
+C    12		2.58		3.00
+C    13		2.61		3.02
+C    14		2.42		2.82
+C    15		2.40		2.79
+C    50		2.08		2.67
+C   100		1.85		2.31
+C   200		1.80		2.18
+C   400		1.77		2.14
+C  1000		1.76		2.10#
+C  2000		1.75#		2.13
+C  4000		2.30		2.57
+C  8000		2.62		2.58
+C 16000		2.52		4.25
+C 32000		2.49	       16.25
+C 64000		2.66	       18.76
+
+ifdef(`OPERATION_and_n',
+`	define(`func',`mpn_and_n')
+	define(`logop',		`and')')
+ifdef(`OPERATION_andn_n',
+`	define(`func',`mpn_andn_n')
+	define(`logop',		`andc')')
+ifdef(`OPERATION_nand_n',
+`	define(`func',`mpn_nand_n')
+	define(`logop',		`nand')')
+ifdef(`OPERATION_ior_n',
+`	define(`func',`mpn_ior_n')
+	define(`logop',		`or')')
+ifdef(`OPERATION_iorn_n',
+`	define(`func',`mpn_iorn_n')
+	define(`logop',		`orc')')
+ifdef(`OPERATION_nior_n',
+`	define(`func',`mpn_nior_n')
+	define(`logop',		`nor')')
+ifdef(`OPERATION_xor_n',
+`	define(`func',`mpn_xor_n')
+	define(`logop',		`xor')')
+ifdef(`OPERATION_xnor_n',
+`	define(`func',`mpn_xnor_n')
+	define(`logop',		`eqv')')
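+
+C Exactly one OPERATION_* symbol is defined when this file is assembled;
+C the GMP build assembles it once per function named in MULFUNC_PROLOGUE
+C below, each pass selecting the entry-point name and the fused logical
+C instruction used in the loop.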
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	ld	r8, 0(r4)	C read lowest u limb
+	ld	r9, 0(r5)	C read lowest v limb
+	addi	r6, r6, 3	C compute branch count (1)
+	rldic.	r0, r6, 3, 59	C r0 = (n-1 & 3) << 3; cr0 = (n == 4(t+1))?
+	cmpldi	cr6, r0, 16	C cr6 = (n cmp 4t + 3)
+
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r6, r6, 62,34',	C ...branch count
+`	rldicl	r6, r6, 62, 2')	C ...branch count
+	mtctr	r6
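+C ctr now holds ceil(n/4); the cr0/cr6 branches below enter the 4-way
+C unrolled loop at one of four points so that the n mod 4 leftover limbs
+C are handled in the first pass.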
+
+	ld	r6, 0(r4)	C read lowest u limb (again)
+	ld	r7, 0(r5)	C read lowest v limb (again)
+
+	add	r5, r5, r0	C offset vp
+	add	r4, r4, r0	C offset up
+	add	r3, r3, r0	C offset rp
+
+	beq	cr0, L(L01)
+	blt	cr6, L(L10)
+	beq	cr6, L(L11)
+	b	L(L00)
+
+L(oop):	ld	r8, -24(r4)
+	ld	r9, -24(r5)
+	logop	r10, r6, r7
+	std	r10, -32(r3)
+L(L00):	ld	r6, -16(r4)
+	ld	r7, -16(r5)
+	logop	r10, r8, r9
+	std	r10, -24(r3)
+L(L11):	ld	r8, -8(r4)
+	ld	r9, -8(r5)
+	logop	r10, r6, r7
+	std	r10, -16(r3)
+L(L10):	ld	r6, 0(r4)
+	ld	r7, 0(r5)
+	logop	r10, r8, r9
+	std	r10, -8(r3)
+L(L01):	addi	r5, r5, 32
+	addi	r4, r4, 32
+	addi	r3, r3, 32
+	bdnz	L(oop)
+
+	logop	r10, r6, r7
+	std	r10, -32(r3)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/lshift.asm b/third_party/gmp/mpn/powerpc64/lshift.asm
new file mode 100644
index 0000000..880944a
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/lshift.asm
@@ -0,0 +1,207 @@
+dnl  PowerPC-64 mpn_lshift -- rp[] = up[] << cnt
+
+dnl  Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.75
+C POWER7                 2.15
+
+C TODO
+C  * Try to reduce the number of needed live registers
+C  * Micro-optimise header code
+C  * Keep in synch with rshift.asm and lshiftc.asm
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
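+
+C Each stored limb is (u[i] << cnt) | (u[i-1] >> (64-cnt)), produced from
+C two u limbs kept in flight; the bits shifted out of the most significant
+C limb are returned.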
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	subfic	tnc, cnt, 64
+	sldi	r7, n, 3	C byte count corresponding to n
+	add	up, up, r7	C up = up + n
+	add	rp, rp, r7	C rp = rp + n
+	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
+	cmpdi	cr6, r30, 2
+	addi	r31, n, 3	C compute count...
+	ld	r10, -8(up)	C load 1st limb for b00...b11
+	srd	retval, r10, tnc
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r31, r31, 62,34',	C ...branch count
+`	srdi	r31, r31, 2')	C ...for ctr
+	mtctr	r31		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	ld	r11, -16(up)	C load 2nd limb for b10 and b11
+	beq	cr6, L(b10)
+
+	ALIGN(16)
+L(b11):	sld	r8, r10, cnt
+	srd	r9, r11, tnc
+	ld	u1, -24(up)
+	addi	up, up, -24
+	sld	r12, r11, cnt
+	srd	r7, u1, tnc
+	addi	rp, rp, 16
+	bdnz	L(gt3)
+
+	or	r11, r8, r9
+	sld	r8, u1, cnt
+	b	L(cj3)
+
+	ALIGN(16)
+L(gt3):	ld	u0, -8(up)
+	or	r11, r8, r9
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -16(up)
+	or	r10, r12, r7
+	b	L(L11)
+
+	ALIGN(32)
+L(b10):	sld	r12, r10, cnt
+	addi	rp, rp, 24
+	srd	r7, r11, tnc
+	bdnz	L(gt2)
+
+	sld	r8, r11, cnt
+	or	r10, r12, r7
+	b	L(cj2)
+
+L(gt2):	ld	u0, -24(up)
+	sld	r8, r11, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	or	r10, r12, r7
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -40(up)
+	or	r11, r8, r9
+	addi	up, up, -16
+	b	L(L10)
+
+	ALIGN(16)
+L(b00):	ld	u1, -16(up)
+	sld	r12, r10, cnt
+	srd	r7, u1, tnc
+	ld	u0, -24(up)
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	or	r10, r12, r7
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	addi	rp, rp, 8
+	bdz	L(cj4)
+
+L(gt4):	addi	up, up, -32
+	ld	u0, -8(up)
+	or	r11, r8, r9
+	b	L(L00)
+
+	ALIGN(16)
+L(b01):	bdnz	L(gt1)
+	sld	r8, r10, cnt
+	std	r8, -8(rp)
+	b	L(ret)
+
+L(gt1):	ld	u0, -16(up)
+	sld	r8, r10, cnt
+	srd	r9, u0, tnc
+	ld	u1, -24(up)
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -32(up)
+	or	r11, r8, r9
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -40(up)
+	addi	up, up, -40
+	or	r10, r12, r7
+	bdz	L(end)
+
+	ALIGN(32)
+L(top):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -8(up)
+	std	r11, -8(rp)
+	or	r11, r8, r9
+L(L00):	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -16(up)
+	std	r10, -16(rp)
+	or	r10, r12, r7
+L(L11):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -24(up)
+	std	r11, -24(rp)
+	or	r11, r8, r9
+L(L10):	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	addi	up, up, -32
+	std	r10, -32(rp)
+	addi	rp, rp, -32
+	or	r10, r12, r7
+	bdnz	L(top)
+
+	ALIGN(32)
+L(end):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	std	r11, -8(rp)
+L(cj4):	or	r11, r8, r9
+	sld	r8, u1, cnt
+	std	r10, -16(rp)
+L(cj3):	or	r10, r12, r7
+	std	r11, -24(rp)
+L(cj2):	std	r10, -32(rp)
+	std	r8, -40(rp)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+`	srdi	r3, retval, 32
+	mr	r4, retval
+',`	mr	r3, retval')
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/lshiftc.asm b/third_party/gmp/mpn/powerpc64/lshiftc.asm
new file mode 100644
index 0000000..7cf6a83
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/lshiftc.asm
@@ -0,0 +1,210 @@
+dnl  PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
+
+dnl  Copyright 2003, 2005, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.5
+C POWER7                 2.15
+
+C TODO
+C  * Try to reduce the number of needed live registers
+C  * Micro-optimise header code
+C  * Keep in synch with lshift.asm and rshift.asm
+C  * Could the long-scheduled std insns be less scheduled?
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	subfic	tnc, cnt, 64
+	sldi	r7, n, 3	C byte count corresponding to n
+	add	up, up, r7	C up = up + n
+	add	rp, rp, r7	C rp = rp + n
+	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
+	cmpdi	cr6, r30, 2
+	addi	r31, n, 3	C compute count...
+	ld	r10, -8(up)	C load 1st limb for b00...b11
+	srd	retval, r10, tnc
+	srdi	r31, r31, 2	C ...for ctr
+	mtctr	r31		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	ld	r11, -16(up)	C load 2nd limb for b10 and b11
+	beq	cr6, L(b10)
+
+	ALIGN(16)
+L(b11):	sld	r8, r10, cnt
+	srd	r9, r11, tnc
+	ld	u1, -24(up)
+	addi	up, up, -24
+	sld	r12, r11, cnt
+	srd	r7, u1, tnc
+	addi	rp, rp, 16
+	bdnz	L(gt3)
+
+	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	nor	r8, r8, r8
+	b	L(cj3)
+
+	ALIGN(16)
+L(gt3):	ld	u0, -8(up)
+	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -16(up)
+	nor	r10, r12, r7
+	b	L(L11)
+
+	ALIGN(32)
+L(b10):	sld	r12, r10, cnt
+	addi	rp, rp, 24
+	srd	r7, r11, tnc
+	bdnz	L(gt2)
+
+	sld	r8, r11, cnt
+	nor	r10, r12, r7
+	nor	r8, r8, r8
+	b	L(cj2)
+
+L(gt2):	ld	u0, -24(up)
+	sld	r8, r11, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	nor	r10, r12, r7
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -40(up)
+	nor	r11, r8, r9
+	addi	up, up, -16
+	b	L(L10)
+
+	ALIGN(16)
+L(b00):	ld	u1, -16(up)
+	sld	r12, r10, cnt
+	srd	r7, u1, tnc
+	ld	u0, -24(up)
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	nor	r10, r12, r7
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	addi	rp, rp, 8
+	bdz	L(cj4)
+
+L(gt4):	addi	up, up, -32
+	ld	u0, -8(up)
+	nor	r11, r8, r9
+	b	L(L00)
+
+	ALIGN(16)
+L(b01):	bdnz	L(gt1)
+	sld	r8, r10, cnt
+	nor	r8, r8, r8
+	std	r8, -8(rp)
+	b	L(ret)
+
+L(gt1):	ld	u0, -16(up)
+	sld	r8, r10, cnt
+	srd	r9, u0, tnc
+	ld	u1, -24(up)
+	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -32(up)
+	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -40(up)
+	addi	up, up, -40
+	nor	r10, r12, r7
+	bdz	L(end)
+
+	ALIGN(32)
+L(top):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -8(up)
+	std	r11, -8(rp)
+	nor	r11, r8, r9
+L(L00):	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -16(up)
+	std	r10, -16(rp)
+	nor	r10, r12, r7
+L(L11):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	ld	u0, -24(up)
+	std	r11, -24(rp)
+	nor	r11, r8, r9
+L(L10):	sld	r8, u1, cnt
+	srd	r9, u0, tnc
+	ld	u1, -32(up)
+	addi	up, up, -32
+	std	r10, -32(rp)
+	addi	rp, rp, -32
+	nor	r10, r12, r7
+	bdnz	L(top)
+
+	ALIGN(32)
+L(end):	sld	r12, u0, cnt
+	srd	r7, u1, tnc
+	std	r11, -8(rp)
+L(cj4):	nor	r11, r8, r9
+	sld	r8, u1, cnt
+	std	r10, -16(rp)
+	nor	r8, r8, r8
+L(cj3):	nor	r10, r12, r7
+	std	r11, -24(rp)
+L(cj2):	std	r10, -32(rp)
+	std	r8, -40(rp)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+`	srdi	r3, retval, 32
+	mr	r4, retval
+',`	mr	r3, retval')
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode32/add_n.asm b/third_party/gmp/mpn/powerpc64/mode32/add_n.asm
new file mode 100644
index 0000000..1da8087
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/add_n.asm
@@ -0,0 +1,86 @@
+dnl  PowerPC-64/mode32 mpn_add_n -- Add two limb vectors of the same length > 0
+dnl  and store sum in a third limb vector.
+
+dnl  Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C POWER3/PPC630:     ?
+C POWER4/PPC970:     4.25
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
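+C In mode32 the registers are 64 bits wide but CA reflects a 32-bit carry,
+C so each 64-bit limb addition below is followed by an adde on the
+C (discarded) high halves to rebuild the full 64-bit carry chain (our
+C reading of the code).
+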
+ASM_START()
+PROLOGUE(mpn_add_n)
+	mtctr	r6		C copy size into CTR
+	addic	r0, r0, 0	C clear cy
+	ld	r8, 0(r4)	C load least significant s1 limb
+	ld	r0, 0(r5)	C load least significant s2 limb
+	addi	r3, r3, -8	C offset res_ptr, it's updated before it's used
+	bdz	L(end)		C If done, skip loop
+
+L(oop):	ld	r9, 8(r4)	C load s1 limb
+	ld	r10, 8(r5)	C load s2 limb
+	adde	r7, r0, r8	C add limbs with cy, set cy
+	srdi	r6, r0, 32
+	srdi	r11, r8, 32
+	adde	r6, r6, r11	C add high limb parts, set cy
+	std	r7, 8(r3)	C store result limb
+	bdz	L(exit)		C decrement CTR and exit if done
+	ldu	r8, 16(r4)	C load s1 limb and update s1_ptr
+	ldu	r0, 16(r5)	C load s2 limb and update s2_ptr
+	adde	r7, r10, r9	C add limbs with cy, set cy
+	srdi	r6, r10, 32
+	srdi	r11, r9, 32
+	adde	r6, r6, r11	C add high limb parts, set cy
+	stdu	r7, 16(r3)	C store result limb and update res_ptr
+	bdnz	L(oop)		C decrement CTR and loop back
+
+L(end):	adde	r7, r0, r8
+	srdi	r6, r0, 32
+	srdi	r11, r8, 32
+	adde	r6, r6, r11	C add limbs with cy, set cy
+	std	r7, 8(r3)	C store ultimate result limb
+	li	r3, 0		C load cy into ...
+	addze	r4, r3		C ... return value register
+	blr
+L(exit):	adde	r7, r10, r9
+	srdi	r6, r10, 32
+	srdi	r11, r9, 32
+	adde	r6, r6, r11	C add limbs with cy, set cy
+	std	r7, 16(r3)
+	li	r3, 0		C load cy into ...
+	addze	r4, r3		C ... return value register
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode32/addmul_1.asm b/third_party/gmp/mpn/powerpc64/mode32/addmul_1.asm
new file mode 100644
index 0000000..bdc3951
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/addmul_1.asm
@@ -0,0 +1,79 @@
+dnl  PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl  the result to a second limb vector.
+
+dnl  Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C POWER3/PPC630:     ?
+C POWER4/PPC970:     12.5
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C v	r6,r7  or  r7,r8
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+
+ifdef(`BROKEN_LONGLONG_PARAM',
+`	rldimi	r8, r7, 32,0	C assemble vlimb from separate 32-bit arguments
+	mr	r6, r8
+',`
+	rldimi	r7, r6, 32,0	C assemble vlimb from separate 32-bit arguments
+	mr	r6, r7
+')
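+C r6 now holds v as one 64-bit limb, v = ((limb) hi32 << 32) | lo32,
+C assembled from the two 32-bit halves passed by the mode32 ABI.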
+	li	r7, 0		C cy_limb = 0
+	mtctr	r5
+	addic	r0, r0, 0
+	addi	r3, r3, -8
+	addi	r4, r4, -8
+
+L(oop):	ldu	r0, 8(r4)
+	mulld	r9, r0, r6
+	adde	r12, r9, r7	C add old high limb and new low limb
+	srdi	r5, r9, 32
+	srdi	r11, r7, 32
+	adde	r5, r5, r11	C add high limb parts, set cy
+	mulhdu	r7, r0, r6
+	addze	r7, r7
+	ld	r10, 8(r3)
+	addc	r9, r12, r10
+	srdi	r5, r12, 32
+	srdi	r11, r10, 32
+	adde	r5, r5, r11	C add high limb parts, set cy
+	stdu	r9, 8(r3)
+	bdnz	L(oop)
+
+	addze	r4, r7
+	srdi	r3, r4, 32
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode32/mul_1.asm b/third_party/gmp/mpn/powerpc64/mode32/mul_1.asm
new file mode 100644
index 0000000..3a17e98
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/mul_1.asm
@@ -0,0 +1,73 @@
+dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C POWER3/PPC630:     ?
+C POWER4/PPC970:     10
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C v	r6,r7  or  r7,r8
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+
+ifdef(`BROKEN_LONGLONG_PARAM',
+`	rldimi	r8, r7, 32,0	C assemble vlimb from separate 32-bit arguments
+	mr	r6, r8
+',`
+	rldimi	r7, r6, 32,0	C assemble vlimb from separate 32-bit arguments
+	mr	r6, r7
+')
+	li	r7, 0		C cy_limb = 0
+	mtctr	r5
+	addic	r0, r0, 0
+	addi	r3, r3, -8
+	addi	r4, r4, -8
+
+L(oop):	ldu	r0, 8(r4)
+	mulld	r9, r0, r6
+	adde	r12, r9, r7	C add old high limb and new low limb
+	srdi	r5, r9, 32
+	srdi	r11, r7, 32
+	adde	r5, r5, r11	C add high limb parts, set cy
+	mulhdu	r7, r0, r6
+	stdu	r12, 8(r3)
+	bdnz	L(oop)
+
+	addze	r4, r7
+	srdi	r3, r4, 32
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h
new file mode 100644
index 0000000..4e805a0
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/p4/gmp-mparam.h
@@ -0,0 +1,182 @@
+/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008, 2009, 2011, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* 1800 MHz PPC970 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2017-01-01, gcc 4.0 */
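+
+/* Each *_THRESHOLD below is an operand size, in limbs, at which GMP
+   switches to the named algorithm; the FFT tables pair an operand size
+   with an FFT split parameter k.  */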
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         6
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        46
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     15
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD            2
+#define DIV_QR_2_PI2_THRESHOLD              15
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           88
+
+#define DIV_1_VS_MUL_1_PERCENT             269
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                60
+#define MUL_TOOM44_THRESHOLD                88
+#define MUL_TOOM6H_THRESHOLD               124
+#define MUL_TOOM8H_THRESHOLD               187
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      61
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      61
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      60
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      74
+
+#define SQR_BASECASE_THRESHOLD               4
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                 90
+#define SQR_TOOM4_THRESHOLD                143
+#define SQR_TOOM6_THRESHOLD                181
+#define SQR_TOOM8_THRESHOLD                272
+
+#define MULMID_TOOM42_THRESHOLD             34
+
+#define MULMOD_BNM1_THRESHOLD               10
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             252  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    252, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {      7, 5}, {     15, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 7}, {      8, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     21, 9}, {     11, 8}, {     27,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     23, 8}, {     47, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     95, 8}, {    191,10}, {     55,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     71, 9}, {    143, 8}, {    287,10}, {     79, 9}, \
+    {    159, 8}, {    319,11}, {     47,10}, {     95, 9}, \
+    {    191, 8}, {    383,10}, {    103,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415,10}, {    223, 9}, \
+    {    447,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,11}, {    175,10}, \
+    {    351, 9}, {    703,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415,11}, \
+    {    223,10}, {    447,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 105
+#define MUL_FFT_THRESHOLD                 5248
+
+#define SQR_FFT_MODF_THRESHOLD             236  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    236, 5}, {     13, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     17, 8}, \
+    {      9, 7}, {     20, 8}, {     11, 7}, {     24, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     25,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     23, 8}, {     47, 9}, \
+    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     47,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     79, 8}, {    159,10}, {     47, 9}, \
+    {     95, 8}, {    191,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255,10}, {     71, 9}, {    143, 8}, \
+    {    287,10}, {     79, 9}, {    159, 8}, {    319,11}, \
+    {     47,10}, {     95, 9}, {    191, 8}, {    383,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287, 8}, {    575,11}, \
+    {     79,10}, {    159, 9}, {    319, 8}, {    639,10}, \
+    {    175, 9}, {    351, 8}, {    703,11}, {     95,10}, \
+    {    191, 9}, {    383, 8}, {    767,10}, {    207, 9}, \
+    {    415,10}, {    223,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    319, 9}, {    639,11}, \
+    {    175,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415,11}, \
+    {    223,10}, {    447,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 97
+#define SQR_FFT_THRESHOLD                 3200
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  56
+#define MULLO_MUL_N_THRESHOLD             8648
+#define SQRLO_BASECASE_THRESHOLD             2
+#define SQRLO_DC_THRESHOLD                 106
+#define SQRLO_SQR_THRESHOLD               6293
+
+#define DC_DIV_QR_THRESHOLD                 28
+#define DC_DIVAPPR_Q_THRESHOLD             102
+#define DC_BDIV_QR_THRESHOLD                51
+#define DC_BDIV_Q_THRESHOLD                124
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               123
+#define INV_APPR_THRESHOLD                 109
+
+#define BINV_NEWTON_THRESHOLD              206
+#define REDC_1_TO_REDC_N_THRESHOLD          51
+
+#define MU_DIV_QR_THRESHOLD                807
+#define MU_DIVAPPR_Q_THRESHOLD             807
+#define MUPI_DIV_QR_THRESHOLD               53
+#define MU_BDIV_QR_THRESHOLD               748
+#define MU_BDIV_Q_THRESHOLD                872
+
+#define POWM_SEC_TABLE  2,23,66,440,1555
+
+#define GET_STR_DC_THRESHOLD                 7
+#define GET_STR_PRECOMPUTE_THRESHOLD        17
+#define SET_STR_DC_THRESHOLD              1035
+#define SET_STR_PRECOMPUTE_THRESHOLD      2170
+
+#define FAC_DSC_THRESHOLD                  542
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         10
+#define HGCD_THRESHOLD                     108
+#define HGCD_APPR_THRESHOLD                116
+#define HGCD_REDUCE_THRESHOLD             1437
+#define GCD_DC_THRESHOLD                   268
+#define GCDEXT_DC_THRESHOLD                241
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm b/third_party/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm
new file mode 100644
index 0000000..ff5f4b3
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/sqr_diagonal.asm
@@ -0,0 +1,117 @@
+dnl  PowerPC-64 mpn_sqr_diagonal.
+
+dnl  Copyright 2001-2003, 2005, 2006, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		18
+C POWER4/PPC970		 ?
+C POWER5		 7.25
+C POWER6		 9.5
+
+C INPUT PARAMETERS
+define(`rp',  r3)
+define(`up',  r4)
+define(`n',   r5)
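+
+C Computes the two-limb square of each input limb: {rp+2*i, 2} = up[i]^2
+C for each of the n input limbs.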
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	n, n, 0, 32')		C zero extend n
+
+	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
+	addi	n, n, 3			C compute count...
+	cmpdi	cr6, r0, 2
+	srdi	n, n, 2			C ...for ctr
+	mtctr	n			C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	ld	r0, 0(up)
+	ld	r10, 8(up)
+	ld	r12, 16(up)
+	addi	rp, rp, -16
+	mulld	r7, r0, r0
+	mulhdu	r8, r0, r0
+	mulld	r9, r10, r10
+	mulhdu	r10, r10, r10
+	mulld	r11, r12, r12
+	mulhdu	r12, r12, r12
+	addi	up, up, 24
+	b	L(11)
+
+	ALIGN(16)
+L(b01):	ld	r0, 0(up)
+	addi	rp, rp, -48
+	addi	up, up, 8
+	mulld	r11, r0, r0
+	mulhdu	r12, r0, r0
+	b	L(01)
+
+	ALIGN(16)
+L(b10):	ld	r0, 0(up)
+	ld	r12, 8(up)
+	addi	rp, rp, -32
+	addi	up, up, 16
+	mulld	r9, r0, r0
+	mulhdu	r10, r0, r0
+	mulld	r11, r12, r12
+	mulhdu	r12, r12, r12
+	b	L(10)
+
+	ALIGN(32)
+L(b00):
+L(top):	ld	r0, 0(up)
+	ld	r8, 8(up)
+	ld	r10, 16(up)
+	ld	r12, 24(up)
+	mulld	r5, r0, r0
+	mulhdu	r6, r0, r0
+	mulld	r7, r8, r8
+	mulhdu	r8, r8, r8
+	mulld	r9, r10, r10
+	mulhdu	r10, r10, r10
+	mulld	r11, r12, r12
+	mulhdu	r12, r12, r12
+	addi	up, up, 32
+	std	r5, 0(rp)
+	std	r6, 8(rp)
+L(11):	std	r7, 16(rp)
+	std	r8, 24(rp)
+L(10):	std	r9, 32(rp)
+	std	r10, 40(rp)
+L(01):	std	r11, 48(rp)
+	std	r12, 56(rp)
+	addi	rp, rp, 64
+	bdnz	L(top)
+
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode32/sub_n.asm b/third_party/gmp/mpn/powerpc64/mode32/sub_n.asm
new file mode 100644
index 0000000..6fdc1d4
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/sub_n.asm
@@ -0,0 +1,88 @@
+dnl  PowerPC-64/mode32 mpn_sub_n -- Subtract two limb vectors of the same
+dnl  length and store difference in a third limb vector.
+
+dnl  Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C POWER3/PPC630:     ?
+C POWER4/PPC970:     4.25
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	mtctr	r6		C copy size into CTR
+	addic	r0, r6, -1	C set cy
+	ld	r8, 0(r4)	C load least significant s1 limb
+	ld	r0, 0(r5)	C load least significant s2 limb
+	addi	r3, r3, -8	C offset res_ptr, it's updated before it's used
+	bdz	L(end)		C If done, skip loop
+
+L(oop):	ld	r9, 8(r4)	C load s1 limb
+	ld	r10, 8(r5)	C load s2 limb
+	subfe	r7, r0, r8	C subtract limbs with cy, set cy
+	srdi	r6, r0, 32
+	srdi	r11, r8, 32
+	subfe	r6, r6, r11
+	std	r7, 8(r3)	C store result limb
+	bdz	L(exit)		C decrement CTR and exit if done
+	ldu	r8, 16(r4)	C load s1 limb and update s1_ptr
+	ldu	r0, 16(r5)	C load s2 limb and update s2_ptr
+	subfe	r7, r10, r9	C subtract limbs with cy, set cy
+	srdi	r6, r10, 32
+	srdi	r11, r9, 32
+	subfe	r6, r6, r11
+	stdu	r7, 16(r3)	C store result limb and update res_ptr
+	bdnz	L(oop)		C decrement CTR and loop back
+
+L(end):	subfe	r7, r0, r8
+	srdi	r6, r0, 32
+	srdi	r11, r8, 32
+	subfe	r6, r6, r11
+	std	r7, 8(r3)	C store ultimate result limb
+	subfe	r3, r0, r0	C load !cy into ...
+	subfic	r4, r3, 0	C ... return value register
+	li	r3, 0		C zero extend return value
+	blr
+L(exit):	subfe	r7, r10, r9
+	srdi	r6, r10, 32
+	srdi	r11, r9, 32
+	subfe	r6, r6, r11
+	std	r7, 16(r3)
+	subfe	r3, r0, r0	C load !cy into ...
+	subfic	r4, r3, 0	C ... return value register
+	li	r3, 0		C zero extend return value
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode32/submul_1.asm b/third_party/gmp/mpn/powerpc64/mode32/submul_1.asm
new file mode 100644
index 0000000..996eda2
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode32/submul_1.asm
@@ -0,0 +1,81 @@
+dnl  PowerPC-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl  the result from a second limb vector.
+
+dnl  Copyright 1999-2001, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C POWER3/PPC630:     ?
+C POWER4/PPC970:     16
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C n	r5
+C v	r6,r7  or  r7,r8
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+
+ifdef(`BROKEN_LONGLONG_PARAM',
+`	rldimi	r8, r7, 32,0	C assemble vlimb from separate 32-bit arguments
+	mr	r6, r8
+',`
+	rldimi	r7, r6, 32,0	C assemble vlimb from separate 32-bit arguments
+	mr	r6, r7
+')
+	li	r7, 0		C cy_limb = 0
+	mtctr	r5
+	addic	r0, r0, 0
+	addi	r3, r3, -8
+	addi	r4, r4, -8
+
+L(oop):	ldu	r0, 8(r4)
+	mulld	r9, r0, r6
+	adde	r12, r9, r7	C add old high limb and new low limb
+	srdi	r5, r9, 32
+	srdi	r11, r7, 32
+	adde	r5, r5, r11	C add high limb parts, set cy
+	mulhdu	r7, r0, r6
+	addze	r7, r7
+	ld	r10, 8(r3)
+	subfc	r9, r12, r10
+	srdi	r5, r12, 32
+	srdi	r11, r10, 32
+	subfe	r5, r5, r11	C subtract high limb parts, set cy
+	stdu	r9, 8(r3)
+	subfe	r11, r11, r11	C invert ...
+	addic	r11, r11, 1	C ... carry
+	bdnz	L(oop)
+
+	addze	r4, r7
+	srdi	r3, r4, 32
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/aors_n.asm b/third_party/gmp/mpn/powerpc64/mode64/aors_n.asm
new file mode 100644
index 0000000..0e8474f
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/aors_n.asm
@@ -0,0 +1,189 @@
+dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          1.5
+C POWER4/PPC970          2
+C POWER5                 2
+C POWER6                 2.63
+C POWER7               2.25-2.87
+
+C This code is a little bit slower for POWER3/PPC630 than the simple code used
+C previously, but it is much faster for POWER4/PPC970.  The reason for the
+C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
+C registers.
+
+C INPUT PARAMETERS
+C rp	r3
+C up	r4
+C vp	r5
+C n	r6
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_add_n)
+  define(func_nc,	mpn_add_nc)
+  define(GENRVAL,	`addi	r3, r3, 1')
+  define(SETCBR,	`addic	r0, $1, -1')
+  define(CLRCB,		`addic	r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_sub_n)
+  define(func_nc,	mpn_sub_nc)
+  define(GENRVAL,	`neg	r3, r3')
+  define(SETCBR,	`subfic	r0, $1, 0')
+  define(CLRCB,		`addic	r0, r1, -1')
+')
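+
+C CLRCB clears CA for addition and sets it for subtraction (r1, the stack
+C pointer, is known nonzero).  SETCBR converts the carry-in argument of the
+C _nc entry points into CA: nonzero sets CA for add_nc, while zero sets CA
+C (meaning no borrow) for sub_nc.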
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	SETCBR(r7)
+	b	L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+	CLRCB
+L(ent):	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+
+	rldicl.	r0, r6, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	r6, r6, 3	C compute count...
+	srdi	r6, r6, 2	C ...for ctr
+	mtctr	r6		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	ld	r8, 0(r4)	C load s1 limb
+	ld	r9, 0(r5)	C load s2 limb
+	ld	r10, 8(r4)	C load s1 limb
+	ld	r11, 8(r5)	C load s2 limb
+	ld	r12, 16(r4)	C load s1 limb
+	addi	r4, r4, 24
+	ld	r0, 16(r5)	C load s2 limb
+	addi	r5, r5, 24
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	std	r29, 0(r3)
+	std	r30, 8(r3)
+	std	r31, 16(r3)
+	addi	r3, r3, 24
+	bdnz	L(go)
+	b	L(ret)
+
+L(b01):	ld	r12, 0(r4)	C load s1 limb
+	addi	r4, r4, 8
+	ld	r0, 0(r5)	C load s2 limb
+	addi	r5, r5, 8
+	ADDSUBC	r31, r0, r12	C add
+	std	r31, 0(r3)
+	addi	r3, r3, 8
+	bdnz	L(go)
+	b	L(ret)
+
+L(b10):	ld	r10, 0(r4)	C load s1 limb
+	ld	r11, 0(r5)	C load s2 limb
+	ld	r12, 8(r4)	C load s1 limb
+	addi	r4, r4, 16
+	ld	r0, 8(r5)	C load s2 limb
+	addi	r5, r5, 16
+	ADDSUBC	r30, r11, r10	C add
+	ADDSUBC	r31, r0, r12	C add
+	std	r30, 0(r3)
+	std	r31, 8(r3)
+	addi	r3, r3, 16
+	bdnz	L(go)
+	b	L(ret)
+
+L(b00):	C INITCY		C clear/set cy
+L(go):	ld	r6, 0(r4)	C load s1 limb
+	ld	r7, 0(r5)	C load s2 limb
+	ld	r8, 8(r4)	C load s1 limb
+	ld	r9, 8(r5)	C load s2 limb
+	ld	r10, 16(r4)	C load s1 limb
+	ld	r11, 16(r5)	C load s2 limb
+	ld	r12, 24(r4)	C load s1 limb
+	ld	r0, 24(r5)	C load s2 limb
+	bdz	L(end)
+
+	addi	r4, r4, 32
+	addi	r5, r5, 32
+
+	ALIGN(16)
+L(top):	ADDSUBC	r28, r7, r6
+	ld	r6, 0(r4)	C load s1 limb
+	ld	r7, 0(r5)	C load s2 limb
+	ADDSUBC	r29, r9, r8
+	ld	r8, 8(r4)	C load s1 limb
+	ld	r9, 8(r5)	C load s2 limb
+	ADDSUBC	r30, r11, r10
+	ld	r10, 16(r4)	C load s1 limb
+	ld	r11, 16(r5)	C load s2 limb
+	ADDSUBC	r31, r0, r12
+	ld	r12, 24(r4)	C load s1 limb
+	ld	r0, 24(r5)	C load s2 limb
+	std	r28, 0(r3)
+	addi	r4, r4, 32
+	std	r29, 8(r3)
+	addi	r5, r5, 32
+	std	r30, 16(r3)
+	std	r31, 24(r3)
+	addi	r3, r3, 32
+	bdnz	L(top)		C decrement ctr and loop back
+
+L(end):	ADDSUBC	r28, r7, r6
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	std	r28, 0(r3)
+	std	r29, 8(r3)
+	std	r30, 16(r3)
+	std	r31, 24(r3)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+
+	subfe	r3, r0, r0	C -cy
+	GENRVAL
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/aorsmul_1.asm b/third_party/gmp/mpn/powerpc64/mode64/aorsmul_1.asm
new file mode 100644
index 0000000..0c12f9b
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/aorsmul_1.asm
@@ -0,0 +1,225 @@
+dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 1999-2001, 2003-2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   mpn_addmul_1    mpn_submul_1
+C                   cycles/limb     cycles/limb
+C POWER3/PPC630		6-18		6-18
+C POWER4/PPC970		 8		 8.3
+C POWER5		 8		 8.25
+C POWER6		16.25		16.75
+C POWER7		 3.77		 4.9
+
+C TODO
+C  * Try to reduce the number of needed live registers
+C  * Add support for _1c entry points
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`vl', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_addmul_1)
+  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
+  define(SM,		`')
+')
+ifdef(`OPERATION_submul_1',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_submul_1)
+  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
+  define(SM,		`$1')
+')
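+
+C For submul_1 the SM() lines bridge the subfc/subfe borrow chain and the
+C adde-based product accumulation: subfe r11,r11,r11 saves CA as 0 or -1,
+C and addic r11,r11,1 converts it back into CA, inverted (our reading; the
+C mode32 submul_1 loop comments the same idiom).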
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	std	r31, -8(r1)
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	std	r30, -16(r1)
+	cmpdi	cr6, r0, 2
+	std	r29, -24(r1)
+	addi	n, n, 3		C compute count...
+	std	r28, -32(r1)
+	srdi	n, n, 2		C ...for ctr
+	std	r27, -40(r1)
+	mtctr	n		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r12, r9, r6
+	ADDSUB	r0, r0, r28
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	addi	up, up, 24
+SM(`	subfe	r11, r11, r11 ')
+	b	L(bot)
+
+	ALIGN(16)
+L(b00):	ld	r9, 0(up)
+	ld	r27, 8(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	mulld	r0, r9, r6
+	mulhdu	r5, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	addc	r7, r7, r5
+	addze	r12, r8
+	ADDSUB	r0, r0, r28
+	std	r0, 0(rp)
+	ADDSUBC	r7, r7, r29
+	std	r7, 8(rp)
+	addi	rp, rp, 16
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+SM(`	subfe	r11, r11, r11 ')
+	b	L(bot)
+
+	ALIGN(16)
+L(b01):	bdnz	L(gt1)
+	ld	r9, 0(up)
+	ld	r11, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r8, r9, r6
+	ADDSUB	r0, r0, r11
+	std	r0, 0(rp)
+SM(`	subfe	r11, r11, r11 ')
+SM(`	addic	r11, r11, 1 ')
+	addze	r3, r8
+	blr
+L(gt1):	ld	r9, 0(up)
+	ld	r27, 8(up)
+	mulld	r0, r9, r6
+	mulhdu	r5, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 16(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	ld	r30, 16(rp)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r5
+	adde	r11, r11, r8
+	addze	r12, r10
+	ADDSUB	r0, r0, r28
+	std	r0, 0(rp)
+	ADDSUBC	r7, r7, r29
+	std	r7, 8(rp)
+	ADDSUBC	r11, r11, r30
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+SM(`	subfe	r11, r11, r11 ')
+	b	L(bot)
+
+L(b10):	addic	r0, r0, 0
+	li	r12, 0		C cy_limb = 0
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	bdz	L(end)
+	addi	up, up, 16
+
+	ALIGN(16)
+L(top):	mulld	r0, r9, r6
+	mulhdu	r5, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r5	C 5 7
+	mulld	r5, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r5, r5, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	ADDSUB	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	ADDSUBC	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	ADDSUBC	r5, r5, r30	C 5 30
+	std	r5, 16(rp)	C 5
+	ADDSUBC	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+SM(`	subfe	r11, r11, r11 ')
+	addi	rp, rp, 32
+L(bot):
+SM(`	addic	r11, r11, 1 ')
+	bdnz	L(top)
+
+L(end):	mulld	r0, r9, r6
+	mulhdu	r5, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r5
+	addze	r8, r8
+	ADDSUB	r0, r0, r28
+	std	r0, 0(rp)
+	ADDSUBC	r7, r7, r29
+	std	r7, 8(rp)
+SM(`	subfe	r11, r11, r11 ')
+SM(`	addic	r11, r11, 1 ')
+	addze	r3, r8
+	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm b/third_party/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..2c5400a
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl  PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,		1)
+define(RSH,		63)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm b/third_party/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..447791a
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl  PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,		2)
+define(RSH,		62)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`powerpc64/mode64/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm b/third_party/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm
new file mode 100644
index 0000000..6158f54
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/aorsorrlshC_n.asm
@@ -0,0 +1,187 @@
+dnl  PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C                  cycles/limb
+C POWER3/PPC630          1.83   (1.5 c/l should be possible)
+C POWER4/PPC970          3      (2.0 c/l should be possible)
+C POWER5                 3
+C POWER6              3.5-47
+C POWER7                 3
+
+C STATUS
+C  * Try combining upx+up, and vpx+vp.
+C  * The worst case 47 c/l for POWER6 happens if the 3rd operand for ldx is
+C    greater than the 2nd operand.  Yes, this addition is non-commutative wrt
+C    performance.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n',  `r6')
+
+ifdef(`DO_add', `
+  define(`ADDSUBC',	`addc	$1, $2, $3')
+  define(`ADDSUBE',	`adde	$1, $2, $3')
+  define(INITCY,	`addic	$1, r1, 0')
+  define(RETVAL,	`addze	r3, $1')
+  define(`func',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUBC',	`subfc	$1, $2, $3')
+  define(`ADDSUBE',	`subfe	$1, $2, $3')
+  define(INITCY,	`addic	$1, r1, -1')
+  define(RETVAL,	`subfze	r3, $1
+			neg	r3, r3')
+  define(`func',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUBC',	`subfc	$1, $3, $2')
+  define(`ADDSUBE',	`subfe	$1, $3, $2')
+  define(INITCY,	`addic	$1, r1, -1')
+  define(RETVAL,	`addme	r3, $1')
+  define(`func',	mpn_rsblsh`'LSH`'_n)')
+
+define(`rpx', `r6')
+define(`upx', `r7')
+define(`vpx', `r12')
+
+define(`s0', `r0')  define(`s1', `r9')
+define(`u0', `r8')
+define(`v0', `r10') define(`v1', `r11')
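+
+C Computed function (for orientation): {rp,n} = {up,n} + ({vp,n} << LSH)
+C for addlsh, {up,n} - ({vp,n} << LSH) for sublsh, and ({vp,n} << LSH) -
+C {up,n} for rsblsh, with the carry/borrow out of the top limb returned.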
+
+
+ASM_START()
+PROLOGUE(func)
+	cmpldi	cr0, n, 13
+	bgt	L(big)
+
+	mtctr	n		C copy n in ctr
+	INITCY(	r0)		C clear cy
+
+	ld	v0, 0(vp)	C load v limb
+	ld	u0, 0(up)	C load u limb
+	addi	up, up, -8	C update up
+	addi	rp, rp, -8	C update rp
+	sldi	s1, v0, LSH
+	bdz	L(ex1)		C If done, skip loop
+
+	ALIGN(16)
+L(lo0):	ld	v1, 8(vp)	C load v limb
+	ADDSUBE(s1, s1, u0)	C add limbs with cy, set cy
+	ldu	u0, 16(up)	C load u limb and update up
+	srdi	s0, v0, RSH	C shift down previous v limb
+	std	s1, 8(rp)	C store result limb
+	rldimi	s0, v1, LSH, 0	C left shift v limb and merge with prev v limb
+	bdz	L(ex0)		C decrement ctr and exit if done
+	ldu	v0, 16(vp)	C load v limb and update vp
+	ADDSUBE(s0, s0, u0)	C add limbs with cy, set cy
+	ld	u0, 8(up)	C load u limb
+	srdi	s1, v1, RSH	C shift down previous v limb
+	stdu	s0, 16(rp)	C store result limb and update rp
+	rldimi	s1, v0, LSH, 0	C left shift v limb and merge with prev v limb
+	bdnz	L(lo0)		C decrement ctr and loop back
+
+L(ex1):	ADDSUBE(r7, s1, u0)
+	std	r7, 8(rp)	C store last result limb
+	srdi	r0, v0, RSH
+	RETVAL(	r0)
+	blr
+L(ex0):	ADDSUBE(r7, s0, u0)
+	std	r7, 16(rp)	C store last result limb
+	srdi	r0, v1, RSH
+	RETVAL(	r0)
+	blr
+
+
+L(big):	rldicl.	r0, n, 0,63	C r0 = n & 1, set cr0
+	addi	r6, n, -1	C ...for ctr
+	srdi	r6, r6, 1	C ...for ctr
+	mtctr	r6		C copy count into ctr
+	beq	cr0, L(b0)
+
+L(b1):	ld	v1, 0(vp)
+	ld	u0, 0(up)
+	sldi	s1, v1, LSH
+	srdi	s0, v1, RSH
+	ld	v0, 8(vp)
+	ADDSUBC(s1, s1, u0)	C add limbs without cy, set cy
+	addi	rpx, rp, -16
+	addi	rp, rp, -8
+	sub	upx, up, rp
+	sub	vpx, vp, rp
+	sub	up, up, rpx
+	sub	vp, vp, rpx
+	addi	up, up, 8
+	addi	upx, upx, 16
+	addi	vp, vp, 16
+	addi	vpx, vpx, 24
+	b	L(mid)
+
+L(b0):	ld	v0, 0(vp)
+	ld	u0, 0(up)
+	sldi	s0, v0, LSH
+	srdi	s1, v0, RSH
+	ld	v1, 8(vp)
+	ADDSUBC(s0, s0, u0)	C add limbs without cy, set cy
+	addi	rpx, rp, -8
+	addi	rp, rp, -16
+	sub	upx, up, rpx
+	sub	vpx, vp, rpx
+	sub	up, up, rp
+	sub	vp, vp, rp
+	addi	up, up, 8
+	addi	upx, upx, 16
+	addi	vp, vp, 16
+	addi	vpx, vpx, 24
+
+	ALIGN(32)
+L(top):	ldx	u0, rp, up
+	ldx	v0, rp, vp
+	rldimi	s1, v1, LSH, 0
+	stdu	s0, 16(rp)
+	srdi	s0, v1, RSH
+	ADDSUBE(s1, s1, u0)	C add limbs with cy, set cy
+L(mid):	ldx	u0, rpx, upx
+	ldx	v1, rpx, vpx
+	rldimi	s0, v0, LSH, 0
+	stdu	s1, 16(rpx)
+	srdi	s1, v0, RSH
+	ADDSUBE(s0, s0, u0)	C add limbs with cy, set cy
+	bdnz	L(top)		C decrement CTR and loop back
+
+	ldx	u0, rp, up
+	rldimi	s1, v1, LSH, 0
+	std	s0, 16(rp)
+	srdi	s0, v1, RSH
+	ADDSUBE(s1, s1, u0)	C add limbs with cy, set cy
+	std	s1, 24(rp)
+
+	RETVAL(	s0)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm b/third_party/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..45cded9
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/bdiv_dbm1c.asm
@@ -0,0 +1,132 @@
+dnl  PPC64 mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                 cycles/limb
+C POWER3/PPC630       6-18
+C POWER4/PPC970       8.25
+C POWER5              8.5  fluctuating as function of n % 3
+C POWER6             15
+C POWER7              4.75
+
+C TODO
+C  * Nothing to do...
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`bd', `r6')
+define(`cy', `r7')
+
+ASM_START()
+PROLOGUE(mpn_bdiv_dbm1c)
+	ld	r0, 0(r4)
+
+	rldicl.	r12, r5, 0,62
+	cmpldi	cr6, r12, 2
+	cmpldi	cr7, r5, 4
+	addi	r5, r5, 1
+	srwi	r5, r5, 2
+	mtctr	r5
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+	ALIGN(16)
+L(b11):	mulld	r5, r0, r6
+	mulhdu	r12, r0, r6
+	ld	r0, 8(r4)
+	addi	r4, r4, -24
+	addi	r3, r3, -24
+	b	L(3)
+
+	ALIGN(16)
+L(b00):	mulld	r9, r0, r6
+	mulhdu	r8, r0, r6
+	addi	r4, r4, -16
+	addi	r3, r3, -16
+	b	L(0)
+
+	ALIGN(16)
+L(b01):	mulld	r5, r0, r6
+	mulhdu	r12, r0, r6
+	addi	r3, r3, -8
+	ble	cr7, L(e1)
+	ld	r0, 8(r4)
+	addi	r4, r4, -8
+	b	L(1)
+
+	ALIGN(16)
+L(b10):	mulld	r9, r0, r6
+	mulhdu	r8, r0, r6
+	ble	cr7, L(e2)
+
+	ALIGN(16)
+L(top):	subfc	r11, r9, r7
+	ld	r10, 8(r4)
+	ld	r0, 16(r4)
+	subfe	r7, r8, r11
+	std	r11, 0(r3)
+	mulld	r5, r10, r6
+	mulhdu	r12, r10, r6
+L(1):	mulld	r9, r0, r6
+	mulhdu	r8, r0, r6
+	subfc	r11, r5, r7
+	subfe	r7, r12, r11
+	std	r11, 8(r3)
+L(0):	subfc	r11, r9, r7
+	ld	r10, 24(r4)
+	ld	r0, 32(r4)
+	subfe	r7, r8, r11
+	std	r11, 16(r3)
+	mulld	r5, r10, r6
+	mulhdu	r12, r10, r6
+L(3):	mulld	r9, r0, r6
+	mulhdu	r8, r0, r6
+	subfc	r11, r5, r7
+	subfe	r7, r12, r11
+	std	r11, 24(r3)
+	addi	r4, r4, 32
+	addi	r3, r3, 32
+	bdnz	L(top)
+
+L(e2):	ld	r10, 8(r4)
+	mulld	r5, r10, r6
+	mulhdu	r12, r10, r6
+	subfc	r11, r9, r7
+	subfe	r7, r8, r11
+	std	r11, 0(r3)
+L(e1):	subfc	r11, r5, r7
+	std	r11, 8(r3)
+	subfe	r3, r12, r11
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm b/third_party/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm
new file mode 100644
index 0000000..307aafc
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/bdiv_q_1.asm
@@ -0,0 +1,146 @@
+dnl  PowerPC-64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb
+dnl  divisor.
+
+dnl  Copyright 2006, 2010, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			cycles/limb
+C			norm	unorm
+C POWER3/PPC630	       13-19
+C POWER4/PPC970		16
+C POWER5		16	16
+C POWER6		37	46
+C POWER7		12	12
+C POWER8		12	12
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`d',  `r6')
+define(`di', `r7')
+define(`cnt',`r8')
+
+define(`tnc',`r10')
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_bdiv_q_1,toc)
+	addi	r7, n, -1
+	cmpdi	cr1, n, 1
+	ld	r12, 0(up)
+	li	cnt, 0
+	neg	r0, d
+	and	r0, d, r0
+	cntlzd	r0, r0
+	subfic	cnt, r0, 63
+	srd	d, d, cnt
+L(7):
+	mtctr	r7
+	LEA(	r10, binvert_limb_table)
+	rldicl	r11, d, 63, 57
+	lbzx	r0, r10, r11
+	mulld	r9, r0, r0
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	r0, r9, r0
+	mulld	r10, r0, r0
+	sldi	r0, r0, 1
+	mulld	r10, d, r10
+	subf	r0, r10, r0
+	mulld	r9, r0, r0
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	di, r9, r0		C di = 1/d mod 2^64
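+C
+C The three mul/shift/subtract rounds above are Newton iterations for the
+C inverse of the (now odd) d modulo 2^64; each round maps v to
+C 2*v - v*v*d, doubling the number of correct low bits.  Roughly, in C
+C (a sketch, not from the GMP sources):
+C
+C     uint64_t v = binvert_limb_table[(d >> 1) & 0x7F]; /*  8 bits */
+C     v = 2*v - v*v*d;                                  /* 16 bits */
+C     v = 2*v - v*v*d;                                  /* 32 bits */
+C     v = 2*v - v*v*d;                                  /* 64 bits: di */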
+ifdef(`AIX',
+`	C For AIX it is not clear how to jump into another function.
+	b	.mpn_pi1_bdiv_q_1
+',`
+	C For non-AIX, dispatch into the pi1 variant.  cr0 is set here from
+	C the shift count, since nothing above leaves it in a defined state.
+	cmpdi	cr0, cnt, 0
+	beq	cr0, L(norm)
+	b	L(unorm)
+')
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	cmpdi	cr0, cnt, 0
+	ld	r12, 0(up)
+	addic	r0, n, -1		C set carry as side effect
+	cmpdi	cr1, n, 1
+	mtctr	r0
+	beq	cr0, L(norm)
+
+L(unorm):
+	subfic	tnc, cnt, 64		C set carry as side effect
+	li	r5, 0
+	srd	r11, r12, cnt
+	beq	cr1, L(ed1)
+
+	ALIGN(16)
+L(tpu):	ld	r12, 8(up)
+	nop
+	addi	up, up, 8
+	sld	r0, r12, tnc
+	or	r11, r11, r0
+	subfe	r9, r5, r11
+	srd	r11, r12, cnt
+	mulld	r0, di, r9
+	mulhdu	r5, r0, d
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(tpu)
+
+	subfe	r11, r5, r11
+L(ed1):	mulld	r0, di, r11
+	std	r0, 0(rp)
+	blr
+
+	ALIGN(16)
+L(norm):
+	mulld	r11, r12, di
+	mulhdu	r5, r11, d
+	std	r11, 0(rp)
+	beqlr	cr1
+
+	ALIGN(16)
+L(tpn):	ld	r9, 8(up)
+	addi	up, up, 8
+	subfe	r5, r5, r9
+	mulld	r11, di, r5
+	mulhdu	r5, r11, d	C result not used in last iteration
+	std	r11, 8(rp)
+	addi	rp, rp, 8
+	bdnz	L(tpn)
+
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm b/third_party/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm
new file mode 100644
index 0000000..24968c1
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/cnd_aors_n.asm
@@ -0,0 +1,196 @@
+dnl  PowerPC-64 mpn_cnd_add_n/mpn_cnd_sub_n.
+
+dnl  Copyright 1999-2001, 2003-2005, 2007, 2011, 2012 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          2.25
+C POWER5                 ?
+C POWER6                 3
+C POWER7                 2
+
+C INPUT PARAMETERS
+define(`cnd',  `r3')
+define(`rp',   `r4')
+define(`up',   `r5')
+define(`vp',   `r6')
+define(`n',    `r7')
+
+ifdef(`OPERATION_cnd_add_n',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_cnd_add_n)
+  define(GENRVAL,	`addi	r3, r3, 1')
+  define(SETCBR,	`addic	r0, $1, -1')
+  define(CLRCB,		`addic	r0, r0, 0')
+')
+ifdef(`OPERATION_cnd_sub_n',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_cnd_sub_n)
+  define(GENRVAL,	`neg	r3, r3')
+  define(SETCBR,	`subfic	r0, $1, 0')
+  define(CLRCB,		`addic	r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+
+	subfic	cnd, cnd, 0
+	subfe	cnd, cnd, cnd
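+C
+C The subfic/subfe pair above turns a nonzero cnd into an all-ones mask
+C and zero into 0; the v limbs are ANDed with this mask, so with cnd zero
+C the add/subtract degenerates into a copy of the u operand, with no
+C data-dependent branch on cnd.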
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 3		C compute count...
+	srdi	n, n, 2		C ...for ctr
+	mtctr	n		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	ld	r8, 0(up)	C load s1 limb
+	ld	r9, 0(vp)	C load s2 limb
+	ld	r10, 8(up)	C load s1 limb
+	ld	r11, 8(vp)	C load s2 limb
+	ld	r12, 16(up)	C load s1 limb
+	addi	up, up, 24
+	ld	r0, 16(vp)	C load s2 limb
+	addi	vp, vp, 24
+	and	r9, r9, cnd
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	ADDSUB	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	std	r29, 0(rp)
+	std	r30, 8(rp)
+	std	r31, 16(rp)
+	addi	rp, rp, 24
+	bdnz	L(go)
+	b	L(ret)
+
+L(b01):	ld	r12, 0(up)	C load s1 limb
+	addi	up, up, 8
+	ld	r0, 0(vp)	C load s2 limb
+	addi	vp, vp, 8
+	and	r0, r0, cnd
+	ADDSUB	r31, r0, r12	C add
+	std	r31, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(go)
+	b	L(ret)
+
+L(b10):	ld	r10, 0(up)	C load s1 limb
+	ld	r11, 0(vp)	C load s2 limb
+	ld	r12, 8(up)	C load s1 limb
+	addi	up, up, 16
+	ld	r0, 8(vp)	C load s2 limb
+	addi	vp, vp, 16
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	ADDSUB	r30, r11, r10	C add
+	ADDSUBC	r31, r0, r12	C add
+	std	r30, 0(rp)
+	std	r31, 8(rp)
+	addi	rp, rp, 16
+	bdnz	L(go)
+	b	L(ret)
+
+L(b00):	CLRCB			C clear/set cy
+L(go):	ld	r7, 0(up)	C load s1 limb
+	ld	r27, 0(vp)	C load s2 limb
+	ld	r8, 8(up)	C load s1 limb
+	ld	r9, 8(vp)	C load s2 limb
+	ld	r10, 16(up)	C load s1 limb
+	ld	r11, 16(vp)	C load s2 limb
+	ld	r12, 24(up)	C load s1 limb
+	ld	r0, 24(vp)	C load s2 limb
+	and	r27, r27, cnd
+	and	r9, r9, cnd
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	bdz	L(end)
+
+	addi	up, up, 32
+	addi	vp, vp, 32
+
+L(top):	ADDSUBC	r28, r27, r7
+	ld	r7, 0(up)	C load s1 limb
+	ld	r27, 0(vp)	C load s2 limb
+	ADDSUBC	r29, r9, r8
+	ld	r8, 8(up)	C load s1 limb
+	ld	r9, 8(vp)	C load s2 limb
+	ADDSUBC	r30, r11, r10
+	ld	r10, 16(up)	C load s1 limb
+	ld	r11, 16(vp)	C load s2 limb
+	ADDSUBC	r31, r0, r12
+	ld	r12, 24(up)	C load s1 limb
+	ld	r0, 24(vp)	C load s2 limb
+	std	r28, 0(rp)
+	addi	up, up, 32
+	std	r29, 8(rp)
+	addi	vp, vp, 32
+	std	r30, 16(rp)
+	std	r31, 24(rp)
+	addi	rp, rp, 32
+	and	r27, r27, cnd
+	and	r9, r9, cnd
+	and	r11, r11, cnd
+	and	r0, r0, cnd
+	bdnz	L(top)		C decrement ctr and loop back
+
+L(end):	ADDSUBC	r28, r27, r7
+	ADDSUBC	r29, r9, r8
+	ADDSUBC	r30, r11, r10
+	ADDSUBC	r31, r0, r12
+	std	r28, 0(rp)
+	std	r29, 8(rp)
+	std	r30, 16(rp)
+	std	r31, 24(rp)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+
+	subfe	r3, r0, r0	C -cy
+	GENRVAL
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/dive_1.asm b/third_party/gmp/mpn/powerpc64/mode64/dive_1.asm
new file mode 100644
index 0000000..c2d10bd
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/dive_1.asm
@@ -0,0 +1,135 @@
+dnl  PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2006, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			cycles/limb
+C			norm	unorm
+C POWER3/PPC630	       13-19
+C POWER4/PPC970		16
+C POWER5		16	16
+C POWER6		37	46
+C POWER7		12	12
+C POWER8		12	12
+
+C TODO
+C  * Check if n=1 code is really an improvement.  It probably isn't.
+C  * Make more similar to mode1o.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`d',  `r6')
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_divexact_1,toc)
+	addic.	n, n, -1
+	ld	r12, 0(up)
+	bne	cr0, L(2)
+	divdu	r0, r12, d
+	std	r0, 0(rp)
+	blr
+L(2):
+	rldicl.	r0, d, 0, 63
+	li	r10, 0
+	bne	cr0, L(7)
+	neg	r0, d
+	and	r0, d, r0
+	cntlzd	r0, r0
+	subfic	r0, r0, 63
+	rldicl	r10, r0, 0, 32
+	srd	d, d, r0
+L(7):
+	mtctr	n
+	LEA(	r5, binvert_limb_table)
+	rldicl	r11, d, 63, 57
+	lbzx	r0, r5, r11
+	mulld	r9, r0, r0
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	r0, r9, r0
+	mulld	r5, r0, r0
+	sldi	r0, r0, 1
+	mulld	r5, d, r5
+	subf	r0, r5, r0
+	mulld	r9, r0, r0
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	r7, r9, r0		C r7 = 1/d mod 2^64
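+C (Same Newton iteration for 1/d mod 2^64 as in bdiv_q_1.asm: each round
+C maps v to 2*v - v*v*d, doubling the correct low bits, 8 -> 16 -> 32 -> 64.)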
+
+	bne	cr0, L(norm)
+	subfic	r8, r10, 64		C set carry as side effect
+	li	r5, 0
+	srd	r11, r12, r10
+
+	ALIGN(16)
+L(loop0):
+	ld	r12, 8(up)
+	nop
+	addi	up, up, 8
+	sld	r0, r12, r8
+	or	r11, r11, r0
+	subfe	r9, r5, r11
+	srd	r11, r12, r10
+	mulld	r0, r7, r9
+	mulhdu	r5, r0, d
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(loop0)
+
+	subfe	r0, r5, r11
+	mulld	r0, r7, r0
+	std	r0, 0(rp)
+	blr
+
+	ALIGN(16)
+L(norm):
+	mulld	r11, r12, r7
+	mulhdu	r5, r11, d
+	std	r11, 0(rp)
+	ALIGN(16)
+L(loop1):
+	ld	r9, 8(up)
+	addi	up, up, 8
+	subfe	r5, r5, r9
+	mulld	r11, r7, r5
+	mulhdu	r5, r11, d	C result not used in last iteration
+	std	r11, 8(rp)
+	addi	rp, rp, 8
+	bdnz	L(loop1)
+
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/divrem_1.asm b/third_party/gmp/mpn/powerpc64/mode64/divrem_1.asm
new file mode 100644
index 0000000..b283877
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/divrem_1.asm
@@ -0,0 +1,274 @@
+dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                           cycles/limb
+C                       norm    unorm   frac
+C POWER3/PPC630         16-34   16-34   ~11   outdated figures
+C POWER4/PPC970          28      28      19
+C POWER5                 29      29     ~19
+C POWER6                 49      59     ~42
+C POWER7                 24.5    23     ~14
+
+C INPUT PARAMETERS
+C qp  = r3
+C fn  = r4
+C up  = r5
+C un  = r6
+C d   = r7
+
+C We use a not very predictable branch in the frac code, therefore the cycle
+C count wobbles somewhat.  With the alternative branch-free code, things run
+C considerably slower on POWER4/PPC970 and POWER5.
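+C
+C The quotient loops below use the usual precomputed-inverse division:
+C with dinv = mpn_invert_limb(d), a candidate quotient is formed roughly
+C as mulhdu(r, dinv) + r + 1, the remainder as u - q*d, and the first
+C adjustment is applied branch-free through a subfc/subfe mask; only the
+C rare r >= d case takes the predicted-not-taken fixup branches at the
+C bottom of the file.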
+
+C TODO
+C  * Add preinv entry point.
+
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_1,toc)
+
+	mfcr	r12
+	add.	r10, r6, r4
+	std	r25, -56(r1)
+	mr	r25, r4
+	mflr	r0
+	std	r26, -48(r1)
+	mr	r26, r5
+	std	r28, -32(r1)
+	mr	r28, r6
+	std	r29, -24(r1)
+	mr	r29, r3
+	li	r3, 0
+	std	r30, -16(r1)
+	mr	r30, r7
+	std	r31, -8(r1)
+	li	r31, 0
+	std	r27, -40(r1)
+	std	r0, 16(r1)
+	stw	r12, 8(r1)
+	stdu	r1, -176(r1)
+	beq-	cr0, L(1)
+	cmpdi	cr7, r7, 0
+	sldi	r0, r10, 3
+	add	r11, r0, r29
+	addi	r29, r11, -8
+	blt-	cr7, L(162)
+	cmpdi	cr4, r6, 0
+	beq+	cr4, L(71)
+L(163):
+	sldi	r9, r6, 3
+	add	r9, r9, r5
+	ld	r7, -8(r9)
+	cmpld	cr7, r7, r30
+	bge-	cr7, L(71)
+	cmpdi	cr7, r10, 1
+	li	r0, 0
+	mr	r31, r7
+	std	r0, -8(r11)
+	addi	r29, r29, -8
+	mr	r3, r7
+	beq-	cr7, L(1)
+	addi	r28, r6, -1
+	cmpdi	cr4, r28, 0
+L(71):
+	cntlzd	r27, r30
+	sld	r30, r30, r27
+	sld	r31, r31, r27
+	mr	r3, r30
+	CALL(	mpn_invert_limb)
+	beq-	cr4, L(110)
+	sldi	r9, r28, 3
+	addic.	r6, r28, -2
+	add	r9, r9, r26
+	subfic	r5, r27, 64
+	ld	r8, -8(r9)
+	srd	r0, r8, r5
+	or	r31, r31, r0
+	sld	r7, r8, r27
+	blt-	cr0, L(154)
+	addi	r28, r28, -1
+	mtctr	r28
+	sldi	r6, r6, 3
+	ALIGN(16)
+L(uloop):
+	ldx	r8, r26, r6
+	nop
+	mulld	r0, r31, r3
+	mulhdu	r10, r31, r3
+	addi	r11, r31, 1
+	srd	r9, r8, r5
+	addi	r6, r6, -8
+	or	r9, r7, r9
+	addc	r0, r0, r9
+	adde	r10, r10, r11
+	mulld	r31, r10, r30
+	subf	r31, r31, r9
+	subfc	r0, r31, r0	C r <= ql
+	subfe	r0, r0, r0	C r0 = -(r <= ql)
+	and	r9, r30, r0
+	add	r31, r31, r9
+	add	r10, r0, r10	C qh -= (r >= ql)
+	cmpld	cr7, r31, r30
+	bge-	cr7, L(164)
+L(123):
+	std	r10, 0(r29)
+	addi	r29, r29, -8
+	sld	r7, r8, r27
+	bdnz	L(uloop)
+L(154):
+	addi	r11, r31, 1
+	nop
+	mulld	r0, r31, r3
+	mulhdu	r8, r31, r3
+	addc	r0, r0, r7
+	adde	r8, r8, r11
+	mulld	r31, r8, r30
+	subf	r31, r31, r7
+	subfc	r0, r0, r31	C r >= ql
+	subfe	r0, r0, r0	C r0 = -(r >= ql)
+	not	r7, r0
+	add	r8, r7, r8	C qh -= (r >= ql)
+	andc	r0, r30, r0
+	add	r31, r31, r0
+	cmpld	cr7, r31, r30
+	bge-	cr7, L(165)
+L(134):
+	std	r8, 0(r29)
+	addi	r29, r29, -8
+L(110):
+	addic.	r0, r25, -1
+	blt-	cr0, L(156)
+	mtctr	r25
+	neg	r9, r30
+	ALIGN(16)
+L(ufloop):
+	addi	r11, r31, 1
+	nop
+	mulld	r0, r3, r31
+	mulhdu	r10, r3, r31
+	add	r10, r10, r11
+	mulld	r31, r9, r10
+ifelse(0,1,`
+	subfc	r0, r0, r31
+	subfe	r0, r0, r0	C r0 = -(r >= ql)
+	not	r7, r0
+	add	r10, r7, r10	C qh -= (r >= ql)
+	andc	r0, r30, r0
+	add	r31, r31, r0
+',`
+	cmpld	cr7, r31, r0
+	blt	cr7, L(29)
+	add	r31, r30, r31
+	addi	r10, r10, -1
+L(29):
+')
+	std	r10, 0(r29)
+	addi	r29, r29, -8
+	bdnz	L(ufloop)
+L(156):
+	srd	r3, r31, r27
+L(1):
+	addi	r1, r1, 176
+	ld	r0, 16(r1)
+	lwz	r12, 8(r1)
+	mtlr	r0
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	mtcrf	8, r12
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+L(162):
+	cmpdi	cr7, r6, 0
+	beq-	cr7, L(8)
+	sldi	r9, r6, 3
+	addi	r29, r29, -8
+	add	r9, r9, r5
+	addi	r28, r6, -1
+	ld	r31, -8(r9)
+	subfc	r9, r7, r31
+	li	r9, 0
+	adde	r9, r9, r9
+	neg	r0, r9
+	std	r9, -8(r11)
+	and	r0, r0, r7
+	subf	r31, r0, r31
+L(8):
+	mr	r3, r30
+	CALL(	mpn_invert_limb)
+	li	r27, 0
+	addic.	r6, r28, -1
+	blt-	cr0, L(110)
+	mtctr	r28
+	sldi	r6, r6, 3
+	ALIGN(16)
+L(nloop):
+	addi	r11, r31, 1
+	ldx	r8, r26, r6
+	mulld	r0, r31, r3
+	mulhdu	r10, r31, r3
+	addi	r6, r6, -8
+	addc	r0, r0, r8
+	adde	r10, r10, r11
+	mulld	r31, r10, r30
+	subf	r31, r31, r8	C r = nl - qh * d
+	subfc	r0, r31, r0	C r <= ql
+	subfe	r0, r0, r0	C r0 = -(r <= ql)
+	and	r9, r30, r0
+	add	r31, r31, r9
+	add	r10, r0, r10	C qh -= (r >= ql)
+	cmpld	cr7, r31, r30
+	bge-	cr7, L(167)
+L(51):
+	std	r10, 0(r29)
+	addi	r29, r29, -8
+	bdnz	L(nloop)
+	b	L(110)
+
+L(164):
+	subf	r31, r30, r31
+	addi	r10, r10, 1
+	b	L(123)
+L(167):
+	subf	r31, r30, r31
+	addi	r10, r10, 1
+	b	L(51)
+L(165):
+	subf	r31, r30, r31
+	addi	r8, r8, 1
+	b	L(134)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/divrem_2.asm b/third_party/gmp/mpn/powerpc64/mode64/divrem_2.asm
new file mode 100644
index 0000000..752c3d6
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/divrem_2.asm
@@ -0,0 +1,187 @@
+dnl  PPC-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                       cycles/limb
+C                       norm    frac
+C POWER3/PPC630
+C POWER4/PPC970         ?       ?
+C POWER5                37      ?
+C POWER6                62      ?
+C POWER7                30.5    ?
+
+C INPUT PARAMETERS
+C qp  = r3
+C fn  = r4
+C up  = r5
+C un  = r6
+C dp  = r7
+
+
+ifdef(`DARWIN',,`
+define(`r2',`r31')')		C FIXME!
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_2,toc)
+	mflr	r0
+	std	r23, -72(r1)
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	std	r0, 16(r1)
+	stdu	r1, -192(r1)
+	mr	r24, r3
+	mr	r25, r4
+	sldi	r0, r6, 3
+	add	r26, r5, r0
+	addi	r26, r26, -24
+	ld	r30, 8(r7)
+	ld	r28, 0(r7)
+	ld	r29, 16(r26)
+	ld	r31, 8(r26)
+
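+C The first arm of the ifelse block below keeps a plain double-limb
+C compare-and-subtract as disabled reference code; the live second arm
+C extracts the cr7 greater-than bit with mfcr/rlwinm and adds in the
+C low-limb subtraction carry, so a single combined test decides whether
+C to subtract the divisor.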
+ifelse(0,1,`
+	li	r23, 0
+	cmpld	cr7, r29, r30
+	blt	cr7, L(8)
+	bgt	cr7, L(9)
+	cmpld	cr0, r31, r28
+	blt	cr0, L(8)
+L(9):	subfc	r31, r28, r31
+	subfe	r29, r30, r29
+	li	r23, 1
+',`
+	li	r23, 0
+	cmpld	cr7, r29, r30
+	blt	cr7, L(8)
+	mfcr	r0
+	rlwinm	r0, r0, 30, 31, 31
+	subfc	r9, r28, r31
+	addze.	r0, r0
+	nop
+	beq	cr0, L(8)
+	subfc	r31, r28, r31
+	subfe	r29, r30, r29
+	li	r23, 1
+')
+
+L(8):
+	add	r27, r25, r6
+	addic.	r27, r27, -3
+	blt	cr0, L(18)
+	mr	r3, r30
+	CALL(	mpn_invert_limb)
+	mulld	r10, r3, r30
+	mulhdu	r0, r3, r28
+	addc	r8, r10, r28
+	subfe	r11, r1, r1
+	addc	r10, r8, r0
+	addze.	r11, r11
+	blt	cr0, L(91)
+L(40):
+	subfc	r10, r30, r10
+	addme.	r11, r11
+	addi	r3, r3, -1
+	bge	cr0, L(40)
+L(91):
+	addi	r5, r27,  1
+	mtctr	r5
+	sldi	r0, r27, 3
+	add	r24, r24, r0
+	ALIGN(16)
+L(loop):
+	mulhdu	r8, r29, r3
+	mulld	r6, r29, r3
+	addc	r6, r6, r31
+	adde	r8, r8, r29
+	cmpd	cr7, r27, r25
+	mulld	r0, r30, r8
+	mulhdu	r11, r28, r8
+	mulld	r10, r28, r8
+	subf	r31, r0, r31
+	li	r7, 0
+	blt	cr7, L(60)
+	ld	r7, 0(r26)
+	addi	r26, r26, -8
+	nop
+L(60):	subfc	r7, r28, r7
+	subfe	r31, r30, r31
+	subfc	r7, r10, r7
+	subfe	r4, r11, r31
+	subfc	r9, r6, r4
+	subfe	r9, r1, r1
+	andc	r6, r28, r9
+	andc	r0, r30, r9
+	addc	r31, r7, r6
+	adde	r29, r4, r0
+	subf	r8, r9, r8
+	cmpld	cr7, r29, r30
+	bge-	cr7, L(fix)
+L(bck):	std	r8, 0(r24)
+	addi	r24, r24, -8
+	addi	r27, r27, -1
+	bdnz	L(loop)
+L(18):
+	std	r31, 8(r26)
+	std	r29, 16(r26)
+	mr	r3, r23
+	addi	r1, r1, 192
+	ld	r0, 16(r1)
+	mtlr	r0
+	ld	r23, -72(r1)
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+L(fix):
+	mfcr	r0
+	rlwinm	r0, r0, 30, 31, 31
+	subfc	r9, r28, r31
+	addze.	r0, r0
+	beq	cr0, L(bck)
+	subfc	r31, r28, r31
+	subfe	r29, r30, r29
+	addi	r8, r8, 1
+	b	L(bck)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/gcd_11.asm b/third_party/gmp/mpn/powerpc64/mode64/gcd_11.asm
new file mode 100644
index 0000000..f9792e5
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/gcd_11.asm
@@ -0,0 +1,77 @@
+dnl  PowerPC-64 mpn_gcd_11.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/bit (approx)
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 8.5	obsolete
+C POWER5		 ?
+C POWER6		 ?
+C POWER7		 9.4	obsolete
+C POWER8		 ?
+C POWER9		 ?
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+define(`u0',    `r3')
+define(`v0',    `r4')
+
+define(`mask', `r0')dnl
+define(`a1',   `r4')dnl
+define(`a2',   `r5')dnl
+define(`d1',   `r6')dnl
+define(`d2',   `r7')dnl
+define(`cnt',  `r9')dnl
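+C
+C Branch-free binary GCD step: with t = a - d, the value t & -t isolates
+C the lowest set bit, so 63 - cntlzd(t & -t) is the trailing-zero count
+C of the difference; the borrow-derived mask selects min(a,d) as the new
+C d and the shifted absolute difference |a - d| >> cnt as the new odd a.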
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	li	r12, 63
+	mr	r8, v0
+	subf.	r10, u0, v0		C r10 = d - a
+	beq	L(end)
+
+	ALIGN(16)
+L(top):	subfc	r11, r8, r3		C r11 = a - d
+	and	d2, r11, r10
+	subfe	mask, mask, mask
+	cntlzd	cnt, d2
+	and	a1, r10, mask		C d - a
+	andc	a2, r11,  mask		C a - d
+	and	d1, r3, mask		C a
+	andc	d2, r8, mask		C d
+	or	r3, a1, a2		C new a
+	subf	cnt, cnt, r12
+	or	r8, d1, d2		C new d
+	srd	r3, r3, cnt
+	subf.	r10, r3, r8		C r10 = d - a
+	bne	L(top)
+
+L(end):	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/gmp-mparam.h
new file mode 100644
index 0000000..f8305f4
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/gmp-mparam.h
@@ -0,0 +1,82 @@
+/* PowerPC-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1600MHz PPC970 */
+
+/* Generated by tuneup.c, 2009-01-14, gcc 4.0 */
+
+#define MUL_TOOM22_THRESHOLD             14
+#define MUL_TOOM33_THRESHOLD             93
+#define MUL_TOOM44_THRESHOLD            135
+
+#define SQR_BASECASE_THRESHOLD            6
+#define SQR_TOOM2_THRESHOLD              32
+#define SQR_TOOM3_THRESHOLD              74
+#define SQR_TOOM4_THRESHOLD             136
+
+#define MULLO_BASECASE_THRESHOLD          0  /* always */
+#define MULLO_DC_THRESHOLD               44
+#define MULLO_MUL_N_THRESHOLD           234
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 33
+#define POWM_THRESHOLD                   89
+
+#define MATRIX22_STRASSEN_THRESHOLD      15
+#define HGCD_THRESHOLD                   93
+#define GCD_DC_THRESHOLD                237
+#define GCDEXT_DC_THRESHOLD             273
+#define JACOBI_BASE_METHOD                1
+
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1_THRESHOLD                 6
+#define MOD_1_2_THRESHOLD                 9
+#define MOD_1_4_THRESHOLD                23
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  0
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always (native) */
+
+#define GET_STR_DC_THRESHOLD             12
+#define GET_STR_PRECOMPUTE_THRESHOLD     24
+#define SET_STR_DC_THRESHOLD            650
+#define SET_STR_PRECOMPUTE_THRESHOLD   1713
+
+#define MUL_FFT_TABLE  { 336, 672, 1856, 2816, 7168, 20480, 81920, 327680, 0 }
+#define MUL_FFT_MODF_THRESHOLD          304
+#define MUL_FFT_THRESHOLD              4224
+
+#define SQR_FFT_TABLE  { 272, 672, 1600, 2816, 7168, 20480, 81920, 327680, 786432, 0 }
+#define SQR_FFT_MODF_THRESHOLD          272
+#define SQR_FFT_THRESHOLD              2688
diff --git a/third_party/gmp/mpn/powerpc64/mode64/invert_limb.asm b/third_party/gmp/mpn/powerpc64/mode64/invert_limb.asm
new file mode 100644
index 0000000..dfdba64
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/invert_limb.asm
@@ -0,0 +1,88 @@
+dnl  PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Copyright 2004-2006, 2008, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb (approximate)
+C POWER3/PPC630         80
+C POWER4/PPC970         86
+C POWER5                86
+C POWER6               170
+C POWER7                66
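+C
+C This computes floor((B^2 - 1)/d) - B for B = 2^64 and a normalized
+C divisor d (high bit set), refining a table-based reciprocal v0 through
+C the fixed-point steps v1, v2, v3 annotated below.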
+
+ASM_START()
+PROLOGUE(mpn_invert_limb,toc)
+	LEAL(	r12, approx_tab)
+	srdi	r9, r3, 32
+	rlwinm	r9, r9, 10, 23, 30	C (d >> 55) & 0x1fe
+	srdi	r10, r3, 24		C d >> 24
+	lis	r11, 0x1000
+	rldicl	r8, r3, 0, 63		C d mod 2
+	addi	r10, r10, 1		C d40
+	sldi	r11, r11, 32		C 2^60
+	srdi	r7, r3, 1		C d/2
+	add	r7, r7, r8		C d63 = ceil(d/2)
+	neg	r8, r8			C mask = -(d mod 2)
+	lhzx	r0, r9, r12
+	mullw	r9, r0, r0		C v0*v0
+	sldi	r6, r0, 11		C v0 << 11
+	addi	r0, r6, -1		C (v0 << 11) - 1
+	mulld	r9, r9, r10		C v0*v0*d40
+	srdi	r9, r9, 40		C v0*v0*d40 >> 40
+	subf	r9, r9, r0		C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
+	mulld	r0, r9, r10		C v1*d40
+	sldi	r6, r9, 13		C v1 << 13
+	subf	r0, r0, r11		C 2^60 - v1*d40
+	mulld	r0, r0, r9		C v1 * (2^60 - v1*d40)
+	srdi	r0, r0, 47		C v1 * (2^60 - v1*d40) >> 47
+	add	r0, r0, r6		C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
+	mulld	r11, r0, r7		C v2 * d63
+	srdi	r10, r0, 1		C v2 >> 1
+	sldi	r9, r0, 31		C v2 << 31
+	and	r8, r10, r8		C (v2 >> 1) & mask
+	subf	r8, r11, r8		C ((v2 >> 1) & mask) - v2 * d63
+	mulhdu	r0, r8, r0		C p1 = v2 * (((v2 >> 1) & mask) - v2 * d63)
+	srdi	r0, r0, 1		C p1 >> 1
+	add	r0, r0, r9		C v3 = (v2 << 31) + (p1 >> 1)
+	nop
+	mulld	r11, r0, r3
+	mulhdu	r9, r0, r3
+	addc	r10, r11, r3
+	adde	r3, r9, r3
+	subf	r3, r3, r0
+	blr
+EPILOGUE()
+
+DEF_OBJECT(approx_tab)
+forloop(i,256,512-1,dnl
+`	.short	eval(0x7fd00/i)
+')dnl
+END_OBJECT(approx_tab)
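+C approx_tab holds floor(0x7fd00/i) for i = 256..511, an initial reciprocal
+C approximation indexed by the top nine bits of the normalized divisor.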
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/mod_1_1.asm b/third_party/gmp/mpn/powerpc64/mode64/mod_1_1.asm
new file mode 100644
index 0000000..8733730
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/mod_1_1.asm
@@ -0,0 +1,164 @@
+dnl  PowerPC-64 mpn_mod_1_1p
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970         17
+C POWER5                16
+C POWER6                30
+C POWER7                10.2
+
+C TODO
+C  * Optimise, in particular the cps function.  This was compiler-generated and
+C    then hand optimised.
+
+C INPUT PARAMETERS
+define(`ap',  `r3')
+define(`n',   `r4')
+define(`d',   `r5')
+define(`cps', `r6')
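+C
+C The cps array (filled in by mpn_mod_1_1p_cps below) is laid out as
+C cps[0] = inverse of the normalized divisor, cps[1] = shift count,
+C cps[2] = B1modb, cps[3] = B2modb, matching the loads at the top of
+C mpn_mod_1_1p.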
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_mod_1_1p)
+	sldi	r10, r4, 3
+	addi	r4, r4, -1
+	add	r3, r3, r10
+	ld	r0, 16(r6)		C B1modb
+	ld	r12, 24(r6)		C B2modb
+	ld	r9, -8(r3)
+	ld	r10, -16(r3)
+	mtctr	r4
+	mulhdu	r8, r9, r0
+	mulld	r7, r9, r0
+	addc	r11, r7, r10
+	addze	r9, r8
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	ld	r4, -24(r3)
+	addi	r3, r3, -8
+	nop
+	mulld	r10, r11, r0
+	mulld	r8, r9, r12
+	mulhdu	r11, r11, r0
+	mulhdu	r9, r9, r12
+	addc	r7, r10, r4
+	addze	r10, r11
+	addc	r11, r8, r7
+	adde	r9, r9, r10
+	bdnz	L(top)
+
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	lwz	r0, 8(r6)',
+`	lwz	r0, 12(r6)')
+	ld	r3, 0(r6)
+	cmpdi	cr7, r0, 0
+	beq-	cr7, L(4)
+	subfic	r10, r0, 64
+	sld	r9, r9, r0
+	srd	r10, r11, r10
+	or	r9, r10, r9
+L(4):	subfc	r10, r5, r9
+	subfe	r10, r10, r10
+	nand	r10, r10, r10
+	sld	r11, r11, r0
+	and	r10, r10, r5
+	subf	r9, r10, r9
+	mulhdu	r10, r9, r3
+	mulld	r3, r9, r3
+	addi	r9, r9, 1
+	addc	r8, r3, r11
+	adde	r3, r10, r9
+	mulld	r3, r3, r5
+	subf	r3, r3, r11
+	cmpld	cr7, r8, r3
+	bge	cr7, L(5)		C FIXME: Make branch-less
+	add	r3, r3, r5
+L(5):	cmpld	cr7, r3, r5
+	bge-	cr7, L(10)
+	srd	r3, r3, r0
+	blr
+
+L(10):	subf	r3, r5, r3
+	srd	r3, r3, r0
+	blr
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps,toc)
+	mflr	r0
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	cntlzd	r31, r4
+	std	r0, 16(r1)
+	extsw	r31, r31
+	mr	r29, r3
+	stdu	r1, -144(r1)
+	sld	r30, r4, r31
+	mr	r3, r30
+	CALL(	mpn_invert_limb)
+	cmpdi	cr7, r31, 0
+	neg	r0, r30
+	beq-	cr7, L(13)
+	subfic	r11, r31, 64
+	li	r0, 1
+	neg	r9, r30
+	srd	r11, r3, r11
+	sld	r0, r0, r31
+	or	r0, r11, r0
+	mulld	r0, r0, r9
+L(13):	mulhdu	r9, r0, r3
+	mulld	r11, r0, r3
+	add	r9, r0, r9
+	nor	r9, r9, r9
+	mulld	r9, r9, r30
+	cmpld	cr7, r11, r9
+	bge	cr7, L(14)
+	add	r9, r9, r30
+L(14):	addi	r1, r1, 144
+	srd	r0, r0, r31
+	std	r31, 8(r29)
+	std	r3, 0(r29)
+	std	r0, 16(r29)
+	ld	r0, 16(r1)
+	srd	r9, r9, r31
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	std	r9, 24(r29)
+	ld	r29, -24(r1)
+	mtlr	r0
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/mod_1_4.asm b/third_party/gmp/mpn/powerpc64/mode64/mod_1_4.asm
new file mode 100644
index 0000000..0b7d6bf
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/mod_1_4.asm
@@ -0,0 +1,270 @@
+dnl  PowerPC-64 mpn_mod_1s_4p
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          9
+C POWER5                 9
+C POWER6                13
+C POWER7                3.5
+
+C TODO
+C  * Optimise, in particular the cps function.  This was compiler-generated and
+C    then hand optimised.
+
+C INPUT PARAMETERS
+define(`ap',  `r3')
+define(`n',   `r4')
+define(`d',   `r5')
+define(`cps', `r6')
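+C
+C The cps array (filled in by mpn_mod_1s_4p_cps below) holds the inverse
+C of the normalized divisor in cps[0], the shift count in cps[1], and
+C B1modb through B5modb in cps[2] through cps[6], matching the loads
+C below.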
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_mod_1s_4p)
+	std	r23, -72(r1)
+	ld	r23, 48(cps)
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	ld	r24, 32(cps)
+	ld	r25, 24(cps)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	ld	r26, 16(cps)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	ld	r30, 40(cps)
+
+	rldicl.	r0, n, 0,62
+	sldi	r31, n, 3
+	add	ap, ap, r31		C make ap point at end of operand
+
+	cmpdi	cr7, r0, 2
+	beq	cr0, L(b00)
+	blt	cr7, L(b01)
+	beq	cr7, L(b10)
+
+L(b11):	ld	r11, -16(ap)
+	ld	r9, -8(ap)
+	ld	r0, -24(ap)
+	mulhdu	r27, r11, r26
+	mulld	r8, r11, r26
+	mulhdu	r11, r9, r25
+	mulld	r9, r9, r25
+	addc	r31, r8, r0
+	addze	r10, r27
+	addc	r0, r9, r31
+	adde	r9, r11, r10
+	addi	ap, ap, -40
+	b	L(6)
+
+	ALIGN(16)
+L(b00):	ld	r11, -24(ap)
+	ld	r10, -16(ap)
+	ld	r9, -8(ap)
+	ld	r0, -32(ap)
+	mulld	r8, r11, r26
+	mulhdu	r7, r10, r25
+	mulhdu	r27, r11, r26
+	mulhdu	r11, r9, r24
+	mulld	r10, r10, r25
+	mulld	r9, r9, r24
+	addc	r31, r8, r0
+	addze	r0, r27
+	addc	r8, r31, r10
+	adde	r10, r0, r7
+	addc	r0, r9, r8
+	adde	r9, r11, r10
+	addi	ap, ap, -48
+	b	L(6)
+
+	ALIGN(16)
+L(b01):	li	r9, 0
+	ld	r0, -8(ap)
+	addi	ap, ap, -24
+	b	L(6)
+
+	ALIGN(16)
+L(b10):	ld	r9, -8(ap)
+	ld	r0, -16(ap)
+	addi	ap, ap, -32
+
+	ALIGN(16)
+L(6):	addi	r10, n, 3
+	srdi	r7, r10, 2
+	mtctr	r7
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	ld	r31, -16(ap)
+	ld	r10, -8(ap)
+	ld	r11, 8(ap)
+	ld	r12, 0(ap)
+	mulld	r29, r0, r30		C rl * B4modb
+	mulhdu	r0,  r0, r30		C rl * B4modb
+	mulhdu	r27, r10, r26
+	mulld	r10, r10, r26
+	mulhdu	r7, r9, r23		C rh * B5modb
+	mulld	r9, r9, r23		C rh * B5modb
+	mulhdu	r28, r11, r24
+	mulld	r11, r11, r24
+	mulhdu	r4, r12, r25
+	mulld	r12, r12, r25
+	addc	r8, r10, r31
+	addze	r10, r27
+	addi	ap, ap, -32
+	addc	r27, r8, r12
+	adde	r12, r10, r4
+	addc	r11, r27, r11
+	adde	r31, r12, r28
+	addc	r12, r11, r29
+	adde	r4, r31, r0
+	addc	r0, r9, r12
+	adde	r9, r7, r4
+	bdnz	L(top)
+
+L(end):
+ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
+`	lwz	r3, 8(cps)',
+`	lwz	r3, 12(cps)')
+	mulld	r10, r9, r26
+	mulhdu	r9, r9, r26
+	addc	r11, r0, r10
+	addze	r9, r9
+	ld	r10, 0(cps)
+	subfic	r8, r3, 64
+	sld	r9, r9, r3
+	srd	r8, r11, r8
+	sld	r11, r11, r3
+	or	r9, r8, r9
+	mulld	r0, r9, r10
+	mulhdu	r10, r9, r10
+	addi	r9, r9, 1
+	addc	r8, r0, r11
+	adde	r0, r10, r9
+	mulld	r0, r0, d
+	subf	r0, r0, r11
+	cmpld	cr7, r8, r0
+	bge	cr7, L(9)
+	add	r0, r0, d
+L(9):	cmpld	cr7, r0, d
+	bge-	cr7, L(16)
+L(10):	srd	r3, r0, r3
+	ld	r23, -72(r1)
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+
+L(16):	subf	r0, d, r0
+	b	L(10)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,toc)
+	mflr	r0
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	mr	r29, r3
+	std	r0, 16(r1)
+	std	r31, -8(r1)
+	stdu	r1, -144(r1)
+	cntlzd	r31, r4
+	sld	r30, r4, r31
+	mr	r3, r30
+	CALL(	mpn_invert_limb)
+	subfic	r9, r31, 64
+	li	r10, 1
+	sld	r10, r10, r31
+	srd	r9, r3, r9
+	neg	r0, r30
+	or	r10, r10, r9
+	mulld	r10, r10, r0
+	mulhdu	r11, r10, r3
+	nor	r11, r11, r11
+	subf	r11, r10, r11
+	mulld	r11, r11, r30
+	mulld	r0, r10, r3
+	cmpld	cr7, r0, r11
+	bge	cr7, L(18)
+	add	r11, r11, r30
+L(18):	mulhdu	r9, r11, r3
+	add	r9, r11, r9
+	nor	r9, r9, r9
+	mulld	r9, r9, r30
+	mulld	r0, r11, r3
+	cmpld	cr7, r0, r9
+	bge	cr7, L(19)
+	add	r9, r9, r30
+L(19):	mulhdu	r0, r9, r3
+	add	r0, r9, r0
+	nor	r0, r0, r0
+	mulld	r0, r0, r30
+	mulld	r8, r9, r3
+	cmpld	cr7, r8, r0
+	bge	cr7, L(20)
+	add	r0, r0, r30
+L(20):	mulhdu	r8, r0, r3
+	add	r8, r0, r8
+	nor	r8, r8, r8
+	mulld	r8, r8, r30
+	mulld	r7, r0, r3
+	cmpld	cr7, r7, r8
+	bge	cr7, L(21)
+	add	r8, r8, r30
+L(21):	srd	r0, r0, r31
+	addi	r1, r1, 144
+	srd	r8, r8, r31
+	srd	r10, r10, r31
+	srd	r11, r11, r31
+	std	r0, 40(r29)
+	std	r31, 8(r29)
+	srd	r9, r9, r31
+	ld	r0, 16(r1)
+	ld	r30, -16(r1)
+	std	r8, 48(r29)
+	std	r3, 0(r29)
+	mtlr	r0
+	ld	r31, -8(r1)
+	std	r10, 16(r29)
+	std	r11, 24(r29)
+	std	r9, 32(r29)
+	ld	r29, -24(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm b/third_party/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
new file mode 100644
index 0000000..c35e0e3
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/mod_34lsub1.asm
@@ -0,0 +1,132 @@
+dnl  PowerPC-64 mpn_mod_34lsub1 -- modulo 2^48-1.
+
+dnl  Copyright 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          1.33
+C POWER4/PPC970          1.5
+C POWER5                 1.32
+C POWER6                 2.35
+C POWER7                 1
+
+C INPUT PARAMETERS
+define(`up',`r3')
+define(`n',`r4')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	li	r8, 0
+	li	r9, 0
+	li	r10, 0
+	li	r11, 0
+
+	cmpdi	cr6, n, 3
+	blt	cr6, L(lt3)
+
+	li	r0, -0x5556		C 0xFFFFFFFFFFFFAAAA
+	rldimi	r0, r0, 16, 32		C 0xFFFFFFFFAAAAAAAA
+	rldimi	r0, r0, 32, 63		C 0xAAAAAAAAAAAAAAAB
+	mulhdu	r0, r0, n
+	srdi	r0, r0, 1		C r0 = [n / 3]
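+C The multiplier 0xAAAAAAAAAAAAAAAB above is (2^65 + 1)/3, so mulhdu
+C followed by the 1-bit right shift yields floor(n/3) without a divide;
+C informally (a sketch, not from the GMP sources):
+C
+C     q = (uint64_t)(((unsigned __int128) n * 0xAAAAAAAAAAAAAAAB) >> 64) >> 1;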
+	mtctr	r0
+
+	ld	r5, 0(up)
+	ld	r6, 8(up)
+	ld	r7, 16(up)
+	addi	up, up, 24
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	addc	r8, r8, r5
+	nop
+	ld	r5, 0(up)
+	adde	r9, r9, r6
+	ld	r6, 8(up)
+	adde	r10, r10, r7
+	ld	r7, 16(up)
+	addi	up, up, 48
+	addze	r11, r11
+	bdz	L(endx)
+	addc	r8, r8, r5
+	nop
+	ld	r5, -24(up)
+	adde	r9, r9, r6
+	ld	r6, -16(up)
+	adde	r10, r10, r7
+	ld	r7, -8(up)
+	addze	r11, r11
+	bdnz	L(top)
+
+	addi	up, up, 24
+L(endx):
+	addi	up, up, -24
+
+L(end):	addc	r8, r8, r5
+	adde	r9, r9, r6
+	adde	r10, r10, r7
+	addze	r11, r11
+
+	sldi	r5, r0, 1
+	add	r5, r5, r0		C r5 = n / 3 * 3
+	sub	n, n, r5		C n = n mod 3
+L(lt3):	cmpdi	cr6, n, 1
+	blt	cr6, L(2)
+
+	ld	r5, 0(up)
+	addc	r8, r8, r5
+	li	r6, 0
+	beq	cr6, L(1)
+
+	ld	r6, 8(up)
+L(1):	adde	r9, r9, r6
+	addze	r10, r10
+	addze	r11, r11
+
+L(2):	rldicl	r0, r8, 0, 16		C r0 = r8 mod 2^48
+	srdi	r3, r8, 48		C r3 = r8 div 2^48
+	rldic	r4, r9, 16, 16		C r4 = (r9 mod 2^32) << 16
+	srdi	r5, r9, 32		C r5 = r9 div 2^32
+	rldic	r6, r10, 32, 16		C r6 = (r10 mod 2^16) << 32
+	srdi	r7, r10, 16		C r7 = r10 div 2^16
+
+	add	r0, r0, r3
+	add	r4, r4, r5
+	add	r6, r6, r7
+
+	add	r0, r0, r4
+	add	r6, r6, r11
+
+	add	r3, r0, r6
+	blr
+EPILOGUE()
+
+C Accumulator layout (three 64-bit registers viewed as four 48-bit groups):
+C |__r10__|__r9___|__r8___|
+C |-----|-----|-----|-----|
diff --git a/third_party/gmp/mpn/powerpc64/mode64/mode1o.asm b/third_party/gmp/mpn/powerpc64/mode64/mode1o.asm
new file mode 100644
index 0000000..726339a
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/mode1o.asm
@@ -0,0 +1,117 @@
+dnl  PowerPC-64 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630        13-19
+C POWER4/PPC970         16
+C POWER5                16
+C POWER6                 ?
+C POWER7                12
+
+C TODO
+C  * Check if n=1 code is really an improvement.  It probably isn't.
+C  * Make more similar to dive_1.asm.
+
+C INPUT PARAMETERS
+define(`up', `r3')
+define(`n',  `r4')
+define(`d',  `r5')
+define(`cy', `r6')
+
+
+ASM_START()
+
+EXTERN(binvert_limb_table)
+
+PROLOGUE(mpn_modexact_1c_odd,toc)
+	addic.	n, n, -1		C set carry as side effect
+	ld	r8, 0(up)
+	bne	cr0, L(2)
+	cmpld	cr7, r6, r8
+	bge	cr7, L(4)
+	subf	r8, r6, r8
+	divdu	r3, r8, d
+	mulld	r3, r3, d
+	subf.	r3, r3, r8
+	beqlr	cr0
+	subf	r3, r3, d
+	blr
+
+L(4):	subf	r3, r8, r6
+	divdu	r8, r3, d
+	mulld	r8, r8, d
+	subf	r3, r8, r3
+	blr
+
+L(2):	LEA(	r7, binvert_limb_table)
+	rldicl	r9, d, 63, 57
+	mtctr	n
+	lbzx	r0, r7, r9
+	mulld	r7, r0, r0
+	sldi	r0, r0, 1
+	mulld	r7, d, r7
+	subf	r0, r7, r0
+	mulld	r9, r0, r0
+	sldi	r0, r0, 1
+	mulld	r9, d, r9
+	subf	r0, r9, r0
+	mulld	r7, r0, r0
+	sldi	r0, r0, 1
+	mulld	r7, d, r7
+	subf	r9, r7, r0
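+C r9 now holds 1/d mod 2^64, obtained by the same Newton iteration
+C v -> 2*v - v*v*d used in dive_1.asm (8 -> 16 -> 32 -> 64 correct bits).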
+
+	ALIGN(16)
+L(loop):
+	subfe	r0, r6, r8
+	ld	r8, 8(up)
+	addi	up, up, 8
+	mulld	r0, r9, r0
+	mulhdu	r6, r0, d
+	bdnz	L(loop)
+
+	cmpld	cr7, d, r8
+	blt	cr7, L(10)
+
+	subfe	r0, r0, r0
+	subf	r6, r0, r6
+	cmpld	cr7, r6, r8
+	subf	r3, r8, r6
+	bgelr	cr7
+	add	r3, d, r3
+	blr
+
+L(10):	subfe	r0, r6, r8
+	mulld	r0, r9, r0
+	mulhdu	r3, r0, d
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/mul_1.asm b/third_party/gmp/mpn/powerpc64/mode64/mul_1.asm
new file mode 100644
index 0000000..27a8f8f
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/mul_1.asm
@@ -0,0 +1,168 @@
+dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               cycles/limb
+C POWER3/PPC630     6-18
+C POWER4/PPC970     7.25?  not updated for last file revision
+C POWER5            7.25
+C POWER6           14
+C POWER7            2.9
+
+C TODO
+C  * Try to reduce the number of needed live registers (at least r5 and r10
+C    could be combined)
+C  * Optimize feed-in code, for speed and size.
+C  * Clean up r12/r7 usage in feed-in code.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`vl', `r6')
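+C
+C mpn_mul_1 computes {rp,n} = {up,n} * vl and returns the high limb;
+C mpn_mul_1c also adds the carry-in limb passed in r7.  Informally (a C
+C sketch, not from the GMP sources):
+C
+C     mp_limb_t cy = 0;                 /* or the r7 argument for mul_1c */
+C     for (i = 0; i < n; i++) {
+C         unsigned __int128 p = (unsigned __int128) up[i] * vl + cy;
+C         rp[i] = (mp_limb_t) p;
+C         cy = (mp_limb_t) (p >> 64);
+C     }
+C     return cy;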
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	mr	r12, r7
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	li	r12, 0		C cy_limb = 0
+L(ent):	ld	r26, 0(up)
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addic	n, n, 3		C compute count...
+	srdi	n, n, 2		C ...for ctr
+	mtctr	n		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	mr	r7, r12
+	mulld	r0, r26, r6
+	mulhdu	r12, r26, r6
+	addi	up, up, 8
+	addc	r0, r0, r7
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	b	L(fic)
+
+L(b00):	ld	r27, 8(up)
+	addi	up, up, 16
+	mulld	r0, r26, r6
+	mulhdu	r5, r26, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	addc	r0, r0, r12
+	adde	r7, r7, r5
+	addze	r12, r8
+	std	r0, 0(rp)
+	std	r7, 8(rp)
+	addi	rp, rp, 16
+	b	L(fic)
+
+	nop			C alignment
+L(b01):	bdnz	L(gt1)
+	mulld	r0, r26, r6
+	mulhdu	r8, r26, r6
+	addc	r0, r0, r12
+	std	r0, 0(rp)
+	b	L(ret)
+L(gt1):	ld	r27, 8(up)
+	nop
+	mulld	r0, r26, r6
+	mulhdu	r5, r26, r6
+	ld	r26, 16(up)
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	mulld	r9, r26, r6
+	mulhdu	r10, r26, r6
+	addc	r0, r0, r12
+	adde	r7, r7, r5
+	adde	r9, r9, r8
+	addze	r12, r10
+	std	r0, 0(rp)
+	std	r7, 8(rp)
+	std	r9, 16(rp)
+	addi	up, up, 24
+	addi	rp, rp, 24
+	b	L(fic)
+
+	nop
+L(fic):	ld	r26, 0(up)
+L(b10):	ld	r27, 8(up)
+	addi	up, up, 16
+	bdz	L(end)
+
+L(top):	mulld	r0, r26, r6
+	mulhdu	r5, r26, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r26, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r5
+	mulld	r9, r26, r6
+	mulhdu	r10, r26, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r26, 16(up)
+	ld	r27, 24(up)
+	std	r0, 0(rp)
+	adde	r9, r9, r8
+	std	r7, 8(rp)
+	adde	r11, r11, r10
+	std	r9, 16(rp)
+	addi	up, up, 32
+	std	r11, 24(rp)
+
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	mulld	r0, r26, r6
+	mulhdu	r5, r26, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r5
+	std	r0, 0(rp)
+	std	r7, 8(rp)
+L(ret):	addze	r3, r8
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/mul_basecase.asm b/third_party/gmp/mpn/powerpc64/mode64/mul_basecase.asm
new file mode 100644
index 0000000..1873187
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/mul_basecase.asm
@@ -0,0 +1,708 @@
+dnl  PowerPC-64 mpn_mul_basecase.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630         6-18
+C POWER4/PPC970          8
+C POWER5                 8
+C POWER6                24
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0',	   `r25')
+define(`outer_rp', `r22')
+define(`outer_up', `r23')
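+
+C For reference, a C sketch of the schoolbook product computed below
+C (semantics only, not part of the build; it assumes the usual
+C mpn_mul_basecase contract un >= vn >= 1, with the un+vn limb result
+C written to rp):
+C
+C   void ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
+C                          const mp_limb_t *vp, mp_size_t vn)
+C   {
+C     rp[un] = mpn_mul_1 (rp, up, un, vp[0]);    /* first row only stores */
+C     for (mp_size_t i = 1; i < vn; i++)         /* later rows accumulate  */
+C       rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
+C   }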
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+
+C Special code for un <= 2, for efficiency of these important cases,
+C and since it simplifies the default code.
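+C With B = 2^64, the 2x2 case expands as
+C   u*v = u0*v0 + (u0*v1 + u1*v0)*B + u1*v1*B^2,
+C and the "weight k" annotations below mark which B^k coefficient each
+C partial product contributes to.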
+	cmpdi	cr0, un, 2
+	bgt	cr0, L(un_gt2)
+	cmpdi	cr6, vn, 1
+	ld	r7, 0(vp)
+	ld	r5, 0(up)
+	mulld	r8, r5, r7	C weight 0
+	mulhdu	r9, r5, r7	C weight 1
+	std	r8, 0(rp)
+	beq	cr0, L(2x)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+L(2x):	ld	r0, 8(up)
+	mulld	r8, r0, r7	C weight 1
+	mulhdu	r10, r0, r7	C weight 2
+	addc	r9, r9, r8
+	addze	r10, r10
+	bne	cr6, L(2x2)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	blr
+	ALIGN(16)
+L(2x2):	ld	r6, 8(vp)
+	nop
+	mulld	r8, r5, r6	C weight 1
+	mulhdu	r11, r5, r6	C weight 2
+	addc	r9, r9, r8
+	std	r9, 8(rp)
+	adde	r11, r11, r10
+	mulld	r12, r0, r6	C weight 2
+	mulhdu	r0, r0, r6	C weight 3
+	addze	r0, r0
+	addc	r11, r11, r12
+	addze	r0, r0
+	std	r11, 16(rp)
+	std	r0, 24(rp)
+	blr
+
+L(un_gt2):
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+	std	r22, -80(r1)
+
+	mr	outer_rp, rp
+	mr	outer_up, up
+
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, 0(up)
+
+	rldicl.	r0, un, 0,62	C r0 = un & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	un, un, 1	C compute count...
+	srdi	un, un, 2	C ...for ctr
+	mtctr	un		C copy inner loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
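+
+C As in mul_1, r0 = un & 3 picks a feed-in path; each path disposes of
+C some leading limbs itself, so ctr gets (un + 1) >> 2 iterations for
+C its 4-way unrolled inner loop.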
+
+
+	ALIGN(16)
+L(b3):	mulld	r0, r26, v0
+	mulhdu	r12, r26, v0
+	addic	r0, r0, 0
+	std	r0, 0(rp)
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	bdz	L(end_m_3)
+
+	ALIGN(16)
+L(lo_m_3):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	ld	r26, 24(up)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r27, 32(up)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	mulld	r9, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r26, 40(up)
+	nop
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	ld	r27, 48(up)
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r10
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(lo_m_3)
+
+	ALIGN(16)
+L(end_m_3):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+
+	std	r0, 8(rp)
+	std	r24, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addic.	vn, vn, -1
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_3):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 8
+	mr	up, outer_up
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, 0(up)
+	ld	r28, 0(rp)
+	mulld	r0, r26, v0
+	mulhdu	r12, r26, v0
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	bdz	L(end_3)
+
+	ALIGN(16)		C registers dying
+L(lo_3):
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 24(up)	C
+	ld	r28, 8(rp)	C
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	ld	r27, 32(up)	C
+	ld	r29, 16(rp)	C
+	adde	r0, r0, r12	C 0 12
+	adde	r24, r24, r10	C 24 10
+	mulld	r9, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 40(up)	C
+	ld	r30, 24(rp)	C
+	mulld	r11, r27, v0	C
+	mulhdu	r12, r27, v0	C 27
+	ld	r27, 48(up)	C
+	ld	r31, 32(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 8(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, 16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, 24(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 32(rp)	C 11
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	bdnz	L(lo_3)	C
+
+	ALIGN(16)
+L(end_3):
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r28, 8(rp)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r29, 16(rp)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r10
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 8(rp)
+	adde	r24, r24, r29
+	std	r24, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+
+	addic.	vn, vn, -1
+	bne	L(outer_lo_3)
+	b	L(ret)
+
+
+	ALIGN(16)
+L(b0):	ld	r27, 8(up)
+	addi	up, up, 8
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	addc	r24, r24, r10
+	addze	r12, r8
+	std	r0, 0(rp)
+	std	r24, 8(rp)
+	addi	rp, rp, 8
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	bdz	L(end_m_0)
+
+	ALIGN(16)
+L(lo_m_0):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	ld	r26, 24(up)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r27, 32(up)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	mulld	r9, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r26, 40(up)
+	nop
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	ld	r27, 48(up)
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r10
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(lo_m_0)
+
+	ALIGN(16)
+L(end_m_0):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+
+	std	r0, 8(rp)
+	addze	r8, r8
+	std	r24, 16(rp)
+	addic.	vn, vn, -1
+	std	r8, 24(rp)
+	nop
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_0):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 16
+	addi	up, outer_up, 8
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, -8(up)
+	ld	r27, 0(up)
+	ld	r28, -8(rp)
+	ld	r29, 0(rp)
+	nop
+	nop
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	addc	r24, r24, r10
+	addze	r12, r8
+	addc	r0, r0, r28
+	std	r0, -8(rp)
+	adde	r24, r24, r29
+	std	r24, 0(rp)
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	bdz	L(end_0)
+
+	ALIGN(16)		C registers dying
+L(lo_0):
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 24(up)	C
+	ld	r28, 8(rp)	C
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	ld	r27, 32(up)	C
+	ld	r29, 16(rp)	C
+	adde	r0, r0, r12	C 0 12
+	adde	r24, r24, r10	C 24 10
+	mulld	r9, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 40(up)	C
+	ld	r30, 24(rp)	C
+	mulld	r11, r27, v0	C
+	mulhdu	r12, r27, v0	C 27
+	ld	r27, 48(up)	C
+	ld	r31, 32(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 8(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, 16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, 24(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 32(rp)	C 11
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	bdnz	L(lo_0)	C
+
+	ALIGN(16)
+L(end_0):
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r28, 8(rp)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r29, 16(rp)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r10
+	addze	r8, r8
+	addic.	vn, vn, -1
+	addc	r0, r0, r28
+	std	r0, 8(rp)
+	adde	r24, r24, r29
+	std	r24, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	bne	L(outer_lo_0)
+	b	L(ret)
+
+
+	ALIGN(16)
+L(b1):	ld	r27, 8(up)
+	nop
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	ld	r26, 16(up)
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r26, v0
+	mulhdu	r10, r26, v0
+	addc	r24, r24, r31
+	adde	r9, r9, r8
+	addze	r12, r10
+	std	r0, 0(rp)
+	std	r24, 8(rp)
+	std	r9, 16(rp)
+	addi	up, up, 16
+	addi	rp, rp, 16
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	bdz	L(end_m_1)
+
+	ALIGN(16)
+L(lo_m_1):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	ld	r26, 24(up)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r27, 32(up)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	mulld	r9, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r26, 40(up)
+	nop
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	ld	r27, 48(up)
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r10
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(lo_m_1)
+
+	ALIGN(16)
+L(end_m_1):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+
+	std	r0, 8(rp)
+	addze	r8, r8
+	std	r24, 16(rp)
+	addic.	vn, vn, -1
+	std	r8, 24(rp)
+	nop
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_1):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 24
+	addi	up, outer_up, 16
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, -16(up)
+	ld	r27, -8(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	ld	r26, 0(up)
+	ld	r28, -16(rp)
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r29, -8(rp)
+	ld	r30, 0(rp)
+	mulld	r9, r26, v0
+	mulhdu	r10, r26, v0
+	addc	r24, r24, r31
+	adde	r9, r9, r8
+	addze	r12, r10
+	addc	r0, r0, r28
+	std	r0, -16(rp)
+	adde	r24, r24, r29
+	std	r24, -8(rp)
+	adde	r9, r9, r30
+	std	r9, 0(rp)
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	bdz	L(end_1)
+
+	ALIGN(16)		C registers dying
+L(lo_1):
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 24(up)	C
+	ld	r28, 8(rp)	C
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	ld	r27, 32(up)	C
+	ld	r29, 16(rp)	C
+	adde	r0, r0, r12	C 0 12
+	adde	r24, r24, r10	C 24 10
+	mulld	r9, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 40(up)	C
+	ld	r30, 24(rp)	C
+	mulld	r11, r27, v0	C
+	mulhdu	r12, r27, v0	C 27
+	ld	r27, 48(up)	C
+	ld	r31, 32(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 8(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, 16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, 24(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 32(rp)	C 11
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	bdnz	L(lo_1)	C
+
+	ALIGN(16)
+L(end_1):
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r28, 8(rp)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r29, 16(rp)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r10
+	addze	r8, r8
+	addic.	vn, vn, -1
+	addc	r0, r0, r28
+	std	r0, 8(rp)
+	adde	r24, r24, r29
+	std	r24, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	bne	L(outer_lo_1)
+	b	L(ret)
+
+
+	ALIGN(16)
+L(b2):	ld	r27, 8(up)
+	addi	up, up, -8
+	addi	rp, rp, -8
+	li	r12, 0
+	addic	r12, r12, 0
+
+	ALIGN(16)
+L(lo_m_2):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	ld	r26, 24(up)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r27, 32(up)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	mulld	r9, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r26, 40(up)
+	nop
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	ld	r27, 48(up)
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r10
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+
+	addi	rp, rp, 32
+	bdnz	L(lo_m_2)
+
+	ALIGN(16)
+L(end_m_2):
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+
+	std	r0, 8(rp)
+	addze	r8, r8
+	std	r24, 16(rp)
+	addic.	vn, vn, -1
+	std	r8, 24(rp)
+	nop
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_2):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 0
+	addi	up, outer_up, -8
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	li	r12, 0
+	addic	r12, r12, 0
+
+	ALIGN(16)		C registers dying
+L(lo_2):
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 24(up)	C
+	ld	r28, 8(rp)	C
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	ld	r27, 32(up)	C
+	ld	r29, 16(rp)	C
+	adde	r0, r0, r12	C 0 12
+	adde	r24, r24, r10	C 24 10
+	mulld	r9, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	ld	r26, 40(up)	C
+	ld	r30, 24(rp)	C
+	mulld	r11, r27, v0	C
+	mulhdu	r12, r27, v0	C 27
+	ld	r27, 48(up)	C
+	ld	r31, 32(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 8(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, 16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, 24(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 32(rp)	C 11
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	bdnz	L(lo_2)	C
+
+	ALIGN(16)
+L(end_2):
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	ld	r28, 8(rp)
+	nop
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	ld	r29, 16(rp)
+	nop
+	adde	r0, r0, r12
+	adde	r24, r24, r10
+	addze	r8, r8
+	addic.	vn, vn, -1
+	addc	r0, r0, r28
+	std	r0, 8(rp)
+	adde	r24, r24, r29
+	std	r24, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	bne	L(outer_lo_2)
+	b	L(ret)
+
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	ld	r25, -56(r1)
+	ld	r24, -64(r1)
+	ld	r23, -72(r1)
+	ld	r22, -80(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h
new file mode 100644
index 0000000..61a437b
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p3/gmp-mparam.h
@@ -0,0 +1,179 @@
+/* POWER3/PowerPC630 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     17
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                10
+#define MUL_TOOM33_THRESHOLD                33
+#define MUL_TOOM44_THRESHOLD                46
+#define MUL_TOOM6H_THRESHOLD                77
+#define MUL_TOOM8H_THRESHOLD               139
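+
+/* Schematically, these cutoffs gate GMP's choice of multiplication
+   algorithm for n-limb operands (a sketch of the selection logic, not
+   the library's actual dispatch code):
+
+     if      (n < MUL_TOOM22_THRESHOLD)  schoolbook basecase
+     else if (n < MUL_TOOM33_THRESHOLD)  Toom-22 (Karatsuba)
+     else if (n < MUL_TOOM44_THRESHOLD)  Toom-33
+     ...
+     else if (n < MUL_FFT_THRESHOLD)     higher Toom variants
+     else                                FFT multiplication  */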
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      47
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      49
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      49
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      34
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 14
+#define SQR_TOOM3_THRESHOLD                 45
+#define SQR_TOOM4_THRESHOLD                 64
+#define SQR_TOOM6_THRESHOLD                 85
+#define SQR_TOOM8_THRESHOLD                139
+
+#define MULMID_TOOM42_THRESHOLD             22
+
+#define MULMOD_BNM1_THRESHOLD                8
+#define SQRMOD_BNM1_THRESHOLD               10
+
+#define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    220, 5}, {      9, 6}, {      5, 5}, {     11, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {     13, 8}, \
+    {      7, 7}, {     15, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     23,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     23,10}, {     15, 9}, \
+    {     35, 8}, {     71,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     79,10}, {     55,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     71, 9}, {    143, 8}, {    287,10}, \
+    {     79,11}, {     47,10}, {     95, 9}, {    191,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287,11}, {     79,10}, \
+    {    159, 9}, {    319, 8}, {    639,10}, {    175, 9}, \
+    {    351,11}, {     95,10}, {    191, 9}, {    383,11}, \
+    {    111,10}, {    223,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    319, 9}, {    639,11}, \
+    {    175,10}, {    351,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    223,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    287,10}, \
+    {    575, 9}, {   1151,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    767,12}, {    223,11}, {    447,10}, {    895,13}, \
+    {    127,12}, {    255,11}, {    511,12}, {    287,11}, \
+    {    575,10}, {   1151,12}, {    319,11}, {    639,12}, \
+    {    351,11}, {    703,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447,11}, {    895,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 120
+#define MUL_FFT_THRESHOLD                 2688
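+
+/* Each {n,k} pair in MUL_FFT_TABLE3 above means: from operand sizes of
+   about n limbs upward, split the FFT into 2^k pieces.  (A description
+   of the tuned-table format, not code from this file.) */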
+
+#define SQR_FFT_MODF_THRESHOLD             188  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    188, 5}, {      9, 6}, {      5, 5}, {     11, 6}, \
+    {     13, 7}, {     13, 8}, {      7, 7}, {     16, 8}, \
+    {      9, 7}, {     19, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     23,10}, {      7, 9}, \
+    {     15, 8}, {     31, 9}, {     19, 8}, {     39, 9}, \
+    {     23,10}, {     15, 9}, {     39,10}, {     23,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79, 8}, {    159,10}, {     47, 9}, {     95, 8}, \
+    {    191,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287,10}, \
+    {     79, 9}, {    159,11}, {     47,10}, {     95, 9}, \
+    {    191,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    511,10}, {    143, 9}, {    287,11}, \
+    {     79,10}, {    159, 9}, {    319, 8}, {    639,10}, \
+    {    175,11}, {     95,10}, {    191, 9}, {    383,11}, \
+    {    111,10}, {    223,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    319, 9}, {    639,11}, \
+    {    175,12}, {     95,11}, {    191,10}, {    383, 9}, \
+    {    767,11}, {    223,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    287,10}, {    575,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    767,12}, {    223,11}, \
+    {    447,10}, {    895,13}, {    127,12}, {    255,11}, \
+    {    511,12}, {    287,11}, {    575,10}, {   1151,12}, \
+    {    319,11}, {    639,12}, {    351,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    447,11}, {    895,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 118
+#define SQR_FFT_THRESHOLD                 1728
+
+#define MULLO_BASECASE_THRESHOLD             2
+#define MULLO_DC_THRESHOLD                  27
+#define MULLO_MUL_N_THRESHOLD             2511
+
+#define DC_DIV_QR_THRESHOLD                 23
+#define DC_DIVAPPR_Q_THRESHOLD              87
+#define DC_BDIV_QR_THRESHOLD                27
+#define DC_BDIV_Q_THRESHOLD                 60
+
+#define INV_MULMOD_BNM1_THRESHOLD           27
+#define INV_NEWTON_THRESHOLD                91
+#define INV_APPR_THRESHOLD                  91
+
+#define BINV_NEWTON_THRESHOLD              115
+#define REDC_1_TO_REDC_N_THRESHOLD          31
+
+#define MU_DIV_QR_THRESHOLD                551
+#define MU_DIVAPPR_Q_THRESHOLD             551
+#define MUPI_DIV_QR_THRESHOLD               42
+#define MU_BDIV_QR_THRESHOLD               483
+#define MU_BDIV_Q_THRESHOLD                492
+
+#define POWM_SEC_TABLE  2,23,140,556,713,746
+
+#define MATRIX22_STRASSEN_THRESHOLD          8
+#define HGCD_THRESHOLD                      56
+#define HGCD_APPR_THRESHOLD                 51
+#define HGCD_REDUCE_THRESHOLD              688
+#define GCD_DC_THRESHOLD                   333
+#define GCDEXT_DC_THRESHOLD                126
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                17
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               375
+#define SET_STR_PRECOMPUTE_THRESHOLD       812
+
+#define FAC_DSC_THRESHOLD                  351
+#define FAC_ODD_THRESHOLD                    0  /* always */
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h
new file mode 100644
index 0000000..3c40fb9
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p4/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* POWER4/PowerPC970 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2008-2010, 2014, 2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1800 MHz PPC970 */
+/* FFT tuning limit = 15 M */
+/* Generated by tuneup.c, 2015-10-09, gcc 4.0 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     16
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           35
+
+#define DIV_1_VS_MUL_1_PERCENT             218
+
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                53
+#define MUL_TOOM44_THRESHOLD               136
+#define MUL_TOOM6H_THRESHOLD               197
+#define MUL_TOOM8H_THRESHOLD               272
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      90
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      76
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 22
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                430
+
+#define MULMID_TOOM42_THRESHOLD             34
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    444, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     13, 5}, {     28, 6}, \
+    {     19, 7}, {     10, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     55,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     83,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     63, 9}, {    127,10}, {     87,11}, \
+    {     47,10}, {    103,12}, {     31,11}, {     63,10}, \
+    {    135, 9}, {    271,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    167,11}, {     95, 9}, {    383, 8}, \
+    {    767,10}, {    199,11}, {    111,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    143,10}, {    287, 9}, {    575,10}, \
+    {    303, 9}, {    607,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,12}, \
+    {     95,10}, {    383, 9}, {    767,10}, {    415, 9}, \
+    {    831,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,10}, {    607,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    335,10}, {    671,11}, \
+    {    351,10}, {    703,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,12}, {    223,10}, {    895,13}, \
+    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    895,14}, {    127,13}, {    255,12}, {    607,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1023,13}, {    575,12}, \
+    {   1151,13}, {    703,14}, {    383,13}, {    895,15}, \
+    {    255,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2175,13}, {   1151,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,14}, {    767,13}, \
+    {   1663,14}, {    895,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD                 9088
+
+#define SQR_FFT_MODF_THRESHOLD             344  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    344, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     13, 5}, {     28, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     14, 6}, \
+    {     29, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95, 9}, {    191,10}, {    103,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    135, 9}, {    271, 8}, {    543,11}, \
+    {     79, 9}, {    319, 8}, {    639,11}, {     95,10}, \
+    {    191, 9}, {    383, 8}, {    767,10}, {    207, 9}, \
+    {    415,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,10}, {    303, 9}, {    607,10}, {    319, 9}, \
+    {    639,10}, {    335,11}, {    175,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    303,10}, \
+    {    607,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,12}, \
+    {    223,10}, {    895,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    607,12}, \
+    {    319,11}, {    671,12}, {    351,11}, {    703,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,10}, {   1663,11}, {    895,12}, {    479,14}, \
+    {    127,13}, {    255,12}, {    607,13}, {    319,12}, \
+    {    703,13}, {    383,12}, {    831,11}, {   1663,12}, \
+    {    927,14}, {    255,13}, {    511,12}, {   1023,13}, \
+    {    575,12}, {   1151,13}, {    639,12}, {   1279,13}, \
+    {    703,14}, {    383,13}, {    895,15}, {    255,14}, \
+    {    511,13}, {   1023,12}, {   2175,13}, {   1151,12}, \
+    {   2303,13}, {   1215,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1407,14}, {    767,13}, {   1663,14}, \
+    {    895,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 174
+#define SQR_FFT_THRESHOLD                 6272
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  43
+#define MULLO_MUL_N_THRESHOLD            18087
+#define SQRLO_BASECASE_THRESHOLD             2
+#define SQRLO_DC_THRESHOLD                  79
+#define SQRLO_SQR_THRESHOLD              12322
+
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             159
+#define DC_BDIV_QR_THRESHOLD                46
+#define DC_BDIV_Q_THRESHOLD                110
+
+#define INV_MULMOD_BNM1_THRESHOLD           26
+#define INV_NEWTON_THRESHOLD               177
+#define INV_APPR_THRESHOLD                 165
+
+#define BINV_NEWTON_THRESHOLD              198
+#define REDC_1_TO_REDC_N_THRESHOLD          56
+
+#define MU_DIV_QR_THRESHOLD               1017
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               90
+#define MU_BDIV_QR_THRESHOLD               924
+#define MU_BDIV_Q_THRESHOLD               1017
+
+#define POWM_SEC_TABLE  7,17,86,579,1925
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        23
+#define SET_STR_DC_THRESHOLD               788
+#define SET_STR_PRECOMPUTE_THRESHOLD      1713
+
+#define FAC_DSC_THRESHOLD                  512
+#define FAC_ODD_THRESHOLD                   25
+
+#define MATRIX22_STRASSEN_THRESHOLD         10
+#define HGCD_THRESHOLD                     113
+#define HGCD_APPR_THRESHOLD                115
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   330
+#define GCDEXT_DC_THRESHOLD                242
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h
new file mode 100644
index 0000000..15b009c
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p5/gmp-mparam.h
@@ -0,0 +1,219 @@
+/* POWER5 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* POWER5 (friggms.hpc.ntnu.no) */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           40
+
+#define MUL_TOOM22_THRESHOLD                21
+#define MUL_TOOM33_THRESHOLD                24
+#define MUL_TOOM44_THRESHOLD                70
+#define MUL_TOOM6H_THRESHOLD               262
+#define MUL_TOOM8H_THRESHOLD               393
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      49
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     126
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      70
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 81
+#define SQR_TOOM4_THRESHOLD                142
+#define SQR_TOOM6_THRESHOLD                189
+#define SQR_TOOM8_THRESHOLD                284
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             304  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    348, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    135,11}, {     79,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    319,12}, \
+    {     95,11}, {    191,10}, {    383,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575, 9}, {   1151,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,12}, \
+    {    287,11}, {    575,10}, {   1151,12}, {    319,11}, \
+    {    639,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    639,11}, {   1279,12}, {    671,11}, \
+    {   1343,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,13}, {    447,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,10}, {   4863,13}, {    639,12}, {   1343,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1663,13}, {    959,12}, \
+    {   1919,11}, {   3839,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,12}, {   2431,11}, \
+    {   4863,14}, {    639,13}, {   1343,12}, {   2687,13}, \
+    {   1407,12}, {   2815,13}, {   1471,12}, {   2943,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1663,14}, \
+    {    895,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2943,15}, {    767,14}, {   1535,13}, {   3199,14}, \
+    {   1663,13}, {   3327,14}, {   1919,13}, {   3839,16}, \
+    {    511,15}, {   1023,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,12}, {  11775,15}, {   1535,14}, \
+    {   3327,15}, {   1791,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 208
+#define MUL_FFT_THRESHOLD                 4224
+
+#define SQR_FFT_MODF_THRESHOLD             284  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    272, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {     19, 7}, {     17, 8}, {      9, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     63,10}, {     47,11}, \
+    {     31,10}, {     71, 9}, {    143,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    143,11}, {     79,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    143,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319,11}, {    175,10}, {    351,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,10}, {    415,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,11}, {    447,10}, {    895,11}, \
+    {    479,10}, {    959,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,12}, {    287,11}, {    575,12}, \
+    {    319,11}, {    639,12}, {    351,11}, {    703,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    447,11}, {    895,12}, {    479,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    543,11}, {   1087,12}, {    575,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    831,12}, {   1663,13}, \
+    {    959,12}, {   1919,15}, {    255,14}, {    511,13}, \
+    {   1023,12}, {   2047,13}, {   1087,12}, {   2175,13}, \
+    {   1215,14}, {    639,13}, {   1407,12}, {   2815,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,12}, {   4863,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1663,13}, {   3327,14}, {   1919,13}, \
+    {   3839,16}, {    511,15}, {   1023,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,12}, \
+    {  11775,15}, {   1535,14}, {   3327,15}, {   1791,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 190
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             6
+#define MULLO_DC_THRESHOLD                  60
+#define MULLO_MUL_N_THRESHOLD             7463
+
+#define DC_DIV_QR_THRESHOLD                 58
+#define DC_DIVAPPR_Q_THRESHOLD             232
+#define DC_BDIV_QR_THRESHOLD                78
+#define DC_BDIV_Q_THRESHOLD                238
+
+#define INV_MULMOD_BNM1_THRESHOLD           92
+#define INV_NEWTON_THRESHOLD               155
+#define INV_APPR_THRESHOLD                 157
+
+#define BINV_NEWTON_THRESHOLD              155
+#define REDC_1_TO_REDC_N_THRESHOLD          61
+
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               79
+#define MU_BDIV_QR_THRESHOLD               823
+#define MU_BDIV_Q_THRESHOLD                942
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD_THRESHOLD                      74
+#define HGCD_APPR_THRESHOLD                155
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   351
+#define GCDEXT_DC_THRESHOLD                288
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               650
+#define SET_STR_PRECOMPUTE_THRESHOLD      1585
+
+#define FAC_DSC_THRESHOLD                  662
+#define FAC_ODD_THRESHOLD                   28
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm b/third_party/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm
new file mode 100644
index 0000000..c572b91
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p6/aorsmul_1.asm
@@ -0,0 +1,185 @@
+dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C               mpn_addmul_1    mpn_submul_1
+C               cycles/limb     cycles/limb
+C POWER3/PPC630     ?               ?
+C POWER4/PPC970     ?               ?
+C POWER5            ?               ?
+C POWER6           12.25           12.8
+C POWER7            ?               ?
+
+C TODO
+C  * Reduce register usage.
+C  * Schedule function entry code.
+C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C    would bring us to 9 c/l.
+C  * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`v0',  `r6')
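+
+C For reference, a C sketch of the two operations assembled from this
+C source (GMP's documented semantics; ref_addmul_1 is a hypothetical
+C name, not part of the build):
+C
+C   mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up,
+C                           mp_size_t n, mp_limb_t v0)
+C   {
+C     mp_limb_t cy = 0;
+C     for (mp_size_t i = 0; i < n; i++)
+C       {
+C         /* rp[i] += up[i] * v0, propagating the carry limb cy */
+C         unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
+C         rp[i] = (mp_limb_t) t;
+C         cy = (mp_limb_t) (t >> 64);
+C       }
+C     return cy;
+C   }
+C
+C mpn_submul_1 follows the same recurrence with rp[i] -= up[i]*v0, the
+C returned limb being the borrow instead of the carry.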
+
+ifdef(`OPERATION_addmul_1',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_addmul_1)
+  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
+  define(AM,		`$1')
+  define(SM,		`')
+  define(CLRRSC,	`addic	$1, r0, 0')
+')
+ifdef(`OPERATION_submul_1',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_submul_1)
+  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
+  define(AM,		`')
+  define(SM,		`$1')
+  define(CLRRSC,	`subfc	$1, r0, r0')
+')
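+
+C CLRRSC(x) zeroes x while forcing the carry bit to each variant's
+C identity: on the path where it is used, r0 = n & 3 is known to be 0,
+C so addic gives 0 with CA clear (addmul), while subfc computes 0 - 0
+C and sets CA (PowerPC uses CA = 1 for "no borrow"), as submul needs.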
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 3		C compute count...
+	srdi	n, n, 2		C ...for ctr
+	mtctr	n		C copy loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
+
+L(b3):	ld	r8, 0(up)
+	ld	r7, 8(up)
+	ld	r27, 16(up)
+	addi	up, up, 16
+	addi	rp, rp, 16
+	mulld	r5,  r8, v0
+	mulhdu	r8,  r8, v0
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r29, -16(rp)
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	addc	r9, r9, r8
+	adde	r11, r11, r7
+	addze	r12, r27
+	ADDSUB	r5, r5, r29
+	b	L(l3)
+
+L(b2):	ld	r7, 0(up)
+	ld	r27, 8(up)
+	addi	up, up, 8
+	addi	rp, rp, 8
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	addc	r11, r11, r7
+	addze	r12, r27
+	ADDSUB	r9, r9, r30
+	b	L(l2)
+
+L(b1):	ld	r27, 0(up)
+	ld	r31, 0(rp)
+	mulld	r11, r27, v0
+	mulhdu	r12, r27, v0
+	ADDSUB	r11, r11, r31
+	b	L(l1)
+
+L(b0):	addi	up, up, -8
+	addi	rp, rp, -8
+	CLRRSC(	r12)		C clear r12 and clr/set cy
+
+	ALIGN(32)
+L(top):
+SM(`	subfe	r11, r0, r0')	C complement...
+SM(`	addic	r11, r11, 1')	C ...carry flag
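+C (subfe r11, r0, r0 computes r0 + ~r0 + CA = CA - 1, i.e. 0 if CA was
+C set and -1 if clear; addic r11, r11, 1 then carries out exactly when
+C r11 was -1, so together they complement CA for the next pass.)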
+	ld	r10, 8(up)
+	ld	r8, 16(up)
+	ld	r7, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	mulld	r0,  r10, v0
+	mulhdu	r10, r10, v0
+	mulld	r5,  r8, v0
+	mulhdu	r8,  r8, v0
+	mulld	r9,  r7, v0
+	mulhdu	r7,  r7, v0
+	mulld	r11, r27, v0
+	mulhdu	r27, r27, v0
+	ld	r28, -24(rp)
+	adde	r0, r0, r12
+	ld	r29, -16(rp)
+	adde	r5, r5, r10
+	ld	r30, -8(rp)
+	ld	r31, 0(rp)
+	adde	r9, r9, r8
+	adde	r11, r11, r7
+	addze	r12, r27
+	ADDSUB	r0, r0, r28
+	std	r0, -24(rp)
+	ADDSUBC	r5, r5, r29
+L(l3):	std	r5, -16(rp)
+	ADDSUBC	r9, r9, r30
+L(l2):	std	r9, -8(rp)
+	ADDSUBC	r11, r11, r31
+L(l1):	std	r11, 0(rp)
+	bdnz	L(top)
+
+AM(`	addze	r3, r12')
+SM(`	subfe	r11, r0, r0')		C complement...
+	ld	r31, -8(r1)
+SM(`	subf	r3, r11, r12')
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h
new file mode 100644
index 0000000..c7e2f89
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p6/gmp-mparam.h
@@ -0,0 +1,160 @@
+/* POWER6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3500 MHz POWER6 (kolga.bibsys.no) */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      6
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                50
+#define MUL_TOOM44_THRESHOLD               106
+#define MUL_TOOM6H_THRESHOLD               274
+#define MUL_TOOM8H_THRESHOLD               339
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      62
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      76
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      66
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      88
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 49
+#define SQR_TOOM4_THRESHOLD                130
+#define SQR_TOOM6_THRESHOLD                226
+#define SQR_TOOM8_THRESHOLD                272
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               14
+
+#define MUL_FFT_MODF_THRESHOLD             380  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    340, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     21, 9}, {     11, 8}, {     25, 9}, {     15, 8}, \
+    {     33, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     63,10}, {     47,11}, \
+    {     31,10}, {     71,11}, {     47,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    135, 9}, {    271,11}, {     79, 9}, {    319, 8}, \
+    {    639,10}, {    175,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207,12}, {     63,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,10}, \
+    {    319, 9}, {    639,11}, {    175,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,10}, {    415,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 79
+#define MUL_FFT_THRESHOLD                 3520
+
+#define SQR_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    280, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     21, 9}, {     11, 8}, {     25, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     47,11}, {     15,10}, {     31, 9}, \
+    {     63,10}, {     47,11}, {     31,10}, {     71, 9}, \
+    {    143,11}, {     47,12}, {     31,11}, {     63, 9}, \
+    {    255, 8}, {    511, 9}, {    271,10}, {    143,11}, \
+    {     79,10}, {    159, 9}, {    319,10}, {    175, 9}, \
+    {    351,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511, 8}, {   1023,10}, {    271, 9}, {    543,11}, \
+    {    143,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,10}, {    351,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 80
+#define SQR_FFT_THRESHOLD                 2752
+
+#define MULLO_BASECASE_THRESHOLD             5
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD             2995
+
+#define DC_DIV_QR_THRESHOLD                 59
+#define DC_DIVAPPR_Q_THRESHOLD             200
+#define DC_BDIV_QR_THRESHOLD                70
+#define DC_BDIV_Q_THRESHOLD                168
+
+#define INV_MULMOD_BNM1_THRESHOLD           53
+#define INV_NEWTON_THRESHOLD               170
+#define INV_APPR_THRESHOLD                 166
+
+#define BINV_NEWTON_THRESHOLD              220
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD             942
+#define MUPI_DIV_QR_THRESHOLD               57
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD               1078
+
+#define POWM_SEC_TABLE  4,26,216,804,1731
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                     106
+#define HGCD_APPR_THRESHOLD                109
+#define HGCD_REDUCE_THRESHOLD             2205
+#define GCD_DC_THRESHOLD                   492
+#define GCDEXT_DC_THRESHOLD                327
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               537
+#define SET_STR_PRECOMPUTE_THRESHOLD      1576
+
+#define FAC_DSC_THRESHOLD                  426
+#define FAC_ODD_THRESHOLD                    0  /* always */
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm b/third_party/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm
new file mode 100644
index 0000000..3d32b46
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p6/mul_basecase.asm
@@ -0,0 +1,589 @@
+dnl  PowerPC-64 mpn_mul_basecase.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 ?
+C POWER6		12.25
+
+C TODO
+C  * Reduce register usage.  At least 4 fewer registers could be used.
+C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
+C    would bring us to 9 c/l.
+C  * The bdz insns for b1 and b2 will never branch.
+C  * Align things better, perhaps by moving things like pointer updates from
+C    before to after loops.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0',	   `r25')
+define(`outer_rp', `r22')
+define(`outer_up', `r23')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+
+C Special code for un <= 2, both for the efficiency of these important
+C cases and because it simplifies the default code.
+	cmpdi	cr0, un, 2
+	bgt	cr0, L(un_gt2)
+	cmpdi	cr6, vn, 1
+	ld	r7, 0(vp)
+	ld	r5, 0(up)
+	mulld	r8, r5, r7	C weight 0
+	mulhdu	r9, r5, r7	C weight 1
+	std	r8, 0(rp)
+	beq	cr0, L(2x)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+L(2x):	ld	r0, 8(up)
+	mulld	r8, r0, r7	C weight 1
+	mulhdu	r10, r0, r7	C weight 2
+	addc	r9, r9, r8
+	addze	r10, r10
+	bne	cr6, L(2x2)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	blr
+	ALIGN(16)
+L(2x2):	ld	r6, 8(vp)
+	nop
+	mulld	r8, r5, r6	C weight 1
+	mulhdu	r11, r5, r6	C weight 2
+	mulld	r12, r0, r6	C weight 2
+	mulhdu	r0, r0, r6	C weight 3
+	addc	r9, r9, r8
+	std	r9, 8(rp)
+	adde	r11, r11, r10
+	addze	r0, r0
+	addc	r11, r11, r12
+	addze	r0, r0
+	std	r11, 16(rp)
+	std	r0, 24(rp)
+	blr
+
+L(un_gt2):
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+	std	r22, -80(r1)
+	std	r21, -88(r1)
+	std	r20, -96(r1)
+
+	mr	outer_rp, rp
+	mr	outer_up, up
+
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, 0(up)
+
+	rldicl.	r0, un, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	un, un, 4	C compute count...
+	srdi	un, un, 2	C ...for ctr
+	mtctr	un		C copy inner loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
+
+
+	ALIGN(16)
+L(b3):
+	ld	r27, 8(up)
+	ld	r20, 16(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r10, r20, v0
+	addc	r24, r24, r31
+	adde	r9, r9, r8
+	addze	r12, r10
+	std	r0, 0(rp)
+	std	r24, 8(rp)
+	std	r9, 16(rp)
+	addi	up, up, 16
+	addi	rp, rp, 16
+	bdz	L(end_m_3)
+
+	ALIGN(32)
+L(lo_m_3):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_3)
+
+	ALIGN(16)
+L(end_m_3):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_3):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 24
+	addi	up, outer_up, 16
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, -16(up)
+	ld	r27, -8(up)
+	ld	r20, 0(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r10, r20, v0
+	ld	r28, -16(rp)
+	ld	r29, -8(rp)
+	ld	r30, 0(rp)
+	addc	r24, r24, r31
+	adde	r9, r9, r8
+	addze	r12, r10
+	addc	r0, r0, r28
+	std	r0, -16(rp)
+	adde	r24, r24, r29
+	std	r24, -8(rp)
+	adde	r9, r9, r30
+	std	r9, 0(rp)
+	bdz	L(end_3)
+
+	ALIGN(32)		C registers dying
+L(lo_3):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_3)		C
+
+	ALIGN(16)
+L(end_3):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_3)
+	b	L(ret)
+
+
+	ALIGN(16)
+L(b1):
+	mulld	r0, r26, v0
+	mulhdu	r12, r26, v0
+	addic	r0, r0, 0
+	std	r0, 0(rp)
+	bdz	L(end_m_1)
+
+	ALIGN(16)
+L(lo_m_1):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_1)
+
+	ALIGN(16)
+L(end_m_1):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_1):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 8
+	mr	up, outer_up
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, 0(up)
+	ld	r28, 0(rp)
+	mulld	r0, r26, v0
+	mulhdu	r12, r26, v0
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	bdz	L(end_1)
+
+	ALIGN(32)		C registers dying
+L(lo_1):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_1)		C
+
+	ALIGN(16)
+L(end_1):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_1)
+	b	L(ret)
+
+
+	ALIGN(16)
+L(b0):
+	addi	up, up, -8
+	addi	rp, rp, -8
+	li	r12, 0
+	addic	r12, r12, 0
+	bdz	L(end_m_0)
+
+	ALIGN(16)
+L(lo_m_0):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_0)
+
+	ALIGN(16)
+L(end_m_0):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_0):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 0
+	addi	up, outer_up, -8
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	li	r12, 0
+	addic	r12, r12, 0
+	bdz	L(end_0)
+
+	ALIGN(32)		C registers dying
+L(lo_0):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_0)		C
+
+	ALIGN(16)
+L(end_0):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_0)
+	b	L(ret)
+
+
+	ALIGN(16)
+L(b2):	ld	r27, 8(up)
+	addi	up, up, 8
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	addc	r24, r24, r10
+	addze	r12, r8
+	std	r0, 0(rp)
+	std	r24, 8(rp)
+	addi	rp, rp, 8
+	bdz	L(end_m_2)
+
+	ALIGN(16)
+L(lo_m_2):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)
+	ld	r21, 32(up)
+	mulld	r0, r26, v0
+	mulhdu	r31, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	mulld	r9, r20, v0
+	mulhdu	r27, r20, v0
+	mulld	r11, r21, v0
+	mulhdu	r26, r21, v0
+	adde	r0, r0, r12
+	adde	r24, r24, r31
+	std	r0, 8(rp)
+	adde	r9, r9, r8
+	std	r24, 16(rp)
+	adde	r11, r11, r27
+	std	r9, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	mr	r12, r26
+	bdnz	L(lo_m_2)
+
+	ALIGN(16)
+L(end_m_2):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	beq	L(ret)
+
+	ALIGN(16)
+L(outer_lo_2):
+	mtctr	un		C copy inner loop count into ctr
+	addi	rp, outer_rp, 16
+	addi	up, outer_up, 8
+	addi	outer_rp, outer_rp, 8
+	ld	v0, 0(vp)	C new v limb
+	addi	vp, vp, 8
+	ld	r26, -8(up)
+	ld	r27, 0(up)
+	ld	r28, -8(rp)
+	ld	r29, 0(rp)
+	mulld	r0, r26, v0
+	mulhdu	r10, r26, v0
+	mulld	r24, r27, v0
+	mulhdu	r8, r27, v0
+	addc	r24, r24, r10
+	addze	r12, r8
+	addc	r0, r0, r28
+	std	r0, -8(rp)
+	adde	r24, r24, r29
+	std	r24, 0(rp)
+	bdz	L(end_2)
+
+	ALIGN(16)		C registers dying
+L(lo_2):
+	ld	r26, 8(up)
+	ld	r27, 16(up)
+	ld	r20, 24(up)	C
+	ld	r21, 32(up)	C
+	addi	up, up, 32	C
+	addi	rp, rp, 32	C
+	mulld	r0, r26, v0	C
+	mulhdu	r10, r26, v0	C 26
+	mulld	r24, r27, v0	C
+	mulhdu	r8, r27, v0	C 27
+	mulld	r9, r20, v0	C
+	mulhdu	r27, r20, v0	C 26
+	mulld	r11, r21, v0	C
+	mulhdu	r26, r21, v0	C 27
+	ld	r28, -24(rp)	C
+	adde	r0, r0, r12	C 0 12
+	ld	r29, -16(rp)	C
+	adde	r24, r24, r10	C 24 10
+	ld	r30, -8(rp)	C
+	ld	r31, 0(rp)	C
+	adde	r9, r9, r8	C 8 9
+	adde	r11, r11, r27	C 27 11
+	addze	r12, r26	C 26
+	addc	r0, r0, r28	C 0 28
+	std	r0, -24(rp)	C 0
+	adde	r24, r24, r29	C 7 29
+	std	r24, -16(rp)	C 7
+	adde	r9, r9, r30	C 9 30
+	std	r9, -8(rp)	C 9
+	adde	r11, r11, r31	C 11 31
+	std	r11, 0(rp)	C 11
+	bdnz	L(lo_2)		C
+
+	ALIGN(16)
+L(end_2):
+	addze	r12, r12
+	addic.	vn, vn, -1
+	std	r12, 8(rp)
+	bne	L(outer_lo_2)
+C	b	L(ret)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	ld	r25, -56(r1)
+	ld	r24, -64(r1)
+	ld	r23, -72(r1)
+	ld	r22, -80(r1)
+	ld	r21, -88(r1)
+	ld	r20, -96(r1)
+	blr
+EPILOGUE()
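
For reference, the schoolbook structure that the hand-scheduled code above implements: one mul_1-style pass for the first v limb, then vn-1 addmul passes, with the inner loop 4-way unrolled and the un mod 4 remainder dispatched to the b0/b1/b2/b3 entries. A portable sketch with hypothetical helpers, assuming 64-bit limbs and a GCC/Clang-style unsigned __int128:

    #include <stdint.h>
    typedef uint64_t limb;

    static limb
    mul_1 (limb *rp, const limb *up, long n, limb v)     /* rp[] = up[]*v */
    {
      limb cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v + cy;
          rp[i] = (limb) p;
          cy = (limb) (p >> 64);
        }
      return cy;
    }

    static limb
    addmul_1 (limb *rp, const limb *up, long n, limb v)  /* rp[] += up[]*v */
    {
      limb cy = 0;
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v + rp[i] + cy;
          rp[i] = (limb) p;
          cy = (limb) (p >> 64);
        }
      return cy;
    }

    static void
    mul_basecase (limb *rp, const limb *up, long un, const limb *vp, long vn)
    {
      rp[un] = mul_1 (rp, up, un, vp[0]);                /* first row */
      for (long j = 1; j < vn; j++)                      /* outer loop over v */
        rp[un + j] = addmul_1 (rp + j, up, un, vp[j]);
    }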
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm
new file mode 100644
index 0000000..8731e01
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/aormul_2.asm
@@ -0,0 +1,135 @@
+dnl  PowerPC-64 mpn_mul_2 and mpn_addmul_2.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                    cycles/limb    cycles/limb
+C			mul_2         addmul_2
+C POWER3/PPC630		 ?		 ?
+C POWER4/PPC970		 ?		 ?
+C POWER5		 ?		 ?
+C POWER6		 ?		 ?
+C POWER7-SMT4		 3		 3
+C POWER7-SMT2		 ?		 ?
+C POWER7-SMT1		 ?		 ?
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`vp', `r6')
+
+define(`cy0', `r10')
+ifdef(`EXTRA_REGISTER',
+` define(`cy1', EXTRA_REGISTER)',
+` define(`cy1', `r31')')
+
+ifdef(`OPERATION_mul_2',`
+  define(`AM',		`')
+  define(`ADDX',	`addc')
+  define(`func',	`mpn_mul_2')
+')
+ifdef(`OPERATION_addmul_2',`
+  define(`AM',		`$1')
+  define(`ADDX',	`adde')
+  define(`func',	`mpn_addmul_2')
+')
+
+MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
+
+ASM_START()
+PROLOGUE(func)
+
+ifdef(`EXTRA_REGISTER',,`
+	std	r31, -8(r1)
+')
+	andi.	r12, n, 1
+	addi	r0, n, 1
+	srdi	r0, r0, 1
+	mtctr	r0
+	ld	r11, 0(vp)		C v0
+	li	cy0, 0
+	ld	r12, 8(vp)		C v1
+	li	cy1, 0
+	ld	r5, 0(up)
+	beq	L(lo0)
+	addi	up, up, -8
+	addi	rp, rp, -8
+	b	L(lo1)
+
+	ALIGN(32)
+L(top):
+AM(`	ld	r0, -8(rp)')
+	ld	r5, 0(up)
+AM(`	addc	r6, r6, r0')
+	ADDX	r7, r7, r8
+	addze	r9, r9
+	addc	r6, r6, cy0
+	adde	cy0, r7, cy1
+	std	r6, -8(rp)
+	addze	cy1, r9
+L(lo0):	mulld	r6, r11, r5		C v0 * u[i]  weight 0
+	mulhdu	r7, r11, r5		C v0 * u[i]  weight 1
+	mulld	r8, r12, r5		C v1 * u[i]  weight 1
+	mulhdu	r9, r12, r5		C v1 * u[i]  weight 2
+AM(`	ld	r0, 0(rp)')
+	ld	r5, 8(up)
+AM(`	addc	r6, r6, r0')
+	ADDX	r7, r7, r8
+	addze	r9, r9
+	addc	r6, r6, cy0
+	adde	cy0, r7, cy1
+	std	r6, 0(rp)
+	addze	cy1, r9
+L(lo1):	mulld	r6, r11, r5		C v0 * u[i]  weight 0
+	mulhdu	r7, r11, r5		C v0 * u[i]  weight 1
+	addi	up, up, 16
+	addi	rp, rp, 16
+	mulld	r8, r12, r5		C v1 * u[i]  weight 1
+	mulhdu	r9, r12, r5		C v1 * u[i]  weight 2
+	bdnz	L(top)
+
+L(end):
+AM(`	ld	r0, -8(rp)')
+AM(`	addc	r6, r6, r0')
+	ADDX	r7, r7, r8
+	addze	r9, r9
+	addc	r6, r6, cy0
+	std	r6, -8(rp)
+	adde	cy0, r7, cy1
+	addze	cy1, r9
+	std	cy0, 0(rp)
+	mr	r3, cy1
+
+ifdef(`EXTRA_REGISTER',,`
+	ld	r31, -8(r1)
+')
+	blr
+EPILOGUE()
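
Both operations assemble from this one source: OPERATION_mul_2 elides the AM() lines and makes ADDX an addc, while OPERATION_addmul_2 keeps the rp loads and uses adde so the rp addition joins the carry chain. What the addmul_2 variant computes, as a portable sketch (hypothetical helper; 64-bit limbs, with the two carry limbs cy0/cy1 kept together in one unsigned __int128):

    #include <stdint.h>
    typedef uint64_t limb;

    /* Adds up[0..n-1] * {v0,v1} to rp; stores limbs 0..n, returns the top limb. */
    static limb
    addmul_2 (limb *rp, const limb *up, long n, limb v0, limb v1)
    {
      unsigned __int128 cy = 0;         /* cy0 in the low half, cy1 in the high */
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 p0 = (unsigned __int128) up[i] * v0;  /* weight i   */
          unsigned __int128 p1 = (unsigned __int128) up[i] * v1;  /* weight i+1 */
          unsigned __int128 s  = p0 + (limb) cy + rp[i];
          rp[i] = (limb) s;
          cy = p1 + (cy >> 64) + (s >> 64);
        }
      rp[n] = (limb) cy;
      return (limb) (cy >> 64);
    }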
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/aors_n.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/aors_n.asm
new file mode 100644
index 0000000..857c701
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/aors_n.asm
@@ -0,0 +1,128 @@
+dnl  PowerPC-64 mpn_add_n, mpn_sub_n optimised for POWER7.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 ?
+C POWER6		 ?
+C POWER7		 2.18
+
+C This is a tad slower than the cnd_aors_n.asm code, which is of course
+C an anomaly.
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_add_n)
+  define(func_nc,	mpn_add_nc)
+  define(GENRVAL,	`addi	r3, r3, 1')
+  define(SETCBR,	`addic	r0, $1, -1')
+  define(CLRCB,		`addic	r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_sub_n)
+  define(func_nc,	mpn_sub_nc)
+  define(GENRVAL,	`neg	r3, r3')
+  define(SETCBR,	`subfic	r0, $1, 0')
+  define(CLRCB,		`addic	r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`vp',	`r5')
+define(`n',	`r6')
+
+ASM_START()
+PROLOGUE(func_nc)
+	SETCBR(r7)
+	b	L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+	CLRCB
+L(ent):
+	andi.	r7, n, 1
+	beq	L(bx0)
+
+L(bx1):	ld	r7, 0(up)
+	ld	r9, 0(vp)
+	ADDSUBC	r11, r9, r7
+	std	r11, 0(rp)
+	cmpldi	cr6, n, 1
+	beq	cr6, L(end)
+	addi	up, up, 8
+	addi	vp, vp, 8
+	addi	rp, rp, 8
+
+L(bx0):	addi	r0, n, 2	C compute branch...
+	srdi	r0, r0, 2	C ...count
+	mtctr	r0
+
+	andi.	r7, n, 2
+	bne	L(mid)
+
+	addi	up, up, 16
+	addi	vp, vp, 16
+	addi	rp, rp, 16
+
+	ALIGN(32)
+L(top):	ld	r6, -16(up)
+	ld	r7, -8(up)
+	ld	r8, -16(vp)
+	ld	r9, -8(vp)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, -16(rp)
+	std	r11, -8(rp)
+L(mid):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	ld	r8, 0(vp)
+	ld	r9, 8(vp)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, 0(rp)
+	std	r11, 8(rp)
+	addi	up, up, 32
+	addi	vp, vp, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	subfe	r3, r0, r0	C -cy
+	GENRVAL
+	blr
+EPILOGUE()
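
All four entry points assemble from this one source: the _nc variants enter through SETCBR, which seeds the CA bit from the caller's carry/borrow argument, while CLRCB establishes the no-carry state, and the final subfe folds the flag into r3 (CA - 1) for GENRVAL to normalize into the 0/1 return value. The reference semantics, ignoring the 4-way unrolling (a simple sketch; add_nc shown, sub_nc is the same with a borrow chain):

    #include <stdint.h>
    typedef uint64_t limb;

    /* rp[] = up[] + vp[] + cy_in; returns the carry-out (0 or 1). */
    static limb
    add_nc (limb *rp, const limb *up, const limb *vp, long n, limb cy)
    {
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 s = (unsigned __int128) up[i] + vp[i] + cy;
          rp[i] = (limb) s;
          cy = (limb) (s >> 64);
        }
      return cy;
    }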
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..ddf5fd8
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl  PowerPC-64 mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n.
+
+dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,		1)
+define(RSH,		63)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..3f9d88d
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl  PowerPC-64 mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n.
+
+dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,		2)
+define(RSH,		62)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`powerpc64/mode64/p7/aorsorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
new file mode 100644
index 0000000..5251202
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/aorsorrlshC_n.asm
@@ -0,0 +1,129 @@
+dnl  PowerPC-64 mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n.
+
+dnl  Copyright 2003, 2005, 2009, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 ?
+C POWER6                 ?
+C POWER7                 2.5
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n',  `r6')
+
+ifdef(`DO_add', `
+  define(`ADDSUBC',	`addc	$1, $2, $3')
+  define(`ADDSUBE',	`adde	$1, $2, $3')
+  define(INITCY,	`addic	$1, r1, 0')
+  define(RETVAL,	`addze	r3, $1')
+  define(`func',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADDSUBC',	`subfc	$1, $2, $3')
+  define(`ADDSUBE',	`subfe	$1, $2, $3')
+  define(INITCY,	`addic	$1, r1, -1')
+  define(RETVAL,	`subfze	r3, $1
+			neg	r3, r3')
+  define(`func',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADDSUBC',	`subfc	$1, $3, $2')
+  define(`ADDSUBE',	`subfe	$1, $3, $2')
+  define(INITCY,	`addic	$1, r1, -1')
+  define(RETVAL,	`addme	r3, $1')
+  define(`func',	mpn_rsblsh`'LSH`'_n)')
+
+define(`s0', `r0')  define(`s1', `r9')
+define(`u0', `r6')  define(`u1', `r7')
+define(`v0', `r10') define(`v1', `r11')
+
+
+ASM_START()
+PROLOGUE(func)
+	rldic	r7, n, 3, 59
+	add	up, up, r7
+	add	vp, vp, r7
+	add	rp, rp, r7
+
+ifdef(`DO_add', `
+	addic	r0, n, 3	C set cy flag as side effect
+',`
+	subfc	r0, r0, r0	C set cy flag
+	addi	r0, n, 3
+')
+	srdi	r0, r0, 2
+	mtctr	r0
+
+	andi.	r0, n, 1
+	beq	L(bx0)
+
+L(bx1):	andi.	r0, n, 2
+	li	s0, 0
+	bne	L(lo3)
+	b	L(lo1)
+
+L(bx0):	andi.	r0, n, 2
+	li	s1, 0
+	bne	L(lo2)
+
+	ALIGN(32)
+L(top):	addi	rp, rp, 32
+	ld	v0, 0(vp)
+	addi	vp, vp, 32
+	rldimi	s1, v0, LSH, 0
+	ld	u0, 0(up)
+	addi	up, up, 32
+	srdi	s0, v0, RSH
+	ADDSUBE(s1, s1, u0)
+	std	s1, -32(rp)
+L(lo3):	ld	v1, -24(vp)
+	rldimi	s0, v1, LSH, 0
+	ld	u1, -24(up)
+	srdi	s1, v1, RSH
+	ADDSUBE(s0, s0, u1)
+	std	s0, -24(rp)
+L(lo2):	ld	v0, -16(vp)
+	rldimi	s1, v0, LSH, 0
+	ld	u0, -16(up)
+	srdi	s0, v0, RSH
+	ADDSUBE(s1, s1, u0)
+	std	s1, -16(rp)
+L(lo1):	ld	v1, -8(vp)
+	rldimi	s0, v1, LSH, 0
+	ld	u1, -8(up)
+	srdi	s1, v1, RSH
+	ADDSUBE(s0, s0, u1)
+	std	s0, -8(rp)
+	bdnz	L(top)		C decrement CTR and loop back
+
+	RETVAL(	s1)
+	blr
+EPILOGUE()
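
The lsh1/lsh2 wrappers above only pin LSH and RSH (with LSH + RSH = 64) before including this body, which stitches each shifted v limb together from two sources: rldimi merges (vp[i] << LSH) with the RSH-aligned spill of the previous limb that srdi saved. A portable sketch of the add flavour (hypothetical helper; the sub and rsb flavours differ only in the carried operation):

    #include <stdint.h>
    typedef uint64_t limb;

    /* rp[] = up[] + (vp[] << c), 0 < c < 64; returns the high limb. */
    static limb
    addlshC (limb *rp, const limb *up, const limb *vp, long n, int c)
    {
      limb spill = 0;                    /* bits shifted out of vp[i-1] */
      limb cy = 0;
      for (long i = 0; i < n; i++)
        {
          limb v = (vp[i] << c) | spill; /* rldimi */
          spill = vp[i] >> (64 - c);     /* srdi   */
          unsigned __int128 s = (unsigned __int128) up[i] + v + cy;
          rp[i] = (limb) s;
          cy = (limb) (s >> 64);
        }
      return spill + cy;                 /* RETVAL: addze r3, s1 */
    }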
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm
new file mode 100644
index 0000000..f04e896
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/gcd_11.asm
@@ -0,0 +1,67 @@
+dnl  PowerPC-64 mpn_gcd_11.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/bit (approx)
+C POWER3/PPC630		 -
+C POWER4/PPC970		 -
+C POWER5		 -
+C POWER6		 -
+C POWER7		 7.6    obsolete
+C POWER8		 ?
+C POWER9		 ?
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
+
+C INPUT PARAMETERS
+define(`u0',    `r3')
+define(`v0',    `r4')
+
+define(`cnt',  `r9')dnl
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	li	r12, 63
+	b	L(odd)
+
+	ALIGN(16)
+L(top):	and	r8, r11, r10		C isolate lsb
+	cntlzd	cnt, r8
+	isel	v0, u0, v0, 29		C v = min(u,v)
+	isel	u0, r10, r11, 29	C u = |u - v|
+	subf	cnt, cnt, r12		C cnt = 63-cnt
+	srd	u0, u0, cnt
+L(odd):	cmpld	cr7, v0, u0
+	subf	r10, u0, v0		C r10 = v - u
+	subf	r11, v0, u0		C r11 = u - v
+	bne	cr7, L(top)
+
+L(end):	blr
+EPILOGUE()
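
The loop is a branch-light binary GCD step: both differences are formed unconditionally, isel picks min(u,v) and |u - v| from the comparison, and the trailing-zero count of u - v comes from cntlzd of (u-v) & (v-u), which isolates the lowest set bit (the x & -x idiom), so 63 - cntlzd equals the trailing-zero count. In portable form (a sketch assuming odd, nonzero inputs, as the mpn_gcd_11 contract requires):

    #include <stdint.h>
    typedef uint64_t limb;

    static limb
    gcd_11 (limb u, limb v)              /* u, v odd */
    {
      while (u != v)
        {
          limb d1 = u - v, d0 = v - u;   /* r11, r10 */
          int cnt = __builtin_ctzll (d0 & d1);  /* ctz of |u - v| */
          limb min = u < v ? u : v;      /* isel on cr7 */
          limb abs = u < v ? d0 : d1;    /* |u - v| */
          v = min;
          u = abs >> cnt;                /* keep u odd */
        }
      return u;
    }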
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm b/third_party/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm
new file mode 100644
index 0000000..ade30e4
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/gcd_22.asm
@@ -0,0 +1,146 @@
+dnl  PowerPC-64 mpn_gcd_22 optimised for POWER7 and POWER8.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/bit (approx)
+C POWER3/PPC630		 -
+C POWER4/PPC970		 -
+C POWER5		 -
+C POWER6		 -
+C POWER7		12.3
+C POWER8		13.4
+C POWER9		10.6
+
+C We define SLOW if this target uses a slow struct return mechanism, with
+C r3 as an implicit parameter for the struct pointer.
+undefine(`SLOW')dnl
+ifdef(`AIX',`define(`SLOW',`due to AIX')',`
+  ifdef(`DARWIN',,`
+    ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl
+  ')
+')
+
+ifdef(`SLOW',`
+define(`IFSLOW', `$1')
+define(`u1',    `r4')
+define(`u0',    `r5')
+define(`v1',    `r6')
+define(`v0',    `r7')
+',`
+define(`IFSLOW', `')
+define(`u1',    `r3')
+define(`u0',    `r4')
+define(`v1',    `r5')
+define(`v0',    `r6')
+')
+
+define(`tmp',   `r0')
+define(`t0',    `r8')
+define(`t1',    `r9')
+define(`s0',    `r10')
+define(`s1',    `r11')
+define(`cnt',   `r12')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+L(top):	subfc.	t0, v0, u0		C 0 12
+	beq	cr0, L(lowz)
+	subfe	t1, v1, u1		C 2 14
+	subfe.	tmp, tmp, tmp		C 4	set cr0 from the carry bit
+	subfc	s0, u0, v0		C 0
+	subfe	s1, u1, v1		C 2
+
+L(bck):	and	tmp, s0, t0		C 2
+	cntlzd	cnt, tmp		C 4
+	addi	tmp, cnt, 1		C 6
+	subfic	cnt, cnt, 63		C 6
+
+	isel	v0, v0, u0, 2		C 6	use condition set by subfe
+	isel	v1, v1, u1, 2		C 6
+	isel	u0, t0, s0, 2		C 6
+	isel	u1, t1, s1, 2		C 6
+
+	srd	u0, u0, cnt		C 8
+	sld	tmp, u1, tmp		C 8
+	srd	u1, u1, cnt		C 8
+	or	u0, u0, tmp		C 10
+
+	or.	r0, u1, v1		C 10
+	bne	L(top)
+
+
+	li	r0, 63
+	b	L(odd)
+	ALIGN(16)
+L(top1):isel	v0, u0, v0, 29		C v = min(u,v)
+	isel	u0, r10, r11, 29	C u = |u - v|
+	subf	cnt, cnt, r0		C cnt = 63-cnt
+	srd	u0, u0, cnt
+L(odd):	subf	r10, u0, v0		C r10 = v - u
+	subf	r11, v0, u0		C r11 = u - v
+	cmpld	cr7, v0, u0
+	and	r8, r11, r10		C isolate lsb
+	cntlzd	cnt, r8
+	bne	cr7, L(top1)
+
+ifdef(`SLOW',`
+	std	v0, 0(r3)
+	std	r10, 8(r3)		C zero
+',`
+	mr	r3, v0
+	li	r4, 0
+')
+	blr
+
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subfc.	t0, v1, u1		C 2 8
+	beq	L(end)
+	li	t1, 0
+	subfe.	tmp, tmp, tmp		C 4	set cr0 from the carry bit
+	subf	s0, u1, v1		C 2
+	li	s1, 0
+	b	L(bck)
+
+L(end):
+ifdef(`SLOW',`
+	std	v0, 0(r3)
+	std	v1, 8(r3)
+	blr
+',`
+	mr	r3, v0
+	mr	r4, v1
+	blr
+')
+EPILOGUE()
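
One level up from gcd_11, each iteration reduces 128-bit operands: both double-limb differences are computed with subfc/subfe, isel selects on the borrow, and the srd/sld pair shifts the two-limb |u - v| right across the limb boundary. L(lowz) handles the case where the low limbs cancel, and the L(top1) tail is the single-limb loop once both high limbs are zero. A collapsed high-level sketch, folding both phases into one loop (inputs odd; GCC/Clang unsigned __int128 assumed):

    #include <stdint.h>
    typedef uint64_t limb;
    typedef unsigned __int128 u128;

    static u128
    gcd_22 (u128 u, u128 v)              /* u, v odd */
    {
      while (u != v)
        {
          u128 d = u > v ? u - v : v - u;
          v = u < v ? u : v;
          limb lo = (limb) d;
          int cnt = lo ? __builtin_ctzll (lo)
                       : 64 + __builtin_ctzll ((limb) (d >> 64));
          u = d >> cnt;
        }
      return u;
    }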
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h
new file mode 100644
index 0000000..5fa62cf
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p7/gmp-mparam.h
@@ -0,0 +1,174 @@
+/* POWER7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3720 MHz POWER7/SMT4 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-10-02, gcc 4.8 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_1N_PI1_METHOD                 1  /* 3.47% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             341
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                71
+#define MUL_TOOM44_THRESHOLD               196
+#define MUL_TOOM6H_THRESHOLD               298
+#define MUL_TOOM8H_THRESHOLD               406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     140
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     132
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     139
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     120
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                105
+#define SQR_TOOM4_THRESHOLD                190
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             56
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               20
+
+#define MUL_FFT_MODF_THRESHOLD             436  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    436, 5}, {     21, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     33, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     21, 9}, {     11, 8}, \
+    {     29, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     43,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     79,11}, {     47,10}, {    103,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    159,11}, {     95,10}, {    191, 9}, {    383,11}, \
+    {    111,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 83
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    368, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     12, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
+    {    575,10}, {    303,11}, {    159,10}, {    319, 9}, \
+    {    639,12}, {     95,11}, {    191,10}, {    383, 9}, \
+    {    767,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 84
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  35
+#define MULLO_MUL_N_THRESHOLD             9449
+#define SQRLO_BASECASE_THRESHOLD             3
+#define SQRLO_DC_THRESHOLD                 119
+#define SQRLO_SQR_THRESHOLD               6440
+
+#define DC_DIV_QR_THRESHOLD                 33
+#define DC_DIVAPPR_Q_THRESHOLD             124
+#define DC_BDIV_QR_THRESHOLD                62
+#define DC_BDIV_Q_THRESHOLD                144
+
+#define INV_MULMOD_BNM1_THRESHOLD           67
+#define INV_NEWTON_THRESHOLD               123
+#define INV_APPR_THRESHOLD                 123
+
+#define BINV_NEWTON_THRESHOLD              284
+#define REDC_1_TO_REDC_2_THRESHOLD          18
+#define REDC_2_TO_REDC_N_THRESHOLD         109
+
+#define MU_DIV_QR_THRESHOLD               1387
+#define MU_DIVAPPR_Q_THRESHOLD            1334
+#define MUPI_DIV_QR_THRESHOLD               50
+#define MU_BDIV_QR_THRESHOLD              1308
+#define MU_BDIV_Q_THRESHOLD               1499
+
+#define POWM_SEC_TABLE  1,23,121,579,642
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD              1562
+#define SET_STR_PRECOMPUTE_THRESHOLD      3100
+
+#define FAC_DSC_THRESHOLD                  774
+#define FAC_ODD_THRESHOLD                   25
+
+#define MATRIX22_STRASSEN_THRESHOLD         18
+#define HGCD2_DIV1_METHOD                    5  /* 3.27% faster than 3 */
+#define HGCD_THRESHOLD                     118
+#define HGCD_APPR_THRESHOLD                150
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                365
+#define JACOBI_BASE_METHOD                   4  /* 27.64% faster than 1 */
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h
new file mode 100644
index 0000000..ed4db28
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p8/gmp-mparam.h
@@ -0,0 +1,170 @@
+/* POWER8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 4150 MHz POWER8/SMT4 */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-09-24, gcc 7.2 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_1N_PI1_METHOD                 2  /* 16.97% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD               9
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           34
+
+#define DIV_1_VS_MUL_1_PERCENT             276
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               278
+#define MUL_TOOM8H_THRESHOLD               406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     131
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     121
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     138
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     106
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                303
+#define SQR_TOOM8_THRESHOLD                454
+
+#define MULMID_TOOM42_THRESHOLD             42
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    404, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     12, 6}, \
+    {     25, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     63,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    131,10}, \
+    {     79,11}, {     47,10}, {     95,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    135,11}, \
+    {     79,10}, {    159,11}, {     95, 8}, {    767, 7}, \
+    {   1599,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
+    {    575,11}, {    159,12}, {     95,11}, {    191,10}, \
+    {    383,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 80
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,11}, {     79, 9}, {    319,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,10}, \
+    {    319,12}, {     95,11}, {    191,10}, {    383,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 71
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD             9174
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 114
+#define SQRLO_SQR_THRESHOLD               6461
+
+#define DC_DIV_QR_THRESHOLD                 38
+#define DC_DIVAPPR_Q_THRESHOLD             158
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                112
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               132
+#define INV_APPR_THRESHOLD                 131
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_2_THRESHOLD          56
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               46
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  3,19,117,672,1867
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD               608
+#define SET_STR_PRECOMPUTE_THRESHOLD      2405
+
+#define FAC_DSC_THRESHOLD                  164
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD2_DIV1_METHOD                    1  /* 6.88% faster than 3 */
+#define HGCD_THRESHOLD                     114
+#define HGCD_APPR_THRESHOLD                118
+#define HGCD_REDUCE_THRESHOLD             2205
+#define GCD_DC_THRESHOLD                   440
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   1  /* 0.74% faster than 4 */
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p8/invert_limb.asm b/third_party/gmp/mpn/powerpc64/mode64/p8/invert_limb.asm
new file mode 100644
index 0000000..53ea0e0
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p8/invert_limb.asm
@@ -0,0 +1,53 @@
+dnl  PowerPC-64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb (approximate)
+C POWER3/PPC630         -
+C POWER4/PPC970         -
+C POWER5                -
+C POWER6                -
+C POWER7                ?
+C POWER8               32
+
+C This runs on POWER7 and later, but is faster only on CPUs newer than POWER7.
+C We might want to inline this, considering its small footprint.
+
+ASM_START()
+PROLOGUE(mpn_invert_limb)
+	sldi.	r4, r3, 1
+	neg	r5, r3
+	divdeu(	r3, r5, r3)
+	beq-	L(1)
+	blr
+L(1):	li	r3, -1
+	blr
+EPILOGUE()
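
The value computed is the standard reciprocal floor((B^2 - 1)/d) - B for B = 2^64 and normalized d (high bit set). divdeu returns floor((r5 * B)/d); with r5 = -d = B - d that is floor(B^2/d) - B, which agrees with the definition for every normalized d except d = B/2, the one normalized divisor of B^2 (and the one input that would overflow divdeu). The sldi./beq guard catches exactly that case and returns B - 1. A conceptual equivalent:

    #include <stdint.h>
    typedef uint64_t limb;

    /* Needs 128-bit division; the asm gets it from divdeu instead. */
    static limb
    invert_limb_ref (limb d)             /* d normalized: d >= 2^63 */
    {
      unsigned __int128 b2m1 = ~(unsigned __int128) 0;   /* B^2 - 1 */
      return (limb) (b2m1 / d - ((unsigned __int128) 1 << 64));
    }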
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm
new file mode 100644
index 0000000..2426a00
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/add_n_sub_n.asm
@@ -0,0 +1,112 @@
+dnl  PowerPC-64 mpn_add_n_sub_n optimised for POWER9.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C POWER3/PPC630		 -
+C POWER4/PPC970		 -
+C POWER5		 -
+C POWER6		 -
+C POWER7		 -
+C POWER8		 -
+C POWER9		 2.25
+
+
+C INPUT PARAMETERS
+define(`arp',	`r3')
+define(`srp',	`r4')
+define(`up',	`r5')
+define(`vp',	`r6')
+define(`n',	`r7')
+
+ASM_START()
+PROLOGUE(mpn_add_n_sub_n)
+	cmpdi	cr7, n, 2
+	subfo	r0, r0, r0		C clear OV
+	rldicl.	r9, n, 0, 63		C n & 1
+	beq	cr0, L(bx0)
+
+L(bx1):	ld	r10, 0(up)
+	ld	r11, 0(vp)
+	ble	cr7, L(1)
+	srdi	r7, r7, 1
+	mtctr	r7
+	ld	r8, 8(up)
+	ld	r9, 8(vp)
+	addex(	r0, r10, r11, 0)
+	subfc	r12, r11, r10
+	addi	up, up, -8
+	addi	vp, vp, -8
+	b	L(lo1)
+
+L(bx0):	ld	r8, 0(up)
+	ld	r9, 0(vp)
+	ld	r10, 8(up)
+	ld	r11, 8(vp)
+	addex(	r0, r8, r9, 0)
+	subfc	r12, r9, r8
+	addi	arp, arp, 8
+	addi	srp, srp, 8
+	ble	cr7, L(end)
+	addi	r7, r7, -1
+	srdi	r7, r7, 1
+	mtctr	r7
+
+L(top):	ld	r8, 16(up)
+	ld	r9, 16(vp)
+	std	r0, -8(arp)
+	std	r12, -8(srp)
+	addex(	r0, r10, r11, 0)
+	subfe	r12, r11, r10
+L(lo1):	ld	r10, 24(up)
+	ld	r11, 24(vp)
+	std	r0, 0(arp)
+	std	r12, 0(srp)
+	addex(	r0, r8, r9, 0)
+	subfe	r12, r9, r8
+	addi	up, up, 16
+	addi	vp, vp, 16
+	addi	arp, arp, 16
+	addi	srp, srp, 16
+	bdnz	L(top)
+
+L(end):	std	r0, -8(arp)
+	std	r12, -8(srp)
+L(1):	addex(	r0, r10, r11, 0)
+	subfe	r12, r11, r10
+	std	r0, 0(arp)
+	std	r12, 0(srp)
+	subfe	r3, r3, r3
+	addex(	r3, r3, r3, 0)
+	rldicl	r3, r3, 1, 62
+	blr
+EPILOGUE()
+ASM_END()
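
The trick here is POWER9's addex, which carries through the XER OV bit instead of CA, giving two independent carry chains: the addition runs on OV (addex) while the subtraction runs on CA (subfc/subfe), so both results come out of one pass over the operands. The closing sequence packs the two flags into the return value, addition carry in bit 1 and subtraction borrow in bit 0. Portable semantics (a sketch):

    #include <stdint.h>
    typedef uint64_t limb;

    /* arp[] = up[] + vp[], srp[] = up[] - vp[]; returns 2*carry + borrow. */
    static limb
    add_n_sub_n (limb *arp, limb *srp, const limb *up, const limb *vp, long n)
    {
      limb acy = 0, sby = 0;             /* the OV and CA chains */
      for (long i = 0; i < n; i++)
        {
          unsigned __int128 s = (unsigned __int128) up[i] + vp[i] + acy;
          unsigned __int128 d = (unsigned __int128) up[i] - vp[i] - sby;
          arp[i] = (limb) s;
          srp[i] = (limb) d;
          acy = (limb) (s >> 64);
          sby = (limb) (d >> 64) & 1;
        }
      return (acy << 1) | sby;
    }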
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm
new file mode 100644
index 0000000..8f49606
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/addmul_1.asm
@@ -0,0 +1,130 @@
+dnl  Power9 mpn_addmul_1.
+
+dnl  Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630		 -
+C POWER4/PPC970		 -
+C POWER5		 -
+C POWER6		 -
+C POWER7		 -
+C POWER8		 -
+C POWER9		 2.5
+
+C TODO
+C  * Schedule for Power9 pipeline.
+C  * Unroll 4x if that proves beneficial.
+C  * This is marginally faster (but much smaller) than ../aorsmul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	cmpdi	cr6, n, 2
+	addi	r0, n, -1	C FIXME: postpone
+	srdi	r0, r0, 1	C FIXME: postpone
+	mtctr	r0		C FIXME: postpone
+	rldicl.	r0, n, 0,63	C r0 = n & 3, set cr0
+	bne	cr0, L(b1)
+
+L(b0):	ld	r10, 0(rp)
+	ld	r12, 0(up)
+	ld	r11, 8(rp)
+	ld	r0, 8(up)
+	maddld(	r9, r12, v0, r10)
+	maddhdu(r7, r12, v0, r10)
+	ble	cr6, L(2)
+	ld	r10, 16(rp)
+	ld	r12, 16(up)
+	maddld(	r8, r0, v0, r11)
+	maddhdu(r5, r0, v0, r11)
+	addic	up, up, 16
+	addi	rp, rp, -8
+	b	L(mid)
+
+L(b1):	ld	r11, 0(rp)
+	ld	r0, 0(up)
+	ble	cr6, L(1)
+	ld	r10, 8(rp)
+	ld	r12, 8(up)
+	maddld(	r8, r0, v0, r11)
+	maddhdu(r5, r0, v0, r11)
+	ld	r11, 16(rp)
+	ld	r0, 16(up)
+	maddld(	r9, r12, v0, r10)
+	maddhdu(r7, r12, v0, r10)
+	addic	up, up, 24
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	ld	r10, 24(rp)
+	ld	r12, 0(up)
+	std	r8, 0(rp)
+	adde	r9, r5, r9
+	maddld(	r8, r0, v0, r11)	C W:0,2,4
+	maddhdu(r5, r0, v0, r11)	C W:1,3,5
+L(mid):	ld	r11, 32(rp)
+	ld	r0, 8(up)
+	std	r9, 8(rp)
+	adde	r8, r7, r8
+	maddld(	r9, r12, v0, r10)	C W:1,3,5
+	maddhdu(r7, r12, v0, r10)	C W:2,4,6
+	addi	rp, rp, 16
+	addi	up, up, 16
+	bdnz	L(top)
+
+L(end):	std	r8, 0(rp)
+	maddld(	r8, r0, v0, r11)
+	adde	r9, r5, r9
+	maddhdu(r5, r0, v0, r11)
+	std	r9, 8(rp)
+	adde	r8, r7, r8
+	std	r8, 16(rp)
+	addze	r3, r5
+	blr
+
+L(2):	maddld(	r8, r0, v0, r11)
+	maddhdu(r5, r0, v0, r11)
+	std	r9, 0(rp)
+	addc	r8, r7, r8
+	std	r8, 8(rp)
+	addze	r3, r5
+	blr
+
+L(1):	maddld(	r8,  r0, v0, r11)
+	std	r8, 0(rp)
+	maddhdu(r3, r0, v0, r11)
+	blr
+EPILOGUE()
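
The win over the generic aorsmul_1.asm comes from ISA 3.0's fused multiply-add instructions: maddld and maddhdu deliver the low and high halves of the full 128-bit a*b + c, so rp[i] can ride along as the addend and only the inter-iteration carry needs the adde chain. A portable model of the two instructions (hypothetical helpers):

    #include <stdint.h>
    typedef uint64_t limb;

    static inline limb
    maddld_model (limb a, limb b, limb c)    /* low 64 bits of a*b + c  */
    { return (limb) ((unsigned __int128) a * b + c); }

    static inline limb
    maddhdu_model (limb a, limb b, limb c)   /* high 64 bits of a*b + c */
    { return (limb) (((unsigned __int128) a * b + c) >> 64); }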
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm
new file mode 100644
index 0000000..1dd59ea
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/addmul_2.asm
@@ -0,0 +1,182 @@
+dnl  Power9 mpn_addmul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C power9:    1.62
+
+C STATUS
+C  * Not written with any power9 pipeline understanding.
+C  * The 4x unrolling was not motivated by any timing tests.
+C  * No local scheduling for performance tweaking has been done.
+C  * Decrease load scheduling!
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')		C Note: Reused as scratch
+define(`vp', `r6')		C Note: Reused for v1
+
+define(`v0', `r7')
+define(`v1', `r6')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+
+	subfic	r0, r1, 0	C clear CA
+	subfo	r0, r0, r0	C clear OV and r0
+
+	cmpdi	cr7, n, 4
+
+	ld	v0, 0(vp)
+	ld	v1, 8(vp)
+
+	srdi	r10, n, 2
+	mtctr	r10
+
+	rldicl.	r9, n, 0, 63
+	bne	cr0, L(bx1)
+
+L(bx0):	rldicl. r9, n, 63, 63
+
+	ld	r28, 0(rp)
+	ld	r8, 0(up)
+	ld	r11, 8(rp)
+	ld	r9, 8(up)
+	maddld(	r26, r8, v0, r28)
+	maddhdu(r31, r8, v0, r28)
+	blt	cr7, L(2)
+	ld	r28, 16(rp)
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	bne	cr0, L(b10)
+
+L(b00):	addi	up, up, -8
+	addi	rp, rp, -24
+	b	L(lo0)
+
+L(b10):	addi	up, up, 8
+	addi	rp, rp, -8
+	b	L(lo2)
+
+L(2):	addi	rp, rp, -8
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	b	L(cj2)
+
+L(bx1):	rldicl. r9, n, 63, 63
+
+	ld	r29, 0(rp)
+	ld	r9, 0(up)
+	ld	r10, 8(rp)
+	ld	r8, 8(up)
+	maddld(	r27, r9, v0, r29)
+	maddhdu(r30, r9, v0, r29)
+	ld	r29, 16(rp)
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	bne	cr0, L(b11)
+
+L(b01):	addi	rp, rp, -16
+	b	L(lo1)
+L(b11):	addi	up, up, 16
+	blt	cr7, L(end)
+
+L(top):	ld	r9, 0(up)
+	maddld(	r26, r8, v0, r10)	C 0  4   -> adde
+	maddhdu(r31, r8, v0, r10)	C 1  5
+	adde	r0, r27, r0		C    7 11
+	ld	r28, 24(rp)
+	std	r0, 0(rp)
+	maddld(	r5, r8, v1, r29)	C 1  5   -> addex
+	maddhdu(r10, r8, v1, r29)	C 2  6
+	addex(	r0, r12, r30, 0)	C    8 12
+L(lo2):	ld	r8, 8(up)
+	maddld(	r27, r9, v0, r11)	C 1  5   -> adde
+	maddhdu(r30, r9, v0, r11)	C 2  6
+	adde	r0, r26, r0		C    8 12
+	ld	r29, 32(rp)
+	std	r0, 8(rp)
+	maddld(	r12, r9, v1, r28)	C 2  6   -> addex
+	maddhdu(r11, r9, v1, r28)	C 3  7
+	addex(	r0, r5, r31, 0)		C 5  9 13
+L(lo1):	ld	r9, 16(up)
+	maddld(	r26, r8, v0, r10)	C 2  6   -> adde
+	maddhdu(r31, r8, v0, r10)	C 3  7
+	adde	r0, r27, r0		C    5  9 13
+	ld	r28, 40(rp)
+	std	r0, 16(rp)
+	maddld(	r5, r8, v1, r29)	C 3  7   -> addex
+	maddhdu(r10, r8, v1, r29)	C 4  8
+	addex(	r0, r12, r30, 0)	C    6 10
+L(lo0):	ld	r8, 24(up)
+	maddld(	r27, r9, v0, r11)	C 3  7   -> adde
+	maddhdu(r30, r9, v0, r11)	C 4  8
+	adde	r0, r26, r0		C    6 10
+	ld	r29, 48(rp)
+	std	r0, 24(rp)
+	maddld(	r12, r9, v1, r28)	C 4  8   -> addex
+	maddhdu(r11, r9, v1, r28)	C 5  9
+	addex(	r0, r5, r31, 0)		C    7 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	ld	r9, 0(up)
+	maddld(	r26, r8, v0, r10)	C 0  4
+	maddhdu(r31, r8, v0, r10)	C 1  5
+	adde	r0, r27, r0		C    7 11
+	std	r0, 0(rp)		C		-4
+	maddld(	r5, r8, v1, r29)	C 1  5
+	maddhdu(r10, r8, v1, r29)	C 2  6
+	addex(	r0, r12, r30, 0)	C    8 12
+L(cj2):	maddld(	r27, r9, v0, r11)	C 1  5		-2
+	maddhdu(r30, r9, v0, r11)	C 2  6		-1
+	adde	r0, r26, r0		C    8 12	-3
+	std	r0, 8(rp)		C		-3
+	mulld	r12, r9, v1		C 2  6		-1
+	mulhdu	r11, r9, v1		C 3  7		0 = return limb
+	addex(	r0, r5, r31, 0)		C 5  9 13
+	adde	r0, r27, r0		C    5  9 13	-2
+	std	r0, 16(rp)		C		-2
+	addex(	r0, r12, r30, 0)	C    6 10	-1
+	adde	r0, r0, r10		C		-1
+	std	r0, 24(rp)		C		-1
+	li	r4, 0
+	addze	r3, r11
+	addex(	r3, r3, r4, 0)
+
+L(ret):	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm
new file mode 100644
index 0000000..e4ca3a8
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/aorsmul_1.asm
@@ -0,0 +1,179 @@
+dnl  POWER9 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   mpn_addmul_1    mpn_submul_1
+C                   cycles/limb     cycles/limb
+C POWER3/PPC630		 -		 -
+C POWER4/PPC970		 -		 -
+C POWER5		 -		 -
+C POWER6		 -		 -
+C POWER7		 -		 -
+C POWER8		 -		 -
+C POWER9		 2.63		 2.63
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`v0', `r6')
+
+
+ifdef(`OPERATION_addmul_1',`
+  define(`ADDSUBC',	adde)
+  define(`ADDSUB',	addc)
+  define(`func',	mpn_addmul_1)
+  define(`AM',		`$1')
+  define(`SM',		`')
+')
+ifdef(`OPERATION_submul_1',`
+  define(`ADDSUBC',	subfe)
+  define(`ADDSUB',	subfc)
+  define(`func',	mpn_submul_1)
+  define(`AM',		`')
+  define(`SM',		`$1')
+')
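+
+C  The AM/SM m4 macros above select which function this source expands to.
+C  A behavioural sketch of the addmul flavour, assuming 64-bit limbs and a
+C  compiler with unsigned __int128 (illustrative, not GMP's generic code):
+C
+C	mp_limb_t
+C	addmul_1 (mp_limb_t *rp, const mp_limb_t *up, long n, mp_limb_t v0)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0
+C				    + rp[i] + cy;
+C	      rp[i] = (mp_limb_t) t;		/* low limb */
+C	      cy = (mp_limb_t) (t >> 64);	/* carry limb */
+C	    }
+C	  return cy;
+C	}
+C
+C  mpn_submul_1 subtracts the product instead and returns the borrow, which
+C  is why the SM entry code starts with CA = 1 (no pending borrow).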
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	cmpdi	cr7, n, 3
+	srdi	r10, n, 2
+	mtctr	r10
+	rldicl.	r9, n, 0, 63
+	ld	r11, 0(up)
+	bne	cr0, L(bx1)
+
+L(bx0):	rldicl. r9, n, 63, 63
+AM(`	subfzeo	r12, n		')	C ov = 0, ca = 0
+AM(`	li	r12, 0		')
+SM(`	subfco	r12, r12, r12	')	C r12 = 0, ov = 0, ca = 1
+	ld	r9, 8(up)
+	mulld	r0, r11, v0
+	mulhdu	r5, r11, v0
+	blt	cr7, L(2)
+	ld	r8, 16(up)
+	bne	cr0, L(b10)
+
+L(b00):	addi	rp, rp, -24
+	b	L(lo0)
+L(b10):	addi	rp, rp, -8
+	addi	up, up, 16
+	b	L(lo2)
+
+L(2):	addi	rp, rp, -8
+	b	L(cj2)
+
+L(bx1):	rldicl. r9, n, 63, 63
+AM(`	subfzeo	r5, n		')	C ov = 0, ca = 0
+AM(`	li	r5, 0		')
+SM(`	subfco	r5, r5, r5	')	C r5 = 0, ov = 0, ca = 1
+	blt	cr7, L(1)
+	ld	r8, 8(up)
+	mulld	r7, r11, v0
+	mulhdu	r12, r11, v0
+	ld	r9, 16(up)
+	bne	cr0, L(b11)
+
+L(b01):	addi	rp, rp, -16
+	addi	up, up, 8
+	b	L(lo1)
+
+L(1):	mulld	r7, r11, v0
+	mulhdu	r12, r11, v0
+	ld	r11, 0(rp)
+	ADDSUB	r10, r7, r11
+	std	r10, 0(rp)
+AM(`	addze	r3, r12		')
+SM(`	subfe	r0, r0, r0	')
+SM(`	sub	r3, r12, r0	')
+	blr
+
+L(b11):	addi	up, up, 24
+	ble	cr7, L(end)
+
+	ALIGN(16)
+L(top):	ld	r11, 0(rp)
+	mulld	r0, r8, v0
+	addex(	r7, r7, r5, 0)
+	mulhdu	r5, r8, v0
+	ld	r8, 0(up)
+	ADDSUBC	r10, r7, r11
+	std	r10, 0(rp)
+L(lo2):	ld	r11, 8(rp)
+	mulld	r7, r9, v0
+	addex(	r0, r0, r12, 0)
+	mulhdu	r12, r9, v0
+	ld	r9, 8(up)
+	ADDSUBC	r10, r0, r11
+	std	r10, 8(rp)
+L(lo1):	ld	r11, 16(rp)
+	mulld	r0, r8, v0
+	addex(	r7, r7, r5, 0)
+	mulhdu	r5, r8, v0
+	ld	r8, 16(up)
+	ADDSUBC	r10, r7, r11
+	std	r10, 16(rp)
+L(lo0):	ld	r11, 24(rp)
+	mulld	r7, r9, v0
+	addex(	r0, r0, r12, 0)
+	mulhdu	r12, r9, v0
+	ld	r9, 24(up)
+	ADDSUBC	r10, r0, r11
+	std	r10, 24(rp)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	ld	r11, 0(rp)
+	mulld	r0, r8, v0
+	addex(	r7, r7, r5, 0)
+	mulhdu	r5, r8, v0
+	ADDSUBC	r10, r7, r11
+	std	r10, 0(rp)
+L(cj2):	ld	r11, 8(rp)
+	mulld	r7, r9, v0
+	addex(	r0, r0, r12, 0)
+	mulhdu	r12, r9, v0
+	ADDSUBC	r10, r0, r11
+	std	r10, 8(rp)
+	ld	r11, 16(rp)
+	addex(	r7, r7, r5, 0)
+	ADDSUBC	r10, r7, r11
+	std	r10, 16(rp)
+	li	r0, 0
+	addex(	r3, r12, r0, 0)
+AM(`	addze	r3, r3		')
+SM(`	subfe	r0, r0, r0	')
+SM(`	sub	r3, r3, r0	')
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm
new file mode 100644
index 0000000..2dc982d
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/gcd_11.asm
@@ -0,0 +1,64 @@
+dnl  PowerPC-64 mpn_gcd_11.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/bit (approx)
+C POWER3/PPC630		 -
+C POWER4/PPC970		 -
+C POWER5		 -
+C POWER6		 -
+C POWER7		 -
+C POWER8		 -
+C POWER9		 5.75
+C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
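+C
+C  The loop below is one gcd reduction step for two odd operands.  A rough
+C  C equivalent (a sketch, not GMP's portable code):
+C
+C	uint64_t
+C	gcd_11 (uint64_t u, uint64_t v)		/* u, v odd */
+C	{
+C	  while (u != v)
+C	    {
+C	      uint64_t t = u < v ? v - u : u - v; /* |u-v|, even, nonzero */
+C	      if (v > u)
+C		v = u;				  /* v = min(u,v) */
+C	      u = t >> __builtin_ctzll (t);	  /* strip factors of 2 */
+C	    }
+C	  return u;
+C	}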
+
+define(`u0',    `r3')
+define(`v0',    `r4')
+
+define(`cnt',  `r9')dnl
+
+ASM_START()
+PROLOGUE(mpn_gcd_11)
+	b	L(odd)
+
+	ALIGN(16)
+L(top):	isel	v0, u0, v0, 29		C v = min(u,v)
+	isel	u0, r10, r11, 29	C u = |v - u|
+	srd	u0, u0, cnt
+L(odd):	subf	r10, u0, v0		C r10 = v - u
+	subf	r11, v0, u0		C r11 = u - v
+	cmpld	cr7, v0, u0
+	cnttzd	cnt, r10
+	bne	cr7, L(top)
+
+L(end):	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm
new file mode 100644
index 0000000..12d11b0
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/gcd_22.asm
@@ -0,0 +1,143 @@
+dnl  PowerPC-64 mpn_gcd_22 optimised for POWER9.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/bit (approx)
+C POWER3/PPC630		 -
+C POWER4/PPC970		 -
+C POWER5		 -
+C POWER6		 -
+C POWER7		 -
+C POWER8		 -
+C POWER9		 9.58
+
+C We define SLOW if this target uses a slow struct return mechanism, with
+C r3 as an implicit parameter for the struct pointer.
+undefine(`SLOW')dnl
+ifdef(`AIX',`define(`SLOW',`due to AIX')',`
+  ifdef(`DARWIN',,`
+    ifdef(`ELFv2_ABI',,`define(`SLOW',`due to ELFv1')')dnl
+  ')
+')
+
+ifdef(`SLOW',`
+define(`IFSLOW', `$1')
+define(`u1',    `r4')
+define(`u0',    `r5')
+define(`v1',    `r6')
+define(`v0',    `r7')
+',`
+define(`IFSLOW', `')
+define(`u1',    `r3')
+define(`u0',    `r4')
+define(`v1',    `r5')
+define(`v0',    `r6')
+')
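+
+C  In C terms the two register layouts correspond to the same prototype
+C  (a sketch; mp_double_limb_t is GMP's {d0,d1} limb-pair type):
+C
+C	mp_double_limb_t mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0,
+C				     mp_limb_t v1, mp_limb_t v0);
+C
+C  With ELFv2 the pair is returned in r3/r4; with AIX or ELFv1 the caller
+C  passes a result pointer in r3, shifting all argument registers up by
+C  one, which is what the two define blocks above encode.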
+
+define(`tmp',   `r0')
+define(`t0',    `r8')
+define(`t1',    `r9')
+define(`s0',    `r10')
+define(`s1',    `r11')
+define(`cnt',   `r12')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+	cmpld	cr7, v0, u0
+L(top):	subfc	t0, v0, u0		C 0 12
+	beq	cr7, L(lowz)
+	subfe	t1, v1, u1		C 2 14
+	subfe.	tmp, tmp, tmp		C 4	set cr0 from the carry bit
+	subfc	s0, u0, v0		C 0
+	subfe	s1, u1, v1		C 2
+
+L(bck):	cnttzd	cnt, t0			C 2
+	subfic	tmp, cnt, 64		C 4
+
+	isel	v0, v0, u0, 2		C 6	use condition set by subfe
+	isel	u0, t0, s0, 2		C 6
+	isel	v1, v1, u1, 2		C 6
+	isel	u1, t1, s1, 2		C 6
+
+	srd	u0, u0, cnt		C 8
+	sld	tmp, u1, tmp		C 8
+	srd	u1, u1, cnt		C 8
+	or	u0, u0, tmp		C 10
+
+	or.	r0, u1, v1		C 10
+	cmpld	cr7, v0, u0
+	bne	L(top)
+
+
+	b	L(odd)
+	ALIGN(16)
+L(top1):isel	v0, u0, v0, 29		C v = min(u,v)
+	isel	u0, r10, r11, 29	C u = |u - v|
+	srd	u0, u0, cnt
+L(odd):	subf	r10, u0, v0		C r10 = v - u
+	subf	r11, v0, u0		C r11 = u - v
+	cmpld	cr7, v0, u0
+	cnttzd	cnt, r10
+	bne	cr7, L(top1)
+
+ifdef(`SLOW',`
+	std	v0, 0(r3)
+	std	r10, 8(r3)
+',`
+	mr	r3, v0
+	li	r4, 0
+')
+	blr
+
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subfc.	t0, v1, u1		C 2 8
+	beq	L(end)
+	li	t1, 0
+	subfe.	tmp, tmp, tmp		C 4	set cr0 from the carry bit
+	subf	s0, u1, v1		C 2
+	li	s1, 0
+	b	L(bck)
+
+L(end):
+ifdef(`SLOW',`
+	std	v0, 0(r3)
+	std	v1, 8(r3)
+	blr
+',`
+	mr	r3, v0
+	mr	r4, v1
+	blr
+')
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h b/third_party/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h
new file mode 100644
index 0000000..5650def
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/gmp-mparam.h
@@ -0,0 +1,253 @@
+/* POWER9 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2200MHz POWER9 */
+/* FFT tuning limit = 221,245,838 */
+/* Generated by tuneup.c, 2019-10-29, gcc 8.3 */
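+
+/* These thresholds set the crossover points between algorithms.  As an
+   illustration of the pattern (a sketch, not GMP's exact dispatch code),
+   multiplication selection looks roughly like
+
+	if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+	  mpn_mul_basecase (pp, ap, n, bp, n);
+	else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+	  mpn_toom22_mul (pp, ap, n, bp, n, scratch);
+	else
+	  ...			-- Toom-3 and up, eventually FFT
+
+   so MUL_TOOM22_THRESHOLD = 34 below means operands shorter than 34 limbs
+   use the assembly basecase on this CPU.  */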
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         7
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        44
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_1N_PI1_METHOD                 2  /* 19.28% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD            2
+#define DIV_QR_2_PI2_THRESHOLD               7
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           33
+
+#define DIV_1_VS_MUL_1_PERCENT             365
+
+#define MUL_TOOM22_THRESHOLD                34
+#define MUL_TOOM33_THRESHOLD               109
+#define MUL_TOOM44_THRESHOLD               458
+#define MUL_TOOM6H_THRESHOLD               517
+#define MUL_TOOM8H_THRESHOLD               608
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     113
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     292
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     204
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     211
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     178
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                158
+#define SQR_TOOM4_THRESHOLD                674
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                898
+
+#define MULMID_TOOM42_THRESHOLD             70
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               25
+
+#define MUL_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    404, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     27, 7}, {     14, 6}, \
+    {     29, 7}, {     15, 6}, {     31, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     35, 8}, {     71, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    159,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,12}, {     95,11}, {    191,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    303,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,11}, {    367,10}, \
+    {    735,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,12}, {    223,11}, {    447,10}, \
+    {    895,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,12}, {    287,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    639,10}, {   1279,11}, {    671,12}, {    351,11}, \
+    {    703,10}, {   1407,11}, {    735,13}, {    191,12}, \
+    {    383,11}, {    767,10}, {   1535,11}, {    799,12}, \
+    {    415,11}, {    831,10}, {   1663,11}, {    863,12}, \
+    {    447,11}, {    895,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    543,11}, \
+    {   1087,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    639,11}, {   1279,12}, {    671,11}, \
+    {   1343,12}, {    703,11}, {   1407,12}, {    735,11}, \
+    {   1471,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    799,11}, {   1599,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2175,13}, \
+    {    575,12}, {   1215,13}, {    639,12}, {   1343,13}, \
+    {    703,12}, {   1471,14}, {    383,13}, {    767,12}, \
+    {   1599,13}, {    831,12}, {   1727,13}, {    895,11}, \
+    {   3583,12}, {   1919,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,14}, {    767,13}, \
+    {   1599,12}, {   3199,13}, {   1727,14}, {    895,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2943,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,12}, \
+    {   6911,14}, {   1919,16}, {    511,15}, {   1023,14}, \
+    {   2175,13}, {   4479,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,15}, {   1535,14}, \
+    {   3455,13}, {   6911,15}, {   1791,14}, {   3839,13}, \
+    {   7679,16}, {   1023,15}, {   2047,14}, {   4351,15}, \
+    {   2303,14}, {   4863,15}, {   2815,14}, {   5887,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,17}, {   1023,16}, {   2047,15}, {   4351,14}, \
+    {   8959,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 243
+#define MUL_FFT_THRESHOLD                 3712
+
+#define SQR_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    404, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     29, 7}, {     15, 6}, {     31, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135,11}, {     79,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,12}, \
+    {     95,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,11}, {    367,10}, {    735,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,12}, {    223,11}, \
+    {    447,10}, {    895,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,12}, {    287,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    671,12}, {    351,11}, {    703,10}, {   1407,11}, \
+    {    735,13}, {    191,12}, {    383,11}, {    767,10}, \
+    {   1535,12}, {    415,11}, {    831,12}, {    447,11}, \
+    {    895,12}, {    479,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    671,11}, {   1343,12}, \
+    {    703,11}, {   1407,12}, {    735,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    799,11}, {   1599,12}, \
+    {    831,13}, {    447,12}, {    895,11}, {   1791,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,13}, {    575,12}, {   1215,13}, \
+    {    639,12}, {   1343,13}, {    703,12}, {   1407,14}, \
+    {    383,13}, {    767,12}, {   1599,13}, {    831,12}, \
+    {   1727,13}, {    895,12}, {   1791,13}, {    959,15}, \
+    {    255,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,14}, {    767,13}, \
+    {   1599,12}, {   3199,13}, {   1727,14}, {    895,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2815,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4479,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2943,13}, {   5887,15}, {   1535,14}, {   3455,13}, \
+    {   6911,15}, {   1791,14}, {   3839,16}, {   1023,15}, \
+    {   2047,14}, {   4479,15}, {   2303,14}, {   4863,15}, \
+    {   2559,14}, {   5119,15}, {   2815,14}, {   5887,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,17}, \
+    {   1023,16}, {   2047,15}, {   4351,14}, {   8959,15}, \
+    {   4863,16}, {   2559,15}, {   5887,14}, {  11775,16}, \
+    {   3071,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 230
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  39
+#define MULLO_MUL_N_THRESHOLD             7246
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                  40
+#define SQRLO_SQR_THRESHOLD               6440
+
+#define DC_DIV_QR_THRESHOLD                 30
+#define DC_DIVAPPR_Q_THRESHOLD              88
+#define DC_BDIV_QR_THRESHOLD                35
+#define DC_BDIV_Q_THRESHOLD                 62
+
+#define INV_MULMOD_BNM1_THRESHOLD           79
+#define INV_NEWTON_THRESHOLD                11
+#define INV_APPR_THRESHOLD                  11
+
+#define BINV_NEWTON_THRESHOLD              264
+#define REDC_1_TO_REDC_2_THRESHOLD           8
+#define REDC_2_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1470
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1652
+
+#define POWM_SEC_TABLE  1,16,151,839
+
+#define GET_STR_DC_THRESHOLD                 7
+#define GET_STR_PRECOMPUTE_THRESHOLD        15
+#define SET_STR_DC_THRESHOLD               406
+#define SET_STR_PRECOMPUTE_THRESHOLD       885
+
+#define FAC_DSC_THRESHOLD                  179
+#define FAC_ODD_THRESHOLD                   53
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 9.10% faster than 3 */
+#define HGCD_THRESHOLD                      45
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   321
+#define GCDEXT_DC_THRESHOLD                258
+#define JACOBI_BASE_METHOD                   4  /* 15.45% faster than 1 */
+
+/* Tuneup completed successfully, took 179422 seconds */
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/mul_1.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/mul_1.asm
new file mode 100644
index 0000000..363f095
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/mul_1.asm
@@ -0,0 +1,126 @@
+dnl  Power9 mpn_mul_1.
+
+dnl  Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 ?
+C POWER6		 ?
+C POWER7		 ?
+C POWER8		 ?
+C POWER9		 2.47
+
+C TODO
+C  * Schedule for Power9 pipeline.
+C  * Unroll 4x if that proves beneficial.
+C  * This is marginally faster (but much smaller) than ../mul_1.asm.
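+C
+C  mpn_mul_1c takes an extra carry-in limb (arriving in r7) added at the
+C  bottom; mpn_mul_1 zeroes r7 and falls through to the same code.  A
+C  behavioural sketch assuming 64-bit limbs and unsigned __int128
+C  (illustrative, not GMP's generic code):
+C
+C	mp_limb_t
+C	mul_1c (mp_limb_t *rp, const mp_limb_t *up, long n,
+C		mp_limb_t v0, mp_limb_t cy)	/* cy = 0 gives mul_1 */
+C	{
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
+C	      rp[i] = (mp_limb_t) t;
+C	      cy = (mp_limb_t) (t >> 64);
+C	    }
+C	  return cy;
+C	}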
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	li	r7, 0
+L(ent):	ld	r11, 0(up)
+	cmpdi	cr6, n, 2
+	addi	r0, n, -1	C FIXME: postpone
+	srdi	r0, r0, 1	C FIXME: postpone
+	mtctr	r0		C FIXME: postpone
+	rldicl.	r12, n, 0,63	C r12 = n & 1, set cr0
+	bne	cr0, L(b1)
+
+L(b0):	ld	r0, 8(up)
+	maddld(	r9, r11, v0, r7)
+	maddhdu(r7, r11, v0, r7)
+	ble	cr6, L(2)
+	ld	r12, 16(up)
+	mulld	r8, r0, v0
+	mulhdu	r5, r0, v0
+	addic	up, up, 16
+	addi	rp, rp, -8
+	b	L(mid)
+
+L(b1):	ld	r0, 0(up)
+	ble	cr6, L(1)
+	ld	r12, 8(up)
+	maddld(	r8, r11, v0, r7)
+	maddhdu(r5, r11, v0, r7)
+	ld	r0, 16(up)
+	mulld	r9, r12, v0
+	mulhdu	r7, r12, v0
+	addic	up, up, 24
+	bdz	L(end)
+
+	ALIGN(16)
+L(top):	ld	r12, 0(up)
+	std	r8, 0(rp)
+	adde	r9, r5, r9
+	mulld	r8, r0, v0
+	mulhdu	r5, r0, v0
+L(mid):	ld	r0, 8(up)
+	std	r9, 8(rp)
+	adde	r8, r7, r8
+	mulld	r9, r12, v0
+	mulhdu	r7, r12, v0
+	addi	rp, rp, 16
+	addi	up, up, 16
+	bdnz	L(top)
+
+L(end):	std	r8, 0(rp)
+	mulld	r8, r0, v0
+	adde	r9, r5, r9
+	mulhdu	r5, r0, v0
+	std	r9, 8(rp)
+	adde	r8, r7, r8
+	std	r8, 16(rp)
+	addze	r3, r5
+	blr
+
+L(2):	mulld	r8, r0, v0
+	mulhdu	r5, r0, v0
+	std	r9, 0(rp)
+	addc	r8, r7, r8
+	std	r8, 8(rp)
+	addze	r3, r5
+	blr
+
+L(1):	maddld(	r8,  r0, v0, r7)
+	std	r8, 0(rp)
+	maddhdu(r3, r0, v0, r7)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/mul_2.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/mul_2.asm
new file mode 100644
index 0000000..632b6cb
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/mul_2.asm
@@ -0,0 +1,170 @@
+dnl  Power9 mpn_mul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C power9:    1.58
+
+C STATUS
+C  * Not written with any power9 pipeline understanding.
+C  * The 4x unrolling was not motivated by any timing tests.
+C  * No local scheduling for performance tweaking has been done.
+C  * Decrease load scheduling, i.e., load fewer limbs ahead to free registers.
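+C
+C  mpn_mul_2 forms {up,n} * {vp,2}, writes the n+1 low limbs of the product
+C  to {rp,n+1} and returns the top limb.  Reference sketch in terms of the
+C  1-limb primitives (plausible semantics, not this implementation):
+C
+C	mp_limb_t
+C	mpn_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);
+C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);
+C	}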
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')		C Note: Reused as scratch
+define(`vp', `r6')		C Note: Reused for v1
+
+define(`v0', `r7')
+define(`v1', `r6')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+
+	subfic	r0, n, 0	C clear CA
+	subfo	r0, r0, r0	C clear OV and r0
+
+	cmpdi	cr7, n, 4
+
+	ld	v0, 0(vp)
+	ld	v1, 8(vp)
+
+	srdi	r10, n, 2
+	mtctr	r10
+
+	rldicl.	r9, n, 0, 63
+	bne	cr0, L(bx1)
+
+L(bx0):	rldicl. r9, n, 63, 63
+
+	ld	r8, 0(up)
+	ld	r9, 8(up)
+	li	r11, 0
+	mulld	r28, r8, v0
+	mulhdu	r31, r8, v0
+	blt	cr7, L(2)
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	bne	cr0, L(b10)
+
+L(b00):	addi	up, up, -8
+	addi	rp, rp, -24
+	b	L(lo0)
+
+L(b10):	addi	up, up, 8
+	addi	rp, rp, -8
+	b	L(lo2)
+
+L(2):	addi	rp, rp, -8
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	b	L(cj2)
+
+L(bx1):	rldicl. r9, n, 63, 63
+
+	ld	r9, 0(up)
+	ld	r8, 8(up)
+	li	r10, 0
+	mulld	r29, r9, v0
+	mulhdu	r30, r9, v0
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	bne	cr0, L(b11)
+
+L(b01):	addi	rp, rp, -16
+	b	L(lo1)
+L(b11):	addi	up, up, 16
+	blt	cr7, L(end)
+
+L(top):	ld	r9, 0(up)
+	maddld(	r28, r8, v0, r10)	C 0  4   -> adde
+	maddhdu(r31, r8, v0, r10)	C 1  5
+	adde	r0, r29, r0		C    7 11
+	std	r0, 0(rp)
+	mulld	r5, r8, v1		C 1  5   -> addex
+	mulhdu	r10, r8, v1		C 2  6
+	addex(	r0, r12, r30, 0)	C    8 12
+L(lo2):	ld	r8, 8(up)
+	maddld(	r29, r9, v0, r11)	C 1  5   -> adde
+	maddhdu(r30, r9, v0, r11)	C 2  6
+	adde	r0, r28, r0		C    8 12
+	std	r0, 8(rp)
+	mulld	r12, r9, v1		C 2  6   -> addex
+	mulhdu	r11, r9, v1		C 3  7
+	addex(	r0, r5, r31, 0)		C 5  9 13
+L(lo1):	ld	r9, 16(up)
+	maddld(	r28, r8, v0, r10)	C 2  6   -> adde
+	maddhdu(r31, r8, v0, r10)	C 3  7
+	adde	r0, r29, r0		C    5  9 13
+	std	r0, 16(rp)
+	mulld	r5, r8, v1		C 3  7   -> addex
+	mulhdu	r10, r8, v1		C 4  8
+	addex(	r0, r12, r30, 0)	C    6 10
+L(lo0):	ld	r8, 24(up)
+	maddld(	r29, r9, v0, r11)	C 3  7   -> adde
+	maddhdu(r30, r9, v0, r11)	C 4  8
+	adde	r0, r28, r0		C    6 10
+	std	r0, 24(rp)
+	mulld	r12, r9, v1		C 4  8   -> addex
+	mulhdu	r11, r9, v1		C 5  9
+	addex(	r0, r5, r31, 0)		C    7 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	ld	r9, 0(up)
+	maddld(	r28, r8, v0, r10)	C 0  4
+	maddhdu(r31, r8, v0, r10)	C 1  5
+	adde	r0, r29, r0		C    7 11
+	std	r0, 0(rp)		C		-4
+	mulld	r5, r8, v1		C 1  5
+	mulhdu	r10, r8, v1		C 2  6
+	addex(	r0, r12, r30, 0)	C    8 12
+L(cj2):	maddld(	r29, r9, v0, r11)	C 1  5		-2
+	maddhdu(r30, r9, v0, r11)	C 2  6		-1
+	adde	r0, r28, r0		C    8 12	-3
+	std	r0, 8(rp)		C		-3
+	mulld	r12, r9, v1		C 2  6		-1
+	mulhdu	r11, r9, v1		C 3  7		0 = return limb
+	addex(	r0, r5, r31, 0)		C 5  9 13
+	adde	r0, r29, r0		C    5  9 13	-2
+	std	r0, 16(rp)		C		-2
+	addex(	r0, r12, r30, 0)	C    6 10	-1
+	adde	r0, r0, r10		C		-1
+	std	r0, 24(rp)		C		-1
+	li	r4, 0
+	addze	r3, r11
+	addex(	r3, r3, r4, 0)
+
+L(ret):	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm
new file mode 100644
index 0000000..8f3d322
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/mul_basecase.asm
@@ -0,0 +1,415 @@
+dnl  Power9 mpn_mul_basecase.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          -
+C POWER4/PPC970          -
+C POWER5                 -
+C POWER6                 -
+C POWER7                 -
+C POWER8                 -
+C POWER9                 1.62
+
+C TODO
+C  * Check if (inner) loop alignment affects performance.
+C  * Could we schedule loads less aggressively in addmul_2/mul_2? That would
+C    save some registers and make the tail code more manageable.
+C  * Postpone some register saves to the main loop.
+C  * Perhaps write more small-operand (3x1, 3x2, 3x3) code.
+C  * Consider restoring rp,up after the loop using arithmetic, eliminating
+C    rp2, up2. On the other hand, the current rp,up restore registers are
+C    useful for OSP.
+C  * Do OSP. This should save a lot with the current deep addmul_2 pipeline.
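+C
+C  Structure: the first pass over {up,un} uses mul_1 or mul_2 depending on
+C  whether vn is odd or even; each further pass folds in two more v limbs
+C  with the addmul_2 loop.  Schematically (a sketch reusing the mpn_*
+C  semantics, not real prototypes):
+C
+C	mul_basecase (rp, up, un, vp, vn)	/* un >= vn >= 1 */
+C	{
+C	  long i;
+C	  if (vn & 1)
+C	    { rp[un] = mpn_mul_1 (rp, up, un, vp[0]); i = 1; }
+C	  else
+C	    { rp[un+1] = mpn_mul_2 (rp, up, un, vp); i = 2; }
+C	  for (; i < vn; i += 2)
+C	    rp[un+i+1] = mpn_addmul_2 (rp + i, up, un, vp + i);
+C	}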
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+define(`vp', `r6')
+define(`vn', `r7')
+
+define(`v0', `r0')
+define(`v1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+	cmpdi	cr0, un, 2
+	bgt	cr0, L(un_gt2)
+	cmpdi	cr6, vn, 1
+	ld	r7, 0(vp)
+	ld	r5, 0(up)
+	mulld	r8, r5, r7	C weight 0
+	mulhdu	r9, r5, r7	C weight 1
+	std	r8, 0(rp)
+	beq	cr0, L(2x)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+L(2x):	ld	r0, 8(up)
+	mulld	r8, r0, r7	C weight 1
+	mulhdu	r10, r0, r7	C weight 2
+	addc	r9, r9, r8
+	addze	r10, r10
+	bne	cr6, L(2x2)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	blr
+	ALIGN(16)
+L(2x2):	ld	r6, 8(vp)
+	mulld	r8, r5, r6	C weight 1
+	mulhdu	r11, r5, r6	C weight 2
+	addc	r9, r9, r8
+	std	r9, 8(rp)
+	adde	r11, r11, r10
+	mulld	r12, r0, r6	C weight 2
+	mulhdu	r0, r0, r6	C weight 3
+	addze	r0, r0
+	addc	r11, r11, r12
+	addze	r0, r0
+	std	r11, 16(rp)
+	std	r0, 24(rp)
+	blr
+
+L(un_gt2):
+	std	r22, -80(r1)
+	std	r23, -72(r1)
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	mr	rp2, r3			C rp
+	mr	up2, r4			C up
+	srdi	r22, r5, 2		C un
+	subfic	r23, r7, 0		C -vn, clear CA
+	subfo	r0, r0, r0		C clear OV (and r0)
+
+	cmpdi	cr6, un, 3
+	rldicl	r0, un, 0, 63		C r0 = un & 1
+	cmpdi	cr7, r0, 0
+	rldicl	r0, un, 63, 63		C FIXME: unused for vn = 1
+	cmpdi	cr5, r0, 0		C FIXME: unused for vn = 1
+
+	ld	v0, 0(vp)
+	rldicl.	r9, vn, 0, 63
+	beq	cr0, L(vn_evn)
+
+L(vn_odd):
+	addi	r10, un, -2
+	ld	r5, 0(up)
+	srdi	r10, r10, 1
+	mtctr	r10
+	bne	cr7, L(m1_b1)
+
+L(m1_b0):
+	ld	r10, 8(up)
+	mulld	r9, r5, v0
+	mulhdu	r11, r5, v0
+	ld	r12, 16(up)
+	mulld	r8, r10, v0
+	mulhdu	r5, r10, v0
+	addi	rp, rp, -8
+	b	L(m1_mid)
+
+L(m1_b1):
+	ld	r12, 8(up)
+	mulld	r8, r5, v0
+	mulhdu	r5, r5, v0
+	ld	r10, 16(up)
+	mulld	r9, r12, v0
+	mulhdu	r11, r12, v0
+	addi	up, up, 8
+	beq	cr6, L(m1_end)		C jump taken means un = 3, vn = {1,3}
+
+	ALIGN(16)
+L(m1_top):
+	ld	r12, 16(up)
+	std	r8, 0(rp)
+	adde	r9, r5, r9
+	mulld	r8, r10, v0
+	mulhdu	r5, r10, v0
+L(m1_mid):
+	ld	r10, 24(up)
+	std	r9, 8(rp)
+	adde	r8, r11, r8
+	mulld	r9, r12, v0
+	mulhdu	r11, r12, v0
+	addi	rp, rp, 16
+	addi	up, up, 16
+	bdnz	L(m1_top)
+
+L(m1_end):
+	std	r8, 0(rp)
+	mulld	r8, r10, v0
+	adde	r9, r5, r9
+	mulhdu	r5, r10, v0
+	std	r9, 8(rp)
+	adde	r8, r11, r8
+	std	r8, 16(rp)
+	addze	r10, r5
+	std	r10, 24(rp)
+
+	addi	rp2, rp2, 8
+	addi	vp, vp, 8
+	addic.	r23, r23, 1
+	b	L(do_outer)
+
+L(vn_evn):
+	ld	v1, 8(vp)
+	addi	r23, r23, 2
+	mtctr	r22
+	bne	cr7, L(m2_bx1)
+
+L(m2_bx0):
+	ld	r8, 0(up)
+	ld	r9, 8(up)
+	li	r11, 0
+	mulld	r28, r8, v0
+	mulhdu	r31, r8, v0
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	li	r12, 0
+	bne	cr5, L(m2_b10)
+
+L(m2_b00):
+	addi	up, up, -8
+	addi	rp, rp, -24
+	b	L(m2_lo0)
+
+L(m2_b10):
+	addi	up, up, 8
+	addi	rp, rp, -8
+	b	L(m2_lo2)
+
+L(m2_bx1):
+	ld	r9, 0(up)
+	ld	r8, 8(up)
+	li	r10, 0
+	mulld	r29, r9, v0
+	mulhdu	r30, r9, v0
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	li	r5, 0
+	bne	cr5, L(m2_b11)
+
+L(m2_b01):
+	addi	rp, rp, -16
+	b	L(m2_lo1)
+L(m2_b11):
+	addi	up, up, 16
+	beq	cr6, L(m2_end)		C taken means un = 3, vn = 2. We're done.
+
+L(m2_top):
+	ld	r9, 0(up)
+	maddld(	r28, r8, v0, r10)
+	maddhdu(r31, r8, v0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	addex(	r12, r12, r30, 0)
+L(m2_lo2):
+	ld	r8, 8(up)
+	maddld(	r29, r9, v0, r11)
+	maddhdu(r30, r9, v0, r11)
+	adde	r12, r28, r12
+	std	r12, 8(rp)
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	addex(	r5, r5, r31, 0)
+L(m2_lo1):
+	ld	r9, 16(up)
+	maddld(	r28, r8, v0, r10)
+	maddhdu(r31, r8, v0, r10)
+	adde	r5, r29, r5
+	std	r5, 16(rp)
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	addex(	r12, r12, r30, 0)
+L(m2_lo0):
+	ld	r8, 24(up)
+	maddld(	r29, r9, v0, r11)
+	maddhdu(r30, r9, v0, r11)
+	adde	r12, r28, r12
+	std	r12, 24(rp)
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	addex(	r5, r5, r31, 0)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(m2_top)
+
+L(m2_end):
+	ld	r9, 0(up)
+	maddld(	r28, r8, v0, r10)
+	maddhdu(r31, r8, v0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	b	L(cj)
+
+L(outer):
+	ld	v0, 0(vp)
+	ld	v1, 8(vp)
+	addi	r23, r23, 2
+	mtctr	r22
+	bne	cr7, L(bx1)
+
+L(bx0):	ld	r26, 0(rp2)
+	ld	r8, 0(up2)
+	ld	r11, 8(rp2)
+	ld	r9, 8(up2)
+	maddld(	r28, r8, v0, r26)
+	maddhdu(r31, r8, v0, r26)
+	ld	r26, 16(rp2)
+	mulld	r5, r8, v1
+	mulhdu	r10, r8, v1
+	li	r12, 0
+	bne	cr5, L(b10)
+
+L(b00):	addi	up, up2, -8
+	addi	rp, rp2, -24
+	b	L(lo0)
+
+L(b10):	addi	up, up2, 8
+	addi	rp, rp2, -8
+	b	L(lo2)
+
+L(bx1):	ld	r27, 0(rp2)
+	ld	r9, 0(up2)
+	ld	r10, 8(rp2)
+	ld	r8, 8(up2)
+	maddld(	r29, r9, v0, r27)
+	maddhdu(r30, r9, v0, r27)
+	ld	r27, 16(rp2)
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	li	r5, 0
+	bne	cr5, L(b11)
+
+L(b01):	addi	up, up2, 0
+	addi	rp, rp2, -16
+	b	L(lo1)
+L(b11):	addi	up, up2, 16
+	addi	rp, rp2, 0
+	beq	cr6, L(end)		C taken means un = 3, vn = 3. We're done.
+
+L(top):	ld	r9, 0(up)
+	maddld(	r28, r8, v0, r10)
+	maddhdu(r31, r8, v0, r10)
+	adde	r5, r29, r5
+	ld	r26, 24(rp)
+	std	r5, 0(rp)
+	maddld(	r5, r8, v1, r27)
+	maddhdu(r10, r8, v1, r27)
+	addex(	r12, r12, r30, 0)
+L(lo2):	ld	r8, 8(up)
+	maddld(	r29, r9, v0, r11)
+	maddhdu(r30, r9, v0, r11)
+	adde	r12, r28, r12
+	ld	r27, 32(rp)
+	std	r12, 8(rp)
+	maddld(	r12, r9, v1, r26)
+	maddhdu(r11, r9, v1, r26)
+	addex(	r5, r5, r31, 0)
+L(lo1):	ld	r9, 16(up)
+	maddld(	r28, r8, v0, r10)
+	maddhdu(r31, r8, v0, r10)
+	adde	r5, r29, r5
+	ld	r26, 40(rp)
+	std	r5, 16(rp)
+	maddld(	r5, r8, v1, r27)
+	maddhdu(r10, r8, v1, r27)
+	addex(	r12, r12, r30, 0)
+L(lo0):	ld	r8, 24(up)
+	maddld(	r29, r9, v0, r11)
+	maddhdu(r30, r9, v0, r11)
+	adde	r12, r28, r12
+	ld	r27, 48(rp)
+	std	r12, 24(rp)
+	maddld(	r12, r9, v1, r26)
+	maddhdu(r11, r9, v1, r26)
+	addex(	r5, r5, r31, 0)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	ld	r9, 0(up)
+	maddld(	r28, r8, v0, r10)
+	maddhdu(r31, r8, v0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	maddld(	r5, r8, v1, r27)
+	maddhdu(r10, r8, v1, r27)
+L(cj):	addex(	r12, r12, r30, 0)
+	maddld(	r29, r9, v0, r11)
+	maddhdu(r30, r9, v0, r11)
+	adde	r12, r28, r12
+	std	r12, 8(rp)
+	mulld	r12, r9, v1
+	mulhdu	r11, r9, v1
+	addex(	r5, r5, r31, 0)
+	adde	r5, r29, r5
+	std	r5, 16(rp)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10
+	std	r12, 24(rp)
+	li	r4, 0
+	addze	r5, r11
+	addex(	r5, r5, r4, 0)
+	std	r5, 32(rp)
+
+	cmpdi	cr0, r23, 0
+	addi	rp2, rp2, 16
+	addi	vp, vp, 16
+L(do_outer):
+	bne	cr0, L(outer)
+L(ret):
+	ld	r22, -80(r1)
+	ld	r23, -72(r1)
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm b/third_party/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm
new file mode 100644
index 0000000..2d4fa63
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/p9/sqr_basecase.asm
@@ -0,0 +1,555 @@
+dnl  Power9 mpn_sqr_basecase.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          -
+C POWER4/PPC970          -
+C POWER5                 -
+C POWER6                 -
+C POWER7                 -
+C POWER8                 -
+C POWER9                 1.62
+
+C TODO
+C  * Completely separate even and odd code into two outer loops. Also consider
+C    unrolling these two outer loops and thereby eliminating all branches.
+C  * Avoid the reloading of u1 before every loop start.
+C  * Reduce register usage.
+C  * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde.
+C  * Consider skewing conditional adjustments to allow mask creation with subfe
+C    like in the un=3 code. It might streamline the adjustments (or not).
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+
+define(`u0', `r0')
+define(`u1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+define(`cy',  `r6')
+
+define(`LSHU1U0',`
+	addc	u0, u0, u0
+	adde	u1, u1, u1
+	li	cy, 0
+	addze	cy, cy
+')
+define(`LSHU1U',`
+	addc	u0, u0, u0
+	add	u0, u0, cy
+	adde	u1, u1, u1
+	li	cy, 0
+	addze	cy, cy
+')
+define(`LSHU1UF',`
+	addc	u0, u0, u0
+	add	u0, u0, cy
+	adde	u1, u1, u1
+')
+define(`LSHU1UHF',`
+	add	u0, u0, u0
+	add	u0, u0, cy
+')
+C These are cleverer replacements, but they tend to leave CA set, disturbing
+C the main accumulation code! Breaking that false dependency might have a
+C positive performance impact. Note that the subfe here results in a mask
+C for our adjustments: subfe rx,rx,rx leaves 0 when CA is set and all ones
+C when it is clear.
+define(`xLSHU1U0',`
+	addc	u0, u0, u0
+	adde	u1, u1, u1
+	subfe	cy, cy, cy
+')
+define(`xLSHU1U',`
+	subfic	cy, cy, 0
+	adde	u0, u0, u0
+	adde	u1, u1, u1
+	subfe	cy, cy, cy
+')
+define(`xLSHU1UHF',`
+	subfic	cy, cy, 0
+	adde	u0, u0, u0
+')
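+
+C  Why the doubling: squaring uses the identity
+C
+C	(sum u_i B^i)^2  =  sum u_i^2 B^(2i)  +  2 sum_{i<j} u_i u_j B^(i+j)
+C
+C  The LSHU1* macros form 2*{u1,u0} on the fly, with cy catching the bit
+C  shifted out, so the cross products enter the accumulation already
+C  doubled while the diagonal u_i^2 terms are added undoubled.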
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	ld	r0, 0(up)	C n = 1
+	mulld	r8, r0, r0	C weight 0
+	mulhdu	r9, r0, r0	C weight 1
+	std	r8, 0(rp)
+	cmpdi	cr0, un, 2
+	bge	cr0, L(ge2)
+	std	r9, 8(rp)
+	blr
+
+L(ge2):	bgt	cr0, L(gt2)
+	ld	r6, 8(up)
+	mulld	r10, r6, r6	C u1 * u1
+	mulhdu	r11, r6, r6	C u1 * u1
+	mulld	r4, r6, r0	C u1 * u0
+	mulhdu	r5, r6, r0	C u1 * u0
+	addc	r4, r4, r4
+	adde	r5, r5, r5
+	addze	r11, r11
+	addc	r9, r9, r4
+	adde	r10, r10, r5
+	addze	r11, r11
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	std	r11, 24(rp)
+	blr
+
+L(gt2):	cmpdi	cr0, un, 3
+	bgt	cr0, L(gt3)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+	subfo	r12, r12, r12		C clear OV (and result register)
+	ld	r8, 8(r4)
+	mulld	r5, r8, r8		C W2
+	mulhdu	r10, r8, r8		C W3
+	sradi	r11, u0, 63		C CAUTION: clobbers CA
+	and	r11, r11, r8		C W3
+	addc	u0, u0, u0
+	adde	u1, r8, r8
+	subfe	r6, r6, r6		C	mask
+	ld	r4, 16(r4)		C W2
+	mulld	r12, r8, u0		C W1	u1 x u0
+	mulhdu	r8, r8, u0		C W2	u1 x u0
+	maddld(	r31, r4, u0, r11)	C W2
+	maddhdu(r30, r4, u0, r11)	C W3
+	andc	r6, r4, r6		C W4
+	addc	r9, r12, r9		C W1
+	std	r9, 8(rp)		C W1
+	mulld	r9, r4, u1		C W3
+	mulhdu	r11, r4, u1		C W4
+	addex(	r5, r5, r8, 0)		C W2
+	adde	r5, r31, r5		C W2
+	std	r5, 16(rp)		C W2
+	maddld(	r5, r4, r4, r6)		C W4	u2^2
+	maddhdu(r6, r4, r4, r6)		C W5	u2^2
+	addex(	r9, r9, r30, 0)		C W3
+	adde	r9, r9, r10		C W3
+	std	r9, 24(rp)		C W3
+	adde	r5, r5, r11		C W4
+	addze	r6, r6			C W5
+	li	r8, 0
+	addex(	r5, r5, r8, 0)		C W4
+	std	r5, 32(rp)		C W4
+	addex(	r6, r6, r8, 0)		C W5
+	std	r6, 40(rp)		C W5
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+
+L(gt3):	std	r22, -80(r1)
+	std	r23, -72(r1)
+	std	r24, -64(r1)
+	std	r25, -56(r1)
+	std	r26, -48(r1)
+	std	r27, -40(r1)
+	std	r28, -32(r1)
+	std	r29, -24(r1)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+
+	mr	rp2, rp
+	mr	up2, up
+	addi	r22, un, -1		C count for loop FIXME: Adjust
+	subfo	r0, r0, r0		C clear OV (and r0)
+	rldicl	r0, un, 0, 63		C r0 = un & 1
+	cmpdi	cr7, r0, 0
+
+	ld	u0, 0(up2)
+	ld	u1, 8(up2)
+
+	cmpdi	cr5, r22, 4
+	srdi	r31, r22, 2
+	addi	r22, r22, -2
+	mtctr	r31
+
+	beq	cr7, L(m2_evn)
+L(m2_odd):
+	rldicl.	r31, r22, 63, 63	C r22 & 2
+	mulld	r23, u0, u0
+	mulhdu	r12, u0, u0
+	mulld	r5, u1, u1
+	mulhdu	r10, u1, u1
+
+	sradi	r11, u0, 63
+	and	r11, r11, u1
+
+	LSHU1U0
+
+	ld	r8, 8(up2)
+	ld	r9, 16(up2)
+	mulld	r28, r8, u0		C W	u1 x u0
+	mulhdu	r31, r8, u0		C W	u1 x u0
+	std	r23, 0(rp2)
+
+	bne	cr0, L(m2_11)
+L(m2_01):
+	addi	up, up2, 16
+	addi	rp, rp2, 0
+	b	L(m2_lo2)
+L(m2_11):
+	addi	up, up2, 0
+	addi	rp, rp2, -16
+	b	L(m2_lo0)
+
+L(m2_evn):
+	rldicl.	r31, r22, 63, 63	C r22 & 2
+	mulld	r23, u0, u0
+	mulhdu	r5, u0, u0
+	mulld	r12, u1, u1
+	mulhdu	r11, u1, u1
+
+	sradi	r10, u0, 63
+	and	r10, r10, u1
+
+	LSHU1U0
+
+	ld	r9, 8(up2)
+	ld	r8, 16(up2)
+	mulld	r29, r9, u0		C W	u1 x u0
+	mulhdu	r30, r9, u0		C W	u1 x u0
+	std	r23, 0(rp2)
+
+	beq	cr0, L(m2_10)
+L(m2_00):
+	addi	up, up2, 8
+	addi	rp, rp2, -8
+	b	L(m2_lo1)
+L(m2_10):
+	addi	up, up2, 24
+	addi	rp, rp2, 8
+	ble	cr5, L(m2_end)
+
+L(m2_top):
+	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	mulld	r5, r8, u1
+	mulhdu	r10, r8, u1
+	addex(	r12, r12, r30, 0)
+L(m2_lo2):
+	ld	r8, 8(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 8(rp)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+L(m2_lo1):
+	ld	r9, 16(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 16(rp)
+	mulld	r5, r8, u1
+	mulhdu	r10, r8, u1
+	addex(	r12, r12, r30, 0)
+L(m2_lo0):
+	ld	r8, 24(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 24(rp)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(m2_top)
+
+L(m2_end):
+	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	mulld	r5, r8, u1
+	mulhdu	r10, r8, u1
+	b	L(cj)			C jump to addmul_2 tail
+
+L(outer):
+	addi	up2, up2, 16
+	addi	rp2, rp2, 32
+
+	ld	u0, 0(up2)
+	ld	u1, 8(up2)
+
+	cmpdi	cr5, r22, 4
+	srdi	r31, r22, 2
+	addi	r22, r22, -2
+	mtctr	r31
+
+	ld	r26, 0(rp2)
+	ld	r27, 16(rp2)
+
+	rldicl.	r31, r22, 63, 63	C r22 & 2
+	beq	cr7, L(evn)
+
+L(odd):	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r12, u0, u0, r26)	C W	u2^2
+	maddld(	r5, u1, u1, r27)	C W	u3^2
+	maddhdu(r10, u1, u1, r27)	C W	u3^2
+	ld	r26, 8(rp2)
+
+	ld	r8, -8(up2)
+	sradi	r8, r8, 63		C CAUTION: clobbers CA
+	and	r8, r8, u0
+	sradi	r11, u0, 63		C CAUTION: clobbers CA
+	and	r11, r11, u1
+
+	LSHU1U
+
+	addc	r23, r23, r8
+
+	ld	r8, 8(up2)
+	ld	r9, 16(up2)
+	maddld(	r28, r8, u0, r26)	C W	u3 x u2
+	maddhdu(r31, r8, u0, r26)	C W	u3 x u2
+	ld	r26, 24(rp2)
+	std	r23, 0(rp2)		C W0
+
+	bne	cr0, L(11)
+L(01):
+	addi	up, up2, 16
+	addi	rp, rp2, 0
+	b	L(lo2)
+L(11):
+	addi	up, up2, 0
+	addi	rp, rp2, -16
+	b	L(lo0)
+
+L(evn):	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r5, u0, u0, r26)	C W	u2^2
+	maddld(	r12, u1, u1, r27)	C W	u3^2
+	maddhdu(r11, u1, u1, r27)	C W	u3^2
+	ld	r27, 8(rp2)
+
+	ld	r9, -8(up2)
+	sradi	r9, r9, 63		C CAUTION: clobbers CA
+	and	r9, r9, u0
+	sradi	r10, u0, 63		C CAUTION: clobbers CA
+	and	r10, r10, u1
+
+	LSHU1U
+
+	addc	r23, r23, r9
+
+	ld	r9, 8(up2)
+	ld	r8, 16(up2)
+	maddld(	r29, r9, u0, r27)	C W	u3 x u2
+	maddhdu(r30, r9, u0, r27)	C W	u3 x u2
+	ld	r27, 24(rp2)
+	std	r23, 0(rp2)		C W0
+
+	beq	cr0, L(10)
+L(00):
+	addi	up, up2, 8
+	addi	rp, rp2, -8
+	b	L(lo1)
+L(10):
+	addi	up, up2, 24
+	addi	rp, rp2, 8
+	ble	cr5, L(end)
+
+L(top):	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	ld	r26, 24(rp)
+	std	r5, 0(rp)
+	maddld(	r5, r8, u1, r27)
+	maddhdu(r10, r8, u1, r27)
+	addex(	r12, r12, r30, 0)
+L(lo2):	ld	r8, 8(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	ld	r27, 32(rp)
+	std	r12, 8(rp)
+	maddld(	r12, r9, u1, r26)
+	maddhdu(r11, r9, u1, r26)
+	addex(	r5, r5, r31, 0)
+L(lo1):	ld	r9, 16(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	ld	r26, 40(rp)
+	std	r5, 16(rp)
+	maddld(	r5, r8, u1, r27)
+	maddhdu(r10, r8, u1, r27)
+	addex(	r12, r12, r30, 0)
+L(lo0):	ld	r8, 24(up)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	ld	r27, 48(rp)
+	std	r12, 24(rp)
+	maddld(	r12, r9, u1, r26)
+	maddhdu(r11, r9, u1, r26)
+	addex(	r5, r5, r31, 0)
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	ld	r9, 0(up)
+	maddld(	r28, r8, u0, r10)
+	maddhdu(r31, r8, u0, r10)
+	adde	r5, r29, r5
+	std	r5, 0(rp)
+	maddld(	r5, r8, u1, r27)
+	maddhdu(r10, r8, u1, r27)
+L(cj):	addex(	r12, r12, r30, 0)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 8(rp)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+	adde	r5, r29, r5
+	std	r5, 16(rp)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10
+	std	r12, 24(rp)
+	li	r4, 0
+	addze	r5, r11
+	addex(	r5, r5, r4, 0)
+	std	r5, 32(rp)
+	bgt	cr5, L(outer)
+
+L(corner):
+	ld	u0, 16(up2)
+	ld	u1, 24(up2)
+	ld	r26, 32(rp2)
+	bne	cr7, L(corner_odd)
+
+L(corner_evn):
+	ld	r27, 40(rp2)
+	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r5, u0, u0, r26)	C W	u2^2
+	mulld	r12, u1, u1		C W	u3^2
+	mulhdu	r11, u1, u1		C W	u3^2
+
+	ld	r9, 8(up2)
+	sradi	r9, r9, 63		C CAUTION: clobbers CA
+	and	r9, r9, u0
+	sradi	r10, u0, 63		C CAUTION: clobbers CA
+	and	r10, r10, u1
+
+	LSHU1UHF
+
+	addc	r23, r23, r9
+
+	ld	r9, 24(up2)
+	maddld(	r29, r9, u0, r27)	C W	u3 x u2
+	maddhdu(r30, r9, u0, r27)	C W	u3 x u2
+	std	r23, 32(rp2)
+	adde	r5, r29, r5
+	std	r5, 40(rp2)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10		C W	FIXME can this co?
+	std	r12, 48(rp2)
+	li	r4, 0
+	addex(	r5, r11, r4, 0)
+	addze	r5, r5
+	std	r5, 56(rp2)
+	b	L(ret)
+
+L(corner_odd):
+	ld	r27, 48(rp2)
+	maddld(	r23, u0, u0, r26)	C W	u2^2
+	maddhdu(r12, u0, u0, r26)	C W	u2^2
+	maddld(	r5, u1, u1, r27)	C W	u3^2
+	maddhdu(r10, u1, u1, r27)	C W	u3^2
+	ld	r26, 40(rp2)
+
+	ld	r8, 8(up2)
+	sradi	r8, r8, 63		C CAUTION: clobbers CA
+	and	r8, r8, u0
+	sradi	r11, u0, 63		C CAUTION: clobbers CA
+	and	r11, r11, u1
+
+	LSHU1UF
+
+	addc	r23, r23, r8
+
+	ld	r8, 24(up2)
+	ld	r9, 32(up2)
+	maddld(	r28, r8, u0, r26)	C W	u3 x u2
+	maddhdu(r31, r8, u0, r26)	C W	u3 x u2
+	std	r23, 32(rp2)
+	maddld(	r29, r9, u0, r11)
+	maddhdu(r30, r9, u0, r11)
+	adde	r12, r28, r12
+	std	r12, 40(rp2)
+	mulld	r12, r9, u1
+	mulhdu	r11, r9, u1
+	addex(	r5, r5, r31, 0)
+	adde	r5, r29, r5
+	std	r5, 48(rp2)
+	addex(	r12, r12, r30, 0)
+	adde	r12, r12, r10
+	std	r12, 56(rp2)
+	mulld	r23, r9, r9		C W	u2^2
+	mulhdu	r12, r9, r9		C W	u2^2
+	adde	r23, r23, r11
+	addze	r12, r12
+	sradi	r4, r8, 63		C CAUTION: clobbers CA
+	and	r4, r4, r9
+	addex(	r23, r23, r4, 0)
+	std	r23, 64(rp2)
+	li	r4, 0
+	addex(	r12, r12, r4, 0)
+	std	r12, 72(rp2)
+
+L(ret):	ld	r22, -80(r1)
+	ld	r23, -72(r1)
+	ld	r24, -64(r1)
+	ld	r25, -56(r1)
+	ld	r26, -48(r1)
+	ld	r27, -40(r1)
+	ld	r28, -32(r1)
+	ld	r29, -24(r1)
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm b/third_party/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm
new file mode 100644
index 0000000..1f57bdf
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/rsh1aors_n.asm
@@ -0,0 +1,173 @@
+dnl  PowerPC-64 mpn_rsh1add_n, mpn_rsh1sub_n
+
+dnl  Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 2.9
+C POWER5		 ?
+C POWER6		 3.5
+C POWER7		 2.25
+
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n',  `r6')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUBC',	`addc')
+  define(`ADDSUBE',	`adde')
+  define(INITCY,	`addic	$1, r1, 0')
+  define(`func',	mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUBC',	`subfc')
+  define(`ADDSUBE',	`subfe')
+  define(INITCY,	`addic	$1, r1, -1')
+  define(`func',	mpn_rsh1sub_n)')
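+
+C  Semantics, as a two-pass sketch (the loop below fuses the add and the
+C  shift; assumes 64-bit limbs and unsigned __int128):
+C
+C	mp_limb_t
+C	rsh1add_n (mp_limb_t *rp, const mp_limb_t *up,
+C		   const mp_limb_t *vp, long n)
+C	{
+C	  mp_limb_t cy = 0, low;
+C	  for (long i = 0; i < n; i++)		/* rp = up + vp */
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] + vp[i] + cy;
+C	      rp[i] = (mp_limb_t) t;
+C	      cy = (mp_limb_t) (t >> 64);
+C	    }
+C	  low = rp[0] & 1;			/* bit shifted out at bottom */
+C	  for (long i = 0; i < n - 1; i++)	/* rp >>= 1 */
+C	    rp[i] = (rp[i] >> 1) | (rp[i+1] << 63);
+C	  rp[n-1] = (rp[n-1] >> 1) | (cy << 63); /* carry becomes top bit */
+C	  return low;
+C	}
+C
+C  rsh1sub_n is identical with subtraction, the borrow taking the place of
+C  the carry.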
+
+define(`s0', `r9')
+define(`s1', `r7')
+define(`x0', `r0')
+define(`x1', `r12')
+define(`u0', `r8')
+define(`v0', `r10')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	ld	u0, 0(up)
+	ld	v0, 0(vp)
+
+	cmpdi	cr6, n, 2
+
+	addi	r0, n, 1
+	srdi	r0, r0, 2
+	mtctr	r0			C copy size to count register
+
+	andi.	r0, n, 1
+	bne	cr0, L(bx1)
+
+L(bx0):	ADDSUBC	x1, v0, u0
+	ld	u0, 8(up)
+	ld	v0, 8(vp)
+	ADDSUBE	x0, v0, u0
+	ble	cr6, L(n2)
+	ld	u0, 16(up)
+	ld	v0, 16(vp)
+	srdi	s0, x1, 1
+	rldicl	r11, x1, 0, 63		C return value
+	ADDSUBE	x1, v0, u0
+	andi.	n, n, 2
+	bne	cr0, L(b10)
+L(b00):	addi	rp, rp, -24
+	b	L(lo0)
+L(b10):	addi	up, up, 16
+	addi	vp, vp, 16
+	addi	rp, rp, -8
+	b	L(lo2)
+
+	ALIGN(16)
+L(bx1):	ADDSUBC	x0, v0, u0
+	ble	cr6, L(n1)
+	ld	u0, 8(up)
+	ld	v0, 8(vp)
+	ADDSUBE	x1, v0, u0
+	ld	u0, 16(up)
+	ld	v0, 16(vp)
+	srdi	s1, x0, 1
+	rldicl	r11, x0, 0, 63		C return value
+	ADDSUBE	x0, v0, u0
+	andi.	n, n, 2
+	bne	cr0, L(b11)
+L(b01):	addi	up, up, 8
+	addi	vp, vp, 8
+	addi	rp, rp, -16
+	b	L(lo1)
+L(b11):	addi	up, up, 24
+	addi	vp, vp, 24
+	bdz	L(end)
+
+	ALIGN(32)
+L(top):	ld	u0, 0(up)
+	ld	v0, 0(vp)
+	srdi	s0, x1, 1
+	rldimi	s1, x1, 63, 0
+	std	s1, 0(rp)
+	ADDSUBE	x1, v0, u0
+L(lo2):	ld	u0, 8(up)
+	ld	v0, 8(vp)
+	srdi	s1, x0, 1
+	rldimi	s0, x0, 63, 0
+	std	s0, 8(rp)
+	ADDSUBE	x0, v0, u0
+L(lo1):	ld	u0, 16(up)
+	ld	v0, 16(vp)
+	srdi	s0, x1, 1
+	rldimi	s1, x1, 63, 0
+	std	s1, 16(rp)
+	ADDSUBE	x1, v0, u0
+L(lo0):	ld	u0, 24(up)
+	ld	v0, 24(vp)
+	srdi	s1, x0, 1
+	rldimi	s0, x0, 63, 0
+	std	s0, 24(rp)
+	ADDSUBE	x0, v0, u0
+	addi	up, up, 32
+	addi	vp, vp, 32
+	addi	rp, rp, 32
+	bdnz	L(top)
+
+L(end):	srdi	s0, x1, 1
+	rldimi	s1, x1, 63, 0
+	std	s1, 0(rp)
+L(cj2):	srdi	s1, x0, 1
+	rldimi	s0, x0, 63, 0
+	std	s0, 8(rp)
+L(cj1):	ADDSUBE	x1, x1, x1		C pseudo-depends on x1
+	rldimi	s1, x1, 63, 0
+	std	s1, 16(rp)
+	mr	r3, r11
+	blr
+
+L(n1):	srdi	s1, x0, 1
+	rldicl	r11, x0, 0, 63		C return value
+	ADDSUBE	x1, x1, x1		C pseudo-depends on x1
+	rldimi	s1, x1, 63, 0
+	std	s1, 0(rp)
+	mr	r3, r11
+	blr
+
+L(n2):	addi	rp, rp, -8
+	srdi	s0, x1, 1
+	rldicl	r11, x1, 0, 63		C return value
+	b	L(cj2)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/mode64/sqr_basecase.asm b/third_party/gmp/mpn/powerpc64/mode64/sqr_basecase.asm
new file mode 100644
index 0000000..e76bb88
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/mode64/sqr_basecase.asm
@@ -0,0 +1,863 @@
+dnl  PowerPC-64 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630         6-18
+C POWER4/PPC970          8
+C POWER5                 8
+C POWER6                16.25
+C POWER7                 3.77
+
+C NOTES
+C  * This is very crude, cleanup!
+C  * Try to reduce the number of needed live registers.
+C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
+C    cost will be more live registers.
+C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
+C    size a lot and speed things up perhaps 25%.
+C  * Use computed goto in order to compress the code.
+C  * Implement a larger final corner.
+C  * Schedule callee-saves register saves into other insns.  This could save
+C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
+C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
+C  * Should the alternating std/adde sequences be split?  Some pipelines handle
+C    adde poorly, and might sequentialise all these instructions.
+C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
+C    adjacent integer multiply insns.  Except for the multiply insns, the code
+C    was not carefully optimised for POWER6 or any other CPU.
+C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+
+define(`rp_outer', `r25')
+define(`up_outer', `r21')
+define(`rp_saved', `r22')
+define(`up_saved', `r23')
+define(`n_saved',  `r24')
+
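+C  Overall structure: the mul_1/addmul_1-style loops below accumulate the
+C  off-diagonal products u[i]*u[j] (i < j) into rp[1..2n-2], and the final
+C  sqr_diag_addlsh1-style loop doubles that triangle and adds the diagonal
+C  squares u[i]^2.  A C sketch of the same computation using public mpn
+C  calls from gmp.h (illustrative only; assumes n >= 1 and no overlap):
+C
+C    void ref_sqr_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
+C    {
+C      mp_limb_t d[2 * n];                      /* diagonal squares (VLA) */
+C      for (mp_size_t i = 0; i < n; i++)
+C        d[2 * i + 1] = mpn_mul_1 (d + 2 * i, up + i, 1, up[i]);
+C      rp[0] = rp[2 * n - 1] = 0;
+C      if (n > 1)
+C        {
+C          rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
+C          for (mp_size_t i = 1; i < n - 1; i++)
+C            rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - 1 - i, up[i]);
+C          rp[2 * n - 1] = mpn_lshift (rp + 1, rp + 1, 2 * n - 2, 1);
+C        }
+C      mpn_add_n (rp, rp, d, 2 * n);            /* add the diagonal */
+C    }
+C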
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	cmpdi	cr0, n, 2
+	bge	cr0, L(ge2)
+	ld	r5, 0(up)	C n = 1
+	nop
+	mulld	r8, r5, r5	C weight 0
+	mulhdu	r9, r5, r5	C weight 1
+	std	r8, 0(rp)
+	std	r9, 8(rp)
+	blr
+	ALIGN(16)
+L(ge2):	bgt	cr0, L(gt2)
+	ld	r0, 0(up)	C n = 2
+	nop
+	mulld	r8, r0, r0	C u0 * u0
+	mulhdu	r9, r0, r0	C u0 * u0
+	ld	r6, 8(up)
+	mulld	r10, r6, r6	C u1 * u1
+	mulhdu	r11, r6, r6	C u1 * u1
+	mulld	r4, r6, r0	C u1 * u0
+	mulhdu	r5, r6, r0	C u1 * u0
+	addc	r4, r4, r4
+	adde	r5, r5, r5
+	addze	r11, r11
+	addc	r9, r9, r4
+	adde	r10, r10, r5
+	addze	r11, r11
+	std	r8, 0(rp)
+	std	r9, 8(rp)
+	std	r10, 16(rp)
+	std	r11, 24(rp)
+	blr
+
+	ALIGN(16)
+L(gt2):	std	r31,  -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+	std	r22, -80(r1)
+	std	r21, -88(r1)
+
+	mr	rp_saved, rp
+	mr	up_saved, up
+	mr	n_saved, n
+	mr	rp_outer, rp
+	mr	up_outer, up
+
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addic	r7, n, 2	C compute count...
+	srdi	r7, r7, 2	C ...for ctr
+	mtctr	r7		C copy count into ctr
+	beq-	cr0, L(b0)
+	blt-	cr6, L(b1)
+	beq-	cr6, L(b2)
+
+L(b3):	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	addi	up, up, 24
+	li	r12, 0		C carry limb
+	bdz	L(em3)
+
+	ALIGN(16)
+L(tm3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm3)
+
+L(em3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop)
+
+L(b0):	ld	r6, 0(up)
+	ld	r27, 8(up)
+	mulld	r7, r27, r6
+	mulhdu	r12, r27, r6
+	std	r7, 8(rp)
+	addi	rp, rp, 8
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+	bdz	L(em0)
+
+	ALIGN(16)
+L(tm0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm0)
+
+L(em0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_2)
+
+L(b1):	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r12, r27, r6
+	addc	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addi	rp, rp, 16
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+	bdz	L(em1)
+
+	ALIGN(16)
+L(tm1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm1)
+
+L(em1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_3)
+
+L(b2):	addi	r7, r7, -1	C FIXME
+	mtctr	r7		C FIXME
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 24(up)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r26
+	adde	r11, r11, r8
+	addze	r12, r10
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	std	r11, 24(rp)
+	addi	rp, rp, 24
+	ld	r9, 32(up)
+	ld	r27, 40(up)
+	addi	up, up, 48
+	bdz	L(em2)
+
+	ALIGN(16)
+L(tm2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 0(up)
+	ld	r27, 8(up)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	std	r0, 8(rp)
+	adde	r26, r26, r8
+	std	r7, 16(rp)
+	adde	r11, r11, r10
+	std	r26, 24(rp)
+	addi	up, up, 32
+	std	r11, 32(rp)
+	addi	rp, rp, 32
+	bdnz	L(tm2)
+
+L(em2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	std	r0, 8(rp)
+	std	r7, 16(rp)
+	addze	r8, r8
+	std	r8, 24(rp)
+	addi	n, n, 2
+	b	L(outer_loop_ent_0)
+
+
+L(outer_loop):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	bdz	L(outer_end)
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r9, 24(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	ld	r30, 16(rp)
+	mulld	r11, r9, r6
+	mulhdu	r10, r9, r6
+	addc	r7, r7, r26
+	adde	r11, r11, r8
+	addze	r12, r10
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	adde	r11, r11, r30
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	ld	r9, 32(up)
+	ld	r27, 40(up)
+	addi	up, up, 48
+	bdz	L(ea1)
+
+	ALIGN(16)
+L(ta1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta1)
+
+L(ea1):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+L(outer_loop_ent_0):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	addc	r0, r0, r28
+	adde	r7, r7, r26
+	addze	r12, r8
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addi	rp, rp, 16
+	ld	r9, 24(up)
+	ld	r27, 32(up)
+	addi	up, up, 40
+	bdz	L(ea0)
+
+	ALIGN(16)
+L(ta0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta0)
+
+L(ea0):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+L(outer_loop_ent_3):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r28, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r12, r9, r6
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	addi	rp, rp, 8
+	ld	r9, 16(up)
+	ld	r27, 24(up)
+	addi	up, up, 32
+	bdz	L(ea3)
+
+	ALIGN(16)
+L(ta3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta3)
+
+L(ea3):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+
+L(outer_loop_ent_2):
+	addi	n, n, -1
+	addi	up_outer, up_outer, 8
+	addi	rp_outer, rp_outer, 16
+
+	mr	up, up_outer
+	addi	rp, rp_outer, 8
+
+	srdi	r0, n, 2
+	mtctr	r0
+
+	addic	r0, r0, 0
+	li	r12, 0		C cy_limb = 0
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r27, 16(up)
+	bdz	L(ea2)
+	addi	up, up, 24
+
+	ALIGN(16)
+L(ta2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6	C 9
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6	C 27
+	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	ld	r27, 8(up)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12	C 0 12
+	adde	r7, r7, r26	C 5 7
+	mulld	r26, r9, r6
+	mulhdu	r10, r9, r6	C 9
+	mulld	r11, r27, r6
+	mulhdu	r12, r27, r6	C 27
+	ld	r9, 16(up)
+	ld	r30, 16(rp)
+	ld	r27, 24(up)
+	ld	r31, 24(rp)
+	adde	r26, r26, r8	C 8 5
+	adde	r11, r11, r10	C 10 11
+	addze	r12, r12	C 12
+	addc	r0, r0, r28	C 0 28
+	std	r0, 0(rp)	C 0
+	adde	r7, r7, r29	C 7 29
+	std	r7, 8(rp)	C 7
+	adde	r26, r26, r30	C 5 30
+	std	r26, 16(rp)	C 5
+	adde	r11, r11, r31	C 11 31
+	std	r11, 24(rp)	C 11
+	addi	up, up, 32
+	addi	rp, rp, 32
+	bdnz	L(ta2)
+
+L(ea2):	mulld	r0, r9, r6
+	mulhdu	r26, r9, r6
+	mulld	r7, r27, r6
+	mulhdu	r8, r27, r6
+	ld	r28, 0(rp)
+	ld	r29, 8(rp)
+	adde	r0, r0, r12
+	adde	r7, r7, r26
+	addze	r8, r8
+	addc	r0, r0, r28
+	std	r0, 0(rp)
+	adde	r7, r7, r29
+	std	r7, 8(rp)
+	addze	r8, r8
+	std	r8, 16(rp)
+
+	b	L(outer_loop)
+
+L(outer_end):
+	ld	r6, 0(up)
+	ld	r9, 8(up)
+	ld	r11, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r8, r9, r6
+	addc	r0, r0, r11
+	std	r0, 0(rp)
+	addze	r8, r8
+	std	r8, 8(rp)
+
+define(`rp',  `rp_saved')
+define(`up',  `r5')
+define(`n',   `r6')
+define(`climb',	`r0')
+
+	addi	r4, rp_saved, 8
+	mr	r5, up_saved
+	mr	r6, n_saved
+
+	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 2			C compute count...
+	srdi	n, n, 2			C ...for ctr
+	mtctr	n			C put loop count into ctr
+	beq	cr0, L(xb0)
+	blt	cr6, L(xb1)
+	beq	cr6, L(xb2)
+
+L(xb3):	ld	r6,   0(up)
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	addi	up, up, 24
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	ld	r10,  8(rp)
+	ld	r11, 16(rp)
+	ld	r6,  24(rp)
+	ld	r7,  32(rp)
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	addze	climb, r29
+	addc	r10, r10, r25
+	adde	r11, r11, r26
+	adde	r6, r6, r27
+	adde	r7, r7, r28
+	std	r24,  0(rp)
+	std	r10,  8(rp)
+	std	r11, 16(rp)
+	std	r6,  24(rp)
+	std	r7,  32(rp)
+	addi	rp, rp, 40
+	bdnz	L(top)
+	b	L(end)
+
+L(xb2):	ld	r6,  0(up)
+	ld	r7,  8(up)
+	addi	up, up, 16
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	ld	r10,  8(rp)
+	ld	r11, 16(rp)
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	addze	climb, r27
+	addc	r10, r10, r25
+	adde	r11, r11, r26
+	std	r24,  0(rp)
+	std	r10,  8(rp)
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	bdnz	L(top)
+	b	L(end)
+
+L(xb0):	ld	r6,   0(up)
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r10,  8(rp)
+	ld	r11, 16(rp)
+	ld	r6,  24(rp)
+	ld	r7,  32(rp)
+	ld	r12, 40(rp)
+	ld	r23, 48(rp)
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	climb, r31
+	std	r24,  0(rp)
+	addc	r10, r10, r25
+	std	r10,  8(rp)
+	adde	r11, r11, r26
+	std	r11, 16(rp)
+	adde	r6, r6, r27
+	std	r6,  24(rp)
+	adde	r7, r7, r28
+	std	r7,  32(rp)
+	adde	r12, r12, r29
+	std	r12, 40(rp)
+	adde	r23, r23, r30
+	std	r23, 48(rp)
+	addi	rp, rp, 56
+	bdnz	L(top)
+	b	L(end)
+
+L(xb1):	ld	r6,  0(up)
+	addi	up, up, 8
+	mulld	r24, r6, r6
+	mulhdu	climb, r6, r6
+	std	r24, 0(rp)
+	addic	rp, rp, 8		C clear carry as side-effect
+
+	ALIGN(32)
+L(top):	ld	r6,   0(up)
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r8,   0(rp)
+	ld	r9,   8(rp)
+	adde	r8, r8, r8
+	adde	r9, r9, r9
+	ld	r10, 16(rp)
+	ld	r11, 24(rp)
+	adde	r10, r10, r10
+	adde	r11, r11, r11
+	ld	r6,  32(rp)
+	ld	r7,  40(rp)
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	ld	r12, 48(rp)
+	ld	r23, 56(rp)
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	r31, r31
+	addc	r8, r8, climb
+	std	r8,   0(rp)
+	adde	r9, r9, r24
+	std	r9,   8(rp)
+	adde	r10, r10, r25
+	std	r10, 16(rp)
+	adde	r11, r11, r26
+	std	r11, 24(rp)
+	adde	r6, r6, r27
+	std	r6,  32(rp)
+	adde	r7, r7, r28
+	std	r7,  40(rp)
+	adde	r12, r12, r29
+	std	r12, 48(rp)
+	adde	r23, r23, r30
+	std	r23, 56(rp)
+	mr	climb, r31
+	addi	rp, rp, 64
+	bdnz	L(top)
+
+L(end):	addze	climb, climb
+	std	climb,  0(rp)
+
+	ld	r31,  -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	ld	r26, -48(r1)
+	ld	r25, -56(r1)
+	ld	r24, -64(r1)
+	ld	r23, -72(r1)
+	ld	r22, -80(r1)
+	ld	r21, -88(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/p6/lshift.asm b/third_party/gmp/mpn/powerpc64/p6/lshift.asm
new file mode 100644
index 0000000..1a200fb
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p6/lshift.asm
@@ -0,0 +1,132 @@
+dnl  PowerPC-64 mpn_lshift -- rp[] = up[] << cnt
+
+dnl  Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 2.25
+C POWER6		 4
+
+C TODO
+C  * Micro-optimise header code
+C  * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6.  The code is 4236
+C    bytes; 4-way code would become about 50% larger.
+
+C INPUT PARAMETERS
+define(`rp_param',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`retval',`r3')
+define(`rp',  `r7')
+
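+C  Dispatch scheme: forloop below expands 63 cnt-specialised SHIFT blocks of
+C  exactly 64 bytes each, and the prologue computes the address of the block
+C  for the given cnt (L(e1) plus a multiple of 64, adjusted for odd/even n)
+C  and branches to it via LR.  A C analogue of the idea (hypothetical names,
+C  exposition only):
+C
+C    typedef mp_limb_t (*shift_fn) (mp_limb_t *, const mp_limb_t *, mp_size_t);
+C    extern shift_fn shift_tab[64];        /* entries 1..63, one per cnt */
+C    retval = shift_tab[cnt] (rp, up, n);  /* cnt-specialised inner loop */
+C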
+ASM_START()
+PROLOGUE(mpn_lshift,toc)
+
+ifdef(`HAVE_ABI_mode32',`
+	rldicl	n, n, 0,32		C FIXME: avoid this zero extend
+')
+	mflr	r12
+	sldi	r8, n, 3
+	sldi	r10, cnt, 6		C multiply cnt by size of a SHIFT block
+	LEAL(	r11, L(e1))		C address of L(e1) label in SHIFT(1)
+	add	up, up, r8		C make up point at end of up[]
+	add	r11, r11, r10		C address of L(oN) for N = cnt
+	srdi	r10, n, 1
+	add	rp, rp_param, r8	C make rp point at end of rp[]
+	subfic	tnc, cnt, 64
+	rlwinm.	r8, n, 0,31,31		C extract bit 0
+	mtctr	r10
+	beq	L(evn)
+
+L(odd):	ld	r9, -8(up)
+	cmpdi	cr0, n, 1		C n = 1?
+	beq	L(1)
+	ld	r8, -16(up)
+	addi	r11, r11, -84		C L(o1) - L(e1) - 64
+	mtlr	r11
+	srd	r3, r9, tnc		C retval
+	addi	up, up, 8
+	addi	rp, rp, -8
+	blr				C branch to L(oN)
+
+L(evn):	ld	r8, -8(up)
+	ld	r9, -16(up)
+	addi	r11, r11, -64
+	mtlr	r11
+	srd	r3, r8, tnc		C retval
+	blr				C branch to L(eN)
+
+L(1):	srd	r3, r9, tnc		C retval
+	sld	r8, r9, cnt
+	std	r8, -8(rp)
+	mtlr	r12
+ifdef(`HAVE_ABI_mode32',
+`	mr	r4, r3
+	srdi	r3, r3, 32
+')
+	blr
+
+
+define(SHIFT,`
+L(lo$1):ld	r8, -24(up)
+	std	r11, -8(rp)
+	addi	rp, rp, -16
+L(o$1):	srdi	r10, r8, eval(64-$1)
+	rldimi	r10, r9, $1, 0
+	ld	r9, -32(up)
+	addi	up, up, -16
+	std	r10, 0(rp)
+L(e$1):	srdi	r11, r9, eval(64-$1)
+	rldimi	r11, r8, $1, 0
+	bdnz	L(lo$1)
+	std	r11, -8(rp)
+	sldi	r10, r9, $1
+	b	L(com)
+	nop
+	nop
+')
+
+	ALIGN(64)
+forloop(`i',1,63,`SHIFT(i)')
+
+L(com):	std	r10, -16(rp)
+	mtlr	r12
+ifdef(`HAVE_ABI_mode32',
+`	mr	r4, r3
+	srdi	r3, r3, 32
+')
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/p6/lshiftc.asm b/third_party/gmp/mpn/powerpc64/p6/lshiftc.asm
new file mode 100644
index 0000000..e4b3caa
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p6/lshiftc.asm
@@ -0,0 +1,136 @@
+dnl  PowerPC-64 mpn_lshiftc -- rp[] = ~up[] << cnt
+
+dnl  Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 2.25
+C POWER6		 4
+
+C TODO
+C  * Micro-optimise header code
+C  * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6.  The code is 4236
+C    bytes; 4-way code would become about 50% larger.
+
+C INPUT PARAMETERS
+define(`rp_param',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`retval',`r3')
+define(`rp',  `r7')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc,toc)
+
+ifdef(`HAVE_ABI_mode32',`
+	rldicl	n, n, 0,32		C FIXME: avoid this zero extend
+')
+	mflr	r12
+	sldi	r8, n, 3
+	sldi	r10, cnt, 6		C multiply cnt by size of a SHIFT block
+	LEAL(	r11, L(e1))		C address of L(e1) label in SHIFT(1)
+	add	up, up, r8		C make up point at end of up[]
+	add	r11, r11, r10		C address of L(oN) for N = cnt
+	srdi	r10, n, 1
+	add	rp, rp_param, r8	C make rp point at end of rp[]
+	subfic	tnc, cnt, 64
+	rlwinm.	r8, n, 0,31,31		C extract bit 0
+	mtctr	r10
+	beq	L(evn)
+
+L(odd):	ld	r9, -8(up)
+	cmpdi	cr0, n, 1		C n = 1?
+	beq	L(1)
+	ld	r8, -16(up)
+	addi	r11, r11, -88		C L(o1) - L(e1) - 64
+	mtlr	r11
+	srd	r3, r9, tnc		C retval
+	addi	up, up, 8
+	addi	rp, rp, -8
+	blr				C branch to L(oN)
+
+L(evn):	ld	r8, -8(up)
+	ld	r9, -16(up)
+	addi	r11, r11, -64
+	mtlr	r11
+	srd	r3, r8, tnc		C retval
+	blr				C branch to L(eN)
+
+L(1):	srd	r3, r9, tnc		C retval
+	sld	r8, r9, cnt
+	nor	r8, r8, r8
+	std	r8, -8(rp)
+	mtlr	r12
+ifdef(`HAVE_ABI_mode32',
+`	mr	r4, r3
+	srdi	r3, r3, 32
+')
+	blr
+
+
+define(SHIFT,`
+L(lo$1):ld	r8, -24(up)
+	nor	r11, r11, r11
+	std	r11, -8(rp)
+	addi	rp, rp, -16
+L(o$1):	srdi	r10, r8, eval(64-$1)
+	rldimi	r10, r9, $1, 0
+	ld	r9, -32(up)
+	addi	up, up, -16
+	nor	r10, r10, r10
+	std	r10, 0(rp)
+L(e$1):	srdi	r11, r9, eval(64-$1)
+	rldimi	r11, r8, $1, 0
+	bdnz	L(lo$1)
+	sldi	r10, r9, $1
+	b	L(com)
+	nop
+')
+
+	ALIGN(64)
+forloop(`i',1,63,`SHIFT(i)')
+
+L(com):	nor	r11, r11, r11
+	nor	r10, r10, r10
+	std	r11, -8(rp)
+	std	r10, -16(rp)
+	mtlr	r12
+ifdef(`HAVE_ABI_mode32',
+`	mr	r4, r3
+	srdi	r3, r3, 32
+')
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/p6/rshift.asm b/third_party/gmp/mpn/powerpc64/p6/rshift.asm
new file mode 100644
index 0000000..9e848c1
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p6/rshift.asm
@@ -0,0 +1,131 @@
+dnl  PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
+
+dnl  Copyright 2003, 2005, 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 2
+C POWER6		 3.5  (mysteriously 3.0 for cnt=1)
+
+C TODO
+C  * Micro-optimise header code
+C  * Perhaps do 4-way unrolling, for 2.5 c/l on POWER6.  The code is 4248
+C    bytes; 4-way code would become about 50% larger.
+
+C INPUT PARAMETERS
+define(`rp_param',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`retval',`r3')
+define(`rp',  `r7')
+
+ASM_START()
+PROLOGUE(mpn_rshift,toc)
+
+ifdef(`HAVE_ABI_mode32',`
+	rldicl	n, n, 0,32		C FIXME: avoid this zero extend
+')
+	mflr	r12
+	LEAL(	r11, L(e1))		C address of L(e1) label in SHIFT(1)
+	sldi	r10, cnt, 6		C multiply cnt by size of a SHIFT block
+	add	r11, r11, r10		C address of L(oN) for N = cnt
+	srdi	r10, n, 1
+	mr	rp, rp_param
+	subfic	tnc, cnt, 64
+	rlwinm.	r8, n, 0,31,31		C extract bit 0
+	mtctr	r10
+	beq	L(evn)
+
+L(odd):	ld	r9, 0(up)
+	cmpdi	cr0, n, 1		C n = 1?
+	beq	L(1)
+	ld	r8, 8(up)
+	addi	r11, r11, -84		C L(o1) - L(e1) - 64
+	mtlr	r11
+	sld	r3, r9, tnc		C retval
+	addi	up, up, 8
+	addi	rp, rp, 8
+	blr				C branch to L(oN)
+
+L(evn):	ld	r8, 0(up)
+	ld	r9, 8(up)
+	addi	r11, r11, -64
+	mtlr	r11
+	sld	r3, r8, tnc		C retval
+	addi	up, up, 16
+	blr				C branch to L(eN)
+
+L(1):	sld	r3, r9, tnc		C retval
+	srd	r8, r9, cnt
+	std	r8, 0(rp)
+	mtlr	r12
+ifdef(`HAVE_ABI_mode32',
+`	mr	r4, r3
+	srdi	r3, r3, 32
+')
+	blr
+
+
+define(SHIFT,`
+L(lo$1):ld	r8, 0(up)
+	std	r11, 0(rp)
+	addi	rp, rp, 16
+L(o$1):	srdi	r10, r9, $1
+	rldimi	r10, r8, eval(64-$1), 0
+	ld	r9, 8(up)
+	addi	up, up, 16
+	std	r10, -8(rp)
+L(e$1):	srdi	r11, r8, $1
+	rldimi	r11, r9, eval(64-$1), 0
+	bdnz	L(lo$1)
+	std	r11, 0(rp)
+	srdi	r10, r9, $1
+	b	L(com)
+	nop
+	nop
+')
+
+	ALIGN(64)
+forloop(`i',1,63,`SHIFT(i)')
+
+L(com):	std	r10, 8(rp)
+	mtlr	r12
+ifdef(`HAVE_ABI_mode32',
+`	mr	r4, r3
+	srdi	r3, r3, 32
+')
+	blr
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/powerpc64/p7/copyd.asm b/third_party/gmp/mpn/powerpc64/p7/copyd.asm
new file mode 100644
index 0000000..f04ca58
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p7/copyd.asm
@@ -0,0 +1,128 @@
+dnl  PowerPC-64 mpn_copyd.
+
+dnl  Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 ?
+C POWER6                 1.25
+C POWER7                 1.09
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`n',	`r5')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	n, n, 0,32')
+
+	sldi	r0, n, 3
+	add	up, up, r0		C point at u[] end
+	add	rp, rp, r0		C point at r[] end
+
+	cmpdi	cr0, n, 4
+	blt	L(sml)
+
+	addi	r10, n, 4
+	srdi	r10, r10, 3
+	mtctr	r10
+
+	andi.	r0, n, 1
+	rlwinm	r11, n, 0,30,30
+	rlwinm	r12, n, 0,29,29
+	cmpdi	cr6, r11, 0
+	cmpdi	cr7, r12, 0
+
+	beq	cr0, L(xx0)
+L(xx1):	ld	r6, -8(up)
+	addi	up, up, -8
+	std	r6, -8(rp)
+	addi	rp, rp, -8
+
+L(xx0):	bne	cr6, L(x10)
+L(x00):	ld	r6, -8(up)
+	ld	r7, -16(up)
+	bne	cr7, L(100)
+L(000):	addi	rp, rp, 32
+	b	L(lo0)
+L(100):	addi	up, up, 32
+	b	L(lo4)
+L(x10):	ld	r8, -8(up)
+	ld	r9, -16(up)
+	bne	cr7, L(110)
+L(010):	addi	up, up, -16
+	addi	rp, rp, 16
+	b	L(lo2)
+L(110):	addi	up, up, 16
+	addi	rp, rp, 48
+	b	L(lo6)
+
+L(sml):	cmpdi	cr0, n, 0
+	beqlr-	cr0
+	mtctr	n
+L(t):	ld	r6, -8(up)
+	addi	up, up, -8
+	std	r6, -8(rp)
+	addi	rp, rp, -8
+	bdnz	L(t)
+	blr
+
+	ALIGN(32)
+L(top):	std	r6, -8(rp)
+	std	r7, -16(rp)
+L(lo2):	ld	r6, -8(up)
+	ld	r7, -16(up)
+	std	r8, -24(rp)
+	std	r9, -32(rp)
+L(lo0):	ld	r8, -24(up)
+	ld	r9, -32(up)
+	std	r6, -40(rp)
+	std	r7, -48(rp)
+L(lo6):	ld	r6, -40(up)
+	ld	r7, -48(up)
+	std	r8, -56(rp)
+	std	r9, -64(rp)
+	addi	rp, rp, -64
+L(lo4):	ld	r8, -56(up)
+	ld	r9, -64(up)
+	addi	up, up, -64
+	bdnz	L(top)
+
+L(end):	std	r6, -8(rp)
+	std	r7, -16(rp)
+	std	r8, -24(rp)
+	std	r9, -32(rp)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/p7/copyi.asm b/third_party/gmp/mpn/powerpc64/p7/copyi.asm
new file mode 100644
index 0000000..854cf9f
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p7/copyi.asm
@@ -0,0 +1,129 @@
+dnl  PowerPC-64 mpn_copyi.
+
+dnl  Copyright 2004, 2005, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 ?
+C POWER6                 1.25
+C POWER7                 1.09
+
+C INPUT PARAMETERS
+define(`rp',	`r3')
+define(`up',	`r4')
+define(`n',	`r5')
+
+C TODO
+C  * Try rolling the two loop-leading std instructions to the end, allowing
+C    the code to also handle n = 2.
+C  * Consider using 4 pointers, schedule ptr update early wrt use.
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	n, n, 0,32')
+
+	cmpdi	cr0, n, 4
+	blt	L(sml)
+
+	addi	r10, n, 4
+	srdi	r10, r10, 3
+	mtctr	r10
+
+	andi.	r0, n, 1
+	rlwinm	r11, n, 0,30,30
+	rlwinm	r12, n, 0,29,29
+	cmpdi	cr6, r11, 0
+	cmpdi	cr7, r12, 0
+
+	beq	cr0, L(xx0)
+L(xx1):	ld	r6, 0(up)
+	addi	up, up, 8
+	std	r6, 0(rp)
+	addi	rp, rp, 8
+
+L(xx0):	bne	cr6, L(x10)
+L(x00):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	bne	cr7, L(100)
+L(000):	addi	rp, rp, -32
+	b	L(lo0)
+L(100):	addi	up, up, -32
+	b	L(lo4)
+L(x10):	ld	r8, 0(up)
+	ld	r9, 8(up)
+	bne	cr7, L(110)
+L(010):	addi	up, up, 16
+	addi	rp, rp, -16
+	b	L(lo2)
+L(110):	addi	up, up, -16
+	addi	rp, rp, -48
+	b	L(lo6)
+
+L(sml):	cmpdi	cr0, n, 0
+	beqlr-	cr0
+	mtctr	n
+L(t):	ld	r6, 0(up)
+	addi	up, up, 8
+	std	r6, 0(rp)
+	addi	rp, rp, 8
+	bdnz	L(t)
+	blr
+
+	ALIGN(32)
+L(top):	std	r6, 0(rp)
+	std	r7, 8(rp)
+L(lo2):	ld	r6, 0(up)
+	ld	r7, 8(up)
+	std	r8, 16(rp)
+	std	r9, 24(rp)
+L(lo0):	ld	r8, 16(up)
+	ld	r9, 24(up)
+	std	r6, 32(rp)
+	std	r7, 40(rp)
+L(lo6):	ld	r6, 32(up)
+	ld	r7, 40(up)
+	std	r8, 48(rp)
+	std	r9, 56(rp)
+	addi	rp, rp, 64
+L(lo4):	ld	r8, 48(up)
+	ld	r9, 56(up)
+	addi	up, up, 64
+	bdnz	L(top)
+
+L(end):	std	r6, 0(rp)
+	std	r7, 8(rp)
+	std	r8, 16(rp)
+	std	r9, 24(rp)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/p7/hamdist.asm b/third_party/gmp/mpn/powerpc64/p7/hamdist.asm
new file mode 100644
index 0000000..960b3bc
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p7/hamdist.asm
@@ -0,0 +1,110 @@
+dnl  PowerPC-64 mpn_hamdist.
+
+dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          -
+C POWER4/PPC970          -
+C POWER5                 -
+C POWER6                 -
+C POWER7                 2.87
+
+define(`up', r3)
+define(`vp', r4)
+define(`n',  r5)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+	std	r30, -16(r1)
+	std	r31, -8(r1)
+
+	addi	r0, n, 1
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r0, r0, 63,33',	C ...branch count
+`	srdi	r0, r0, 1')	C ...for ctr
+	mtctr	r0
+
+	andi.	r0, n, 1
+
+	li	r0, 0
+	li	r12, 0
+
+	beq	L(evn)
+
+L(odd):	ld	r6, 0(up)
+	addi	up, up, 8
+	ld	r8, 0(vp)
+	addi	vp, vp, 8
+	xor	r10, r6, r8
+	popcntd(r0, r10)
+	bdz	L(e1)
+
+L(evn):	ld	r6, 0(up)
+	ld	r8, 0(vp)
+	ld	r7, 8(up)
+	ld	r9, 8(vp)
+	xor	r10, r6, r8
+	addi	up, up, 16
+	addi	vp, vp, 16
+	li	r30, 0
+	li	r31, 0
+	bdz	L(end)
+
+	nop
+	nop
+C	ALIGN(16)
+L(top):	add	r0, r0, r30
+	ld	r6, 0(up)
+	ld	r8, 0(vp)
+	xor	r11, r7, r9
+	popcntd(r30, r10)
+	add	r12, r12, r31
+	ld	r7, 8(up)
+	ld	r9, 8(vp)
+	xor	r10, r6, r8
+	popcntd(r31, r11)
+	addi	up, up, 16
+	addi	vp, vp, 16
+	bdnz	L(top)
+
+L(end):	add	r0, r0, r30
+	xor	r11, r7, r9
+	popcntd(r30, r10)
+	add	r12, r12, r31
+	popcntd(r31, r11)
+
+	add	r0, r0, r30
+	add	r12, r12, r31
+L(e1):	add	r3, r0, r12
+	ld	r30, -16(r1)
+	ld	r31, -8(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/p7/popcount.asm b/third_party/gmp/mpn/powerpc64/p7/popcount.asm
new file mode 100644
index 0000000..129ffef
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/p7/popcount.asm
@@ -0,0 +1,90 @@
+dnl  PowerPC-64 mpn_popcount.
+
+dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          -
+C POWER4/PPC970          -
+C POWER5                 -
+C POWER6                 -
+C POWER7                 2
+
+define(`up', r3)
+define(`n',  r4)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+	addi	r0, n, 1
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r0, r0, 63,33',	C ...branch count
+`	srdi	r0, r0, 1')	C ...for ctr
+	mtctr	r0
+
+	andi.	r0, n, 1
+
+	li	r0, 0
+	li	r12, 0
+	beq	L(evn)
+
+L(odd):	ld	r4, 0(up)
+	addi	up, up, 8
+	popcntd(r0, r4)
+	bdz	L(e1)
+
+L(evn):	ld	r4, 0(up)
+	ld	r5, 8(up)
+	popcntd(r8, r4)
+	popcntd(r9, r5)
+	bdz	L(e2)
+
+	ld	r4, 16(up)
+	ld	r5, 24(up)
+	bdz	L(e4)
+	addi	up, up, 32
+
+L(top):	add	r0, r0, r8
+	popcntd(r8, r4)
+	ld	r4, 0(up)
+	add	r12, r12, r9
+	popcntd(r9, r5)
+	ld	r5, 8(up)
+	addi	up, up, 16
+	bdnz	L(top)
+
+L(e4):	add	r0, r0, r8
+	popcntd(r8, r4)
+	add	r12, r12, r9
+	popcntd(r9, r5)
+L(e2):	add	r0, r0, r8
+	add	r12, r12, r9
+L(e1):	add	r3, r0, r12
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/rshift.asm b/third_party/gmp/mpn/powerpc64/rshift.asm
new file mode 100644
index 0000000..7654a16
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/rshift.asm
@@ -0,0 +1,207 @@
+dnl  PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt
+
+dnl  Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630          ?
+C POWER4/PPC970          ?
+C POWER5                 2.25
+C POWER6                 9.75
+C POWER7                 2.15
+
+C TODO
+C  * Try to reduce the number of needed live registers
+C  * Micro-optimise header code
+C  * Keep in synch with lshift.asm and lshiftc.asm
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`up',  `r4')
+define(`n',   `r5')
+define(`cnt', `r6')
+
+define(`tnc',`r0')
+define(`u0',`r30')
+define(`u1',`r31')
+define(`retval',`r5')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	subfic	tnc, cnt, 64
+C	sldi	r30, n, 3	C byte count corresponding to n
+C	add	rp, rp, r30	C rp = rp + n
+C	add	up, up, r30	C up = up + n
+	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
+	cmpdi	cr6, r30, 2
+	addi	r31, n, 3	C compute count...
+	ld	r10, 0(up)	C load 1st limb for b00...b11
+	sld	retval, r10, tnc
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	r31, r31, 62,34',	C ...branch count
+`	srdi	r31, r31, 2')	C ...for ctr
+	mtctr	r31		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	ld	r11, 8(up)	C load 2nd limb for b10 and b11
+	beq	cr6, L(b10)
+
+	ALIGN(16)
+L(b11):	srd	r8, r10, cnt
+	sld	r9, r11, tnc
+	ld	u1, 16(up)
+	addi	up, up, 24
+	srd	r12, r11, cnt
+	sld	r7, u1, tnc
+	addi	rp, rp, -16
+	bdnz	L(gt3)
+
+	or	r11, r8, r9
+	srd	r8, u1, cnt
+	b	L(cj3)
+
+	ALIGN(16)
+L(gt3):	ld	u0, 0(up)
+	or	r11, r8, r9
+	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 8(up)
+	or	r10, r12, r7
+	b	L(L11)
+
+	ALIGN(32)
+L(b10):	srd	r12, r10, cnt
+	addi	rp, rp, -24
+	sld	r7, r11, tnc
+	bdnz	L(gt2)
+
+	srd	r8, r11, cnt
+	or	r10, r12, r7
+	b	L(cj2)
+
+L(gt2):	ld	u0, 16(up)
+	srd	r8, r11, cnt
+	sld	r9, u0, tnc
+	ld	u1, 24(up)
+	or	r10, r12, r7
+	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 32(up)
+	or	r11, r8, r9
+	addi	up, up, 16
+	b	L(L10)
+
+	ALIGN(16)
+L(b00):	ld	u1, 8(up)
+	srd	r12, r10, cnt
+	sld	r7, u1, tnc
+	ld	u0, 16(up)
+	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 24(up)
+	or	r10, r12, r7
+	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	addi	rp, rp, -8
+	bdz	L(cj4)
+
+L(gt4):	addi	up, up, 32
+	ld	u0, 0(up)
+	or	r11, r8, r9
+	b	L(L00)
+
+	ALIGN(16)
+L(b01):	bdnz	L(gt1)
+	srd	r8, r10, cnt
+	std	r8, 0(rp)
+	b	L(ret)
+
+L(gt1):	ld	u0, 8(up)
+	srd	r8, r10, cnt
+	sld	r9, u0, tnc
+	ld	u1, 16(up)
+	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 24(up)
+	or	r11, r8, r9
+	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 32(up)
+	addi	up, up, 40
+	or	r10, r12, r7
+	bdz	L(end)
+
+	ALIGN(32)
+L(top):	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 0(up)
+	std	r11, 0(rp)
+	or	r11, r8, r9
+L(L00):	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 8(up)
+	std	r10, 8(rp)
+	or	r10, r12, r7
+L(L11):	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	ld	u0, 16(up)
+	std	r11, 16(rp)
+	or	r11, r8, r9
+L(L10):	srd	r8, u1, cnt
+	sld	r9, u0, tnc
+	ld	u1, 24(up)
+	addi	up, up, 32
+	std	r10, 24(rp)
+	addi	rp, rp, 32
+	or	r10, r12, r7
+	bdnz	L(top)
+
+	ALIGN(32)
+L(end):	srd	r12, u0, cnt
+	sld	r7, u1, tnc
+	std	r11, 0(rp)
+L(cj4):	or	r11, r8, r9
+	srd	r8, u1, cnt
+	std	r10, 8(rp)
+L(cj3):	or	r10, r12, r7
+	std	r11, 16(rp)
+L(cj2):	std	r10, 24(rp)
+	std	r8, 32(rp)
+
+L(ret):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+ifdef(`HAVE_ABI_mode32',
+`	srdi	r3, retval, 32
+	mr	r4, retval
+',`	mr	r3, retval')
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/sec_tabselect.asm b/third_party/gmp/mpn/powerpc64/sec_tabselect.asm
new file mode 100644
index 0000000..085577c
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/sec_tabselect.asm
@@ -0,0 +1,147 @@
+dnl  PowerPC-64 mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630		 1.75
+C POWER4/PPC970		 2.0
+C POWER5		 ?
+C POWER6		 5.0
+C POWER7		 1.75
+
+define(`rp',     `r3')
+define(`tp',     `r4')
+define(`n',      `r5')
+define(`nents',  `r6')
+define(`which',  `r7')
+
+define(`i',      `r8')
+define(`j',      `r9')
+define(`stride', `r12')
+define(`mask',   `r11')
+
+
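+C  The loops read every table entry and mask out all but entry `which', so
+C  the memory access pattern is independent of `which' (the side-channel
+C  silent point of this function).  The branch-free mask: addic i,i,-1 sets
+C  CA iff i != 0, and subfe mask,mask,mask turns CA into 0 or ~0.  In C (a
+C  sketch):
+C
+C    mask = -(mp_limb_t) (i == 0);   /* ~0 for the selected entry, else 0 */
+C    acc |= tp[k] & mask;
+C    i--;
+C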
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	addic.	j, n, -4		C outer loop induction variable
+	std	r31, -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	sldi	stride, n, 3
+
+	blt	cr0, L(outer_end)
+L(outer_top):
+	mtctr	nents
+	mr	r10, tp
+	li	r28, 0
+	li	r29, 0
+	li	r30, 0
+	li	r31, 0
+	addic.	j, j, -4		C outer loop induction variable
+	mr	i, which
+
+	ALIGN(16)
+L(top):	addic	i, i, -1		C set carry iff i != 0
+	subfe	mask, mask, mask
+	ld	r0, 0(tp)
+	ld	r27, 8(tp)
+	and	r0, r0, mask
+	and	r27, r27, mask
+	or	r28, r28, r0
+	or	r29, r29, r27
+	ld	r0, 16(tp)
+	ld	r27, 24(tp)
+	and	r0, r0, mask
+	and	r27, r27, mask
+	or	r30, r30, r0
+	or	r31, r31, r27
+	add	tp, tp, stride
+	bdnz	L(top)
+
+	std	r28, 0(rp)
+	std	r29, 8(rp)
+	std	r30, 16(rp)
+	std	r31, 24(rp)
+	addi	tp, r10, 32
+	addi	rp, rp, 32
+	bge	cr0, L(outer_top)
+L(outer_end):
+
+	rldicl.	r0, n, 63, 63
+	beq	cr0, L(b0x)
+L(b1x):	mtctr	nents
+	mr	r10, tp
+	li	r28, 0
+	li	r29, 0
+	mr	i, which
+	ALIGN(16)
+L(tp2):	addic	i, i, -1
+	subfe	mask, mask, mask
+	ld	r0, 0(tp)
+	ld	r27, 8(tp)
+	and	r0, r0, mask
+	and	r27, r27, mask
+	or	r28, r28, r0
+	or	r29, r29, r27
+	add	tp, tp, stride
+	bdnz	L(tp2)
+	std	r28, 0(rp)
+	std	r29, 8(rp)
+	addi	tp, r10, 16
+	addi	rp, rp, 16
+
+L(b0x):	rldicl.	r0, n, 0, 63
+	beq	cr0, L(b00)
+L(b01):	mtctr	nents
+	mr	r10, tp
+	li	r28, 0
+	mr	i, which
+	ALIGN(16)
+L(tp1):	addic	i, i, -1
+	subfe	mask, mask, mask
+	ld	r0, 0(tp)
+	and	r0, r0, mask
+	or	r28, r28, r0
+	add	tp, tp, stride
+	bdnz	L(tp1)
+	std	r28, 0(rp)
+
+L(b00):	ld	r31, -8(r1)
+	ld	r30, -16(r1)
+	ld	r29, -24(r1)
+	ld	r28, -32(r1)
+	ld	r27, -40(r1)
+	blr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/powerpc64/umul.asm b/third_party/gmp/mpn/powerpc64/umul.asm
new file mode 100644
index 0000000..7fcc72f
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/umul.asm
@@ -0,0 +1,53 @@
+dnl  PowerPC-64 umul_ppmm -- support for longlong.h
+
+dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
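+C  Usage sketch (this is the support routine behind the umul_ppmm macro of
+C  longlong.h; variable names are illustrative):
+C
+C    mp_limb_t lo, hi;
+C    hi = mpn_umul_ppmm (&lo, m1, m2);   /* hi:lo = m1 * m2 */
+C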
+
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+
+	C r3	lowptr
+	C r4	m1
+	C r5	m2
+
+	mulld	r0, r4, r5
+	mulhdu	r4, r4, r5
+	std	r0, 0(r3)
+ifdef(`HAVE_ABI_mode32',
+`	srdi	r3, r4, 32
+',`	mr	r3, r4
+')
+	blr
+
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/powerpc64/vmx/popcount.asm b/third_party/gmp/mpn/powerpc64/vmx/popcount.asm
new file mode 100644
index 0000000..b95fb88
--- /dev/null
+++ b/third_party/gmp/mpn/powerpc64/vmx/popcount.asm
@@ -0,0 +1,230 @@
+dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
+
+dnl  Copyright 2006, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C 7400,7410 (G4):       ?
+C 744x,745x (G4+):      1.125
+C 970 (G5):             2.25
+
+C TODO
+C  * Rewrite the awkward huge n outer loop code.
+C  * Two lvx, two vperm, and two vxor could give us a similar hamdist.
+C  * Compress the cnsts table in 64-bit mode; only half the values are needed.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+define(`OPERATION_popcount')
+
+define(`ap',	`r3')
+define(`n',	`r4')
+
+define(`rtab',	`v10')
+define(`cnt4',	`v11')
+
+ifelse(GMP_LIMB_BITS,32,`
+	define(`LIMB32',`	$1')
+	define(`LIMB64',`')
+',`
+	define(`LIMB32',`')
+	define(`LIMB64',`	$1')
+')
+
+C The inner loop handles up to 2^34 bits, i.e., 2^28 64-bit limbs, due to
+C overflow in vsum4ubs.  For larger operands we work in chunks of LIMBS_PER_CHUNK limbs.
+define(`LIMBS_PER_CHUNK', 0x1000)
+define(`LIMBS_CHUNK_THRES', 0x1001)
+
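+C  Per-byte counting uses vperm as a 16-entry nibble lookup into rtab
+C  (defined at the end of this file).  In C terms (a sketch):
+C
+C    static const unsigned char rtab[16] =
+C      {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};   /* popcount of each nibble */
+C    unsigned pc8 (unsigned char b)
+C    { return rtab[b & 0xf] + rtab[b >> 4]; }
+C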
+ASM_START()
+PROLOGUE(mpn_popcount,toc)
+	mfspr	r10, 256
+	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
+	mtspr	256, r0
+
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	n, n, 0, 32')		C zero extend n
+
+C Load various constants into vector registers
+	LEAL(	r11, cnsts)
+	li	r12, 16
+	vspltisb cnt4, 4		C 0x0404...04 used as shift count
+
+	li	r7, 160
+	lvx	rtab, 0, r11
+
+LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
+LIMB64(`cmpd	cr7, n, r0		')
+
+	lvx	v0, 0, ap
+	addi	r7, r11, 80
+	rlwinm	r6, ap, 2,26,29
+	lvx	v8, r7, r6
+	vand	v0, v0, v8
+
+LIMB32(`rlwinm	r8, ap, 30,30,31	')
+LIMB64(`rlwinm	r8, ap, 29,31,31	')
+	add	n, n, r8		C compensate n for rounded down `ap'
+
+	vxor	v1, v1, v1
+	li	r8, 0			C grand total count
+
+	vxor	v12, v12, v12		C zero total count
+	vxor	v13, v13, v13		C zero total count
+
+	addic.	n, n, -LIMBS_PER_VR
+	ble	L(sum)
+
+	addic.	n, n, -LIMBS_PER_VR
+	ble	L(lsum)
+
+C For 64-bit machines, handle huge n that would overflow vsum4ubs
+LIMB64(`ble	cr7, L(small)		')
+LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
+LIMB64(`lis	n, LIMBS_PER_CHUNK	')
+
+	ALIGN(16)
+L(small):
+LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
+LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
+	addi	r7, r7, 1
+	mtctr	r7			C copy n to count register
+	b	L(ent)
+
+	ALIGN(16)
+L(top):
+	lvx	v0, 0, ap
+L(ent):	lvx	v1, r12, ap
+	addi	ap, ap, 32
+	vsrb	v8, v0, cnt4
+	vsrb	v9, v1, cnt4
+	vperm	v2, rtab, rtab, v0
+	vperm	v3, rtab, rtab, v8
+	vperm	v4, rtab, rtab, v1
+	vperm	v5, rtab, rtab, v9
+	vaddubm	v6, v2, v3
+	vaddubm	v7, v4, v5
+	vsum4ubs v12, v6, v12
+	vsum4ubs v13, v7, v13
+	bdnz	L(top)
+
+	andi.	n, n, eval(LIMBS_PER_2VR-1)
+	beq	L(rt)
+
+	lvx	v0, 0, ap
+	vxor	v1, v1, v1
+	cmpwi	n, LIMBS_PER_VR
+	ble	L(sum)
+L(lsum):
+	vor	v1, v0, v0
+	lvx	v0, r12, ap
+L(sum):
+LIMB32(`rlwinm	r6, n, 4,26,27	')
+LIMB64(`rlwinm	r6, n, 5,26,26	')
+	addi	r7, r11, 16
+	lvx	v8, r7, r6
+	vand	v0, v0, v8
+	vsrb	v8, v0, cnt4
+	vsrb	v9, v1, cnt4
+	vperm	v2, rtab, rtab, v0
+	vperm	v3, rtab, rtab, v8
+	vperm	v4, rtab, rtab, v1
+	vperm	v5, rtab, rtab, v9
+	vaddubm	v6, v2, v3
+	vaddubm	v7, v4, v5
+	vsum4ubs v12, v6, v12
+	vsum4ubs v13, v7, v13
+
+	ALIGN(16)
+L(rt):	vadduwm	v3, v12, v13
+	li	r7, -16			C FIXME: do all ppc32 and ppc64 ABIs
+	stvx	v3, r7, r1		C FIXME: ...support storing below sp?
+
+	lwz	r7, -16(r1)
+	add	r8, r8, r7
+	lwz	r7, -12(r1)
+	add	r8, r8, r7
+	lwz	r7, -8(r1)
+	add	r8, r8, r7
+	lwz	r7, -4(r1)
+	add	r8, r8, r7
+
+C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
+LIMB64(`ble	cr7, L(ret)
+	vxor	v12, v12, v12		C zero total count
+	vxor	v13, v13, v13		C zero total count
+	mr	n, r9
+	cmpd	cr7, n, r0
+	ble	cr7, L(2)
+	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
+	lis	n, LIMBS_PER_CHUNK
+L(2):	srdi	r7, n, 2		C loop count corresponding to n
+	mtctr	r7			C copy n to count register
+	b	L(top)
+')
+
+	ALIGN(16)
+L(ret):	mr	r3, r8
+	mtspr	256, r10
+	blr
+EPILOGUE()
+
+DEF_OBJECT(cnsts,16)
+C Counts for vperm
+	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+C Masks for high end of number
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
+ASM_END()
diff --git a/third_party/gmp/mpn/riscv/64/aors_n.asm b/third_party/gmp/mpn/riscv/64/aors_n.asm
new file mode 100644
index 0000000..6e38083
--- /dev/null
+++ b/third_party/gmp/mpn/riscv/64/aors_n.asm
@@ -0,0 +1,89 @@
+dnl  RISC-V/64 mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C  INPUT PARAMETERS
+define(`rp',	`a0')
+define(`up',	`a1')
+define(`vp',	`a2')
+define(`n',	`a3')
+
+ifdef(`OPERATION_add_n',`
+    define(`ADDSUB',	`add')
+    define(`CMPCY',	`sltu	$1, $2, $3')
+    define(`func',	`mpn_add_n')
+')
+ifdef(`OPERATION_sub_n',`
+    define(`ADDSUB',	`sub')
+    define(`CMPCY',	`sltu	$1, $3, $2')
+    define(`func',	`mpn_sub_n')
+')
+
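+C Carry scheme (a sketch, not from the original comments): RISC-V has no
+C carry flag, so the loop carries t6 as a plain 0/1 value.  One limb step
+C of the add case, in C terms:
+C
+C	s = a + b;	cy1 = s < a;	C first CMPCY
+C	r = s + cy;	cy2 = r < s;	C second CMPCY
+C	cy = cy1 + cy2;	C cy1 and cy2 are never both set, so cy stays 0/1
+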
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	li	t6, 0
+
+	andi	t0, n, 1
+	beq	t0, x0, L(top)
+	addi	up, up, 8
+	addi	vp, vp, -8
+	addi	rp, rp, -8
+	addi	n, n, -1
+	j	L(mid)
+
+L(top):	ld	a4, 0(up)
+	ld	a6, 0(vp)
+	addi	n, n, -2	C bookkeeping
+	addi	up, up, 16	C bookkeeping
+	ADDSUB	t0, a4, a6
+	CMPCY(	t2, t0, a4)
+	ADDSUB	t4, t0, t6	C cycle 3, 9, ...
+	CMPCY(	t3, t4, t0)	C cycle 4, 10, ...
+	sd	t4, 0(rp)
+	add	t6, t2, t3	C cycle 5, 11, ...
+L(mid):	ld	a5, -8(up)
+	ld	a7, 8(vp)
+	addi	vp, vp, 16	C bookkeeping
+	addi	rp, rp, 16	C bookkeeping
+	ADDSUB	t1, a5, a7
+	CMPCY(	t2, t1, a5)
+	ADDSUB	t4, t1, t6	C cycle 0, 6, ...
+	CMPCY(	t3, t4, t1)	C cycle 1, 7, ...
+	sd	t4, -8(rp)
+	add	t6, t2, t3	C cycle 2, 8, ...
+	bne	n, x0, L(top)	C bookkeeping
+
+L(end):	mv	a0, t6
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/riscv/64/aorsmul_1.asm b/third_party/gmp/mpn/riscv/64/aorsmul_1.asm
new file mode 100644
index 0000000..1125a9f
--- /dev/null
+++ b/third_party/gmp/mpn/riscv/64/aorsmul_1.asm
@@ -0,0 +1,75 @@
+dnl  RISC-V/64 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C  INPUT PARAMETERS
+define(`rp',	`a0')
+define(`up',	`a1')
+define(`n',	`a2')
+define(`v0',	`a3')
+
+ifdef(`OPERATION_addmul_1',`
+    define(`ADDSUB',	`add')
+    define(`CMPCY',	`sltu	$1, $2, $3')
+    define(`func',	`mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+    define(`ADDSUB',	`sub')
+    define(`CMPCY',	`sltu	$1, $3, $2')
+    define(`func',	`mpn_submul_1')
+')
+
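+C Carry scheme (descriptive, not from the original comments): MUL/MULHU
+C form the full product; two CMPCY tests pick up the carries from adding
+C the rp limb and the incoming carry limb, and phi + cy1 + cy2 becomes the
+C carry limb for the next iteration (it can exceed 1, hence a full limb).
+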
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+	li	a6, 0
+
+L(top):	ld	a7, 0(up)
+	addi	up, up, 8	C bookkeeping
+	ld	a4, 0(rp)
+	addi	rp, rp, 8	C bookkeeping
+	mul	a5, a7, v0
+	addi	n, n, -1	C bookkeeping
+	mulhu	a7, a7, v0
+	ADDSUB	a5, a4, a5
+	ADDSUB	a6, a5, a6	C cycle 0, 3, ...
+	CMPCY(	a4, a5, a4)
+	add	a4, a4, a7
+	CMPCY(	a5, a6, a5)	C cycle 1, 4, ...
+	sd	a6, -8(rp)
+	add	a6, a4, a5	C cycle 2, 5, ...
+	bne	n, x0, L(top)	C bookkeeping
+
+L(end):	mv	a0, a6
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/riscv/64/mul_1.asm b/third_party/gmp/mpn/riscv/64/mul_1.asm
new file mode 100644
index 0000000..e35eaa9
--- /dev/null
+++ b/third_party/gmp/mpn/riscv/64/mul_1.asm
@@ -0,0 +1,58 @@
+dnl  RISC-V/64 mpn_mul_1.
+
+dnl  Copyright 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C  INPUT PARAMETERS
+define(`rp',	`a0')
+define(`up',	`a1')
+define(`n',	`a2')
+define(`v0',	`a3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	li	a6, 0
+
+L(top):	ld	a7, 0(up)
+	addi	up, up, 8	C bookkeeping
+	addi	rp, rp, 8	C bookkeeping
+	mul	a5, a7, v0
+	addi	n, n, -1	C bookkeeping
+	mulhu	a7, a7, v0
+	add	a6, a5, a6	C cycle 0, 3, ...
+	sltu	a5, a6, a5	C cycle 1, 4, ...
+	sd	a6, -8(rp)
+	add	a6, a7, a5	C cycle 2, 5, ...
+	bne	n, x0, L(top)	C bookkeeping
+
+L(end):	mv	a0, a6
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/s390_32/README b/third_party/gmp/mpn/s390_32/README
new file mode 100644
index 0000000..59519ba
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/README
@@ -0,0 +1,37 @@
+All current (2001) S/390 and z/Architecture machines are single-issue,
+but some newer machines have a deep pipeline.  Software-pipelining is
+therefore beneficial.
+
+* mpn_add_n, mpn_sub_n: Use code along the lines below.  Two-way unrolling
+  would be adequate.
+
+  mp_limb_t
+  mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+  {
+    mp_limb_t a, b, r, cy;
+    mp_size_t i;
+    mp_limb_t mm = -1;
+
+    cy = 0;
+    up += n;
+    vp += n;
+    rp += n;
+    i = -n;
+    do
+      {
+	a = up[i];
+	b = vp[i];
+	r = a + b + cy;
+	rp[i] = r;
+	cy = (((a & b) | ((a | b) & (r ^ mm)))) >> 31;
+	i++;
+      }
+    while (i < 0);
+    return cy;
+  }
+
+* mpn_lshift, mpn_rshift: Use SLDL/SRDL, and two-way unrolling.
+
+* mpn_mul_1, mpn_addmul_1, mpn_submul_1: For machines with just signed
+  multiply (MR), use two loops, similar to the corresponding VAX or
+  POWER functions.  Handle carry like for mpn_add_n.
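+
+  A sketch (not part of GMP; mp_limb_t being 32 bits here) of piecing an
+  unsigned 32x32->64 product together from the signed MR multiply; the
+  traditional-insn addmul_1 uses this compensation, one conditional add
+  per negative operand:
+
+  void
+  umul32 (mp_limb_t *hi, mp_limb_t *lo, mp_limb_t a, mp_limb_t b)
+  {
+    long long p = (long long) (int) a * (int) b;  /* signed multiply */
+    mp_limb_t ph = (mp_limb_t) ((unsigned long long) p >> 32);
+    mp_limb_t pl = (mp_limb_t) p;
+    /* Each operand with its top bit set makes the signed product too
+       small by 2^32 times the other operand; add it back into the high
+       word.  */
+    if ((int) a < 0)
+      ph += b;
+    if ((int) b < 0)
+      ph += a;
+    *hi = ph;
+    *lo = pl;
+  }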
diff --git a/third_party/gmp/mpn/s390_32/addmul_1.asm b/third_party/gmp/mpn/s390_32/addmul_1.asm
new file mode 100644
index 0000000..97189a8
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/addmul_1.asm
@@ -0,0 +1,93 @@
+dnl  S/390 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(`rp',2)
+define(`up',3)
+define(`n',4)
+define(`vlimb',5)
+define(`cylimb',7)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stm	6,7,24(15)
+	slr	cylimb,cylimb	# clear cylimb
+	ltr	vlimb,vlimb
+	jnl	.Loopp
+
+.Loopn:	l	1,0(up)		# load from u
+	lr	6,1		#
+	mr	0,vlimb		# multiply signed
+	alr	0,6		# add u limb to phi (vlimb < 0)
+	sra	6,31		# make mask
+	nr	6,vlimb		# 0 or vlimb
+	alr	0,6		# conditionally add vlimb to phi
+	alr	1,cylimb	# add carry limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	l	6,0(rp)		# load r limb
+	alr	6,1		# add u limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	lr	cylimb,0	# new cylimb
+	st	6,0(rp)		# store
+	la	up,4(,up)
+	la	rp,4(,rp)
+	brct	n,.Loopn
+
+	lr	2,cylimb
+	lm	6,7,24(15)
+	br	14
+
+.Loopp:	l	1,0(up)		# load from u
+	lr	6,1		#
+	mr	0,vlimb		# multiply signed
+	sra	6,31		# make mask
+	nr	6,vlimb		# 0 or vlimb
+	alr	0,6		# conditionally add vlimb to phi
+	alr	1,cylimb	# add carry limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	l	6,0(rp)		# load r limb
+	alr	6,1		# add u limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	lr	cylimb,0	# new cylimb
+	st	6,0(rp)		# store
+	la	up,4(,up)
+	la	rp,4(,rp)
+	brct	n,.Loopp
+
+	lr	2,cylimb
+	lm	6,7,24(15)
+	br	14
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/s390_32/copyd.asm b/third_party/gmp/mpn/s390_32/copyd.asm
new file mode 100644
index 0000000..ff252bc
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/copyd.asm
@@ -0,0 +1,145 @@
+dnl  S/390-32 mpn_copyd
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 1.65
+C z990           1.125
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C FIXME:
+C  * Avoid saving/restoring callee-saves registers for n < 3.  This could be
+C    done by setting rp=r1, up=r2, i=r0 and using r3,r4,r5 as clock regs.
+C    We could then use r3...r10 in main loop.
+
+C INPUT PARAMETERS
+define(`rp_param',	`%r2')
+define(`up_param',	`%r3')
+define(`n',		`%r4')
+
+define(`rp',	`%r8')
+define(`up',	`%r9')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	stm	%r6, %r11, 24(%r15)
+
+	lr	%r1, n
+	sll	%r1, 2
+	la	%r10, 8(n)
+	ahi	%r1, -32
+	srl	%r10, 3
+	lhi	%r11, -32
+
+	la	rp, 0(%r1,rp_param)	C FIXME use lay on z990 and later
+	la	up, 0(%r1,up_param)	C FIXME use lay on z990 and later
+
+	lhi	%r7, 7
+	nr	%r7, n			C n mod 8
+	chi	%r7, 2
+	jh	L(b34567)
+	chi	%r7, 1
+	je	L(b1)
+	jh	L(b2)
+
+L(b0):	brct	%r10, L(top)
+	j	L(end)
+
+L(b1):	l	%r0, 28(up)
+	ahi	up, -4
+	st	%r0, 28(rp)
+	ahi	rp, -4
+	brct	%r10, L(top)
+	j	L(end)
+
+L(b2):	lm	%r0, %r1, 24(up)
+	ahi	up, -8
+	stm	%r0, %r1, 24(rp)
+	ahi	rp, -8
+	brct	%r10, L(top)
+	j	L(end)
+
+L(b34567):
+	chi	%r7, 4
+	jl	L(b3)
+	je	L(b4)
+	chi	%r7, 6
+	je	L(b6)
+	jh	L(b7)
+
+L(b5):	lm	%r0, %r4, 12(up)
+	ahi	up, -20
+	stm	%r0, %r4, 12(rp)
+	ahi	rp, -20
+	brct	%r10, L(top)
+	j	L(end)
+
+L(b3):	lm	%r0, %r2, 20(up)
+	ahi	up, -12
+	stm	%r0, %r2, 20(rp)
+	ahi	rp, -12
+	brct	%r10, L(top)
+	j	L(end)
+
+L(b4):	lm	%r0, %r3, 16(up)
+	ahi	up, -16
+	stm	%r0, %r3, 16(rp)
+	ahi	rp, -16
+	brct	%r10, L(top)
+	j	L(end)
+
+L(b6):	lm	%r0, %r5, 8(up)
+	ahi	up, -24
+	stm	%r0, %r5, 8(rp)
+	ahi	rp, -24
+	brct	%r10, L(top)
+	j	L(end)
+
+L(b7):	lm	%r0, %r6, 4(up)
+	ahi	up, -28
+	stm	%r0, %r6, 4(rp)
+	ahi	rp, -28
+	brct	%r10, L(top)
+	j	L(end)
+
+L(top):	lm	%r0, %r7, 0(up)
+	la	up, 0(%r11,up)
+	stm	%r0, %r7, 0(rp)
+	la	rp, 0(%r11,rp)
+	brct	%r10, L(top)
+
+L(end):	lm	%r6, %r11, 24(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/copyi.asm b/third_party/gmp/mpn/s390_32/copyi.asm
new file mode 100644
index 0000000..1df32f1
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/copyi.asm
@@ -0,0 +1,69 @@
+dnl  S/390-32 mpn_copyi
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 0.75
+C z990           0.375
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C NOTE
+C  * This is based on GNU libc memcpy which was written by Martin Schwidefsky.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+
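+C Length trick (descriptive, not from the original comments): %r4 is set
+C to 4*n-1 bytes, the loop moves 256 bytes per MVC, and the final EX ORs
+C %r4's low byte into the length field of the one-byte MVC template,
+C moving ((4*n-1) mod 256) + 1 more bytes.  E.g. n = 70 limbs = 280 bytes:
+C one 256-byte MVC, then EX with length code 23 moves the last 24 bytes.
+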
+ASM_START()
+PROLOGUE(mpn_copyi)
+	ltr	%r4, %r4
+	sll	%r4, 2
+	je	L(rtn)
+	ahi	%r4, -1
+	lr	%r5, %r4
+	srl	%r5, 8
+	ltr	%r5, %r5		C < 256 bytes to copy?
+	je	L(1)
+
+L(top):	mvc	0(256, rp), 0(up)
+	la	rp, 256(rp)
+	la	up, 256(up)
+	brct	%r5, L(top)
+
+L(1):	bras	%r5, L(2)		C make r5 point to mvc insn
+	mvc	0(1, rp), 0(up)
+L(2):	ex	%r4, 0(%r5)		C execute mvc with length ((n-1) mod 256)+1
+L(rtn):	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/addmul_1.asm b/third_party/gmp/mpn/s390_32/esame/addmul_1.asm
new file mode 100644
index 0000000..4375b74
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/addmul_1.asm
@@ -0,0 +1,72 @@
+dnl  S/390-32 mpn_addmul_1 for systems with MLR instruction
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		18.5
+C z990		10
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`v0',	`%r5')
+
+define(`z',	`%r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stm	%r9, %r12, 36(%r15)
+	lhi	%r12, 0			C zero index register
+	ahi	%r12, 0			C clear carry flag
+	lhi	%r11, 0			C clear carry limb
+	lhi	z, 0			C zero register
+
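+C The carry flag set by ALR at the loop bottom survives the branch back
+C (L, MLR, LR, ST, LA and BRCT leave the condition code unchanged), so the
+C first ALCR of the next iteration folds it in; the AHI above primes the
+C flag to no-carry for the first pass.
+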
+L(top):	l	%r1, 0(%r12,up)
+	l	%r10, 0(%r12,rp)
+	mlr	%r0, v0
+	alcr	%r1, %r10
+	alcr	%r0, z
+	alr	%r1, %r11
+	lr	%r11, %r0
+	st	%r1, 0(%r12,rp)
+	la	%r12, 4(%r12)
+	brct	n, L(top)
+
+	lhi	%r2, 0
+	alcr	%r2, %r11
+
+	lm	%r9, %r12, 36(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/aors_n.asm b/third_party/gmp/mpn/s390_32/esame/aors_n.asm
new file mode 100644
index 0000000..98b0dbc
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/aors_n.asm
@@ -0,0 +1,137 @@
+dnl  S/390-32 mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 ?
+C z990	      2.75-3		(fast for even n, slow for odd n)
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  * Optimise for small n
+C  * Use r0 and save/restore one less register
+C  * Using logops_n's v1 inner loop operand order make the loop about 20%
+C    faster, at the expense of highly alignment-dependent performance.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`n',	`%r5')
+
+ifdef(`OPERATION_add_n', `
+  define(ADSB,		al)
+  define(ADSBCR,	alcr)
+  define(ADSBC,		alc)
+  define(RETVAL,`dnl
+	lhi	%r2, 0
+	alcr	%r2, %r2')
+  define(func,		mpn_add_n)
+  define(func_nc,	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(ADSB,		sl)
+  define(ADSBCR,	slbr)
+  define(ADSBC,		slb)
+  define(RETVAL,`dnl
+	slbr	%r2, %r2
+	lcr	%r2, %r2')
+  define(func,		mpn_sub_n)
+  define(func_nc,	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	stm	%r6, %r8, 24(%r15)
+
+	ahi	n, 3
+	lhi	%r7, 3
+	lr	%r1, n
+	srl	%r1, 2
+	nr	%r7, n			C n mod 4
+	je	L(b1)
+	chi	%r7, 2
+	jl	L(b2)
+	jne	L(b0)
+
+L(b3):	lm	%r5, %r7, 0(up)
+	la	up, 12(up)
+	ADSB	%r5, 0(vp)
+	ADSBC	%r6, 4(vp)
+	ADSBC	%r7, 8(vp)
+	la	vp, 12(vp)
+	stm	%r5, %r7, 0(rp)
+	la	rp, 12(rp)
+	brct	%r1, L(top)
+	j	L(end)
+
+L(b0):	lm	%r5, %r8, 0(up)		C These redundant insns are no mistake;
+	la	up, 16(up)		C they are needed to make the main loop
+	ADSB	%r5, 0(vp)		C run fast for n = 0 (mod 4).
+	ADSBC	%r6, 4(vp)
+	j	L(m0)
+
+L(b1):	l	%r5, 0(up)
+	la	up, 4(up)
+	ADSB	%r5, 0(vp)
+	la	vp, 4(vp)
+	st	%r5, 0(rp)
+	la	rp, 4(rp)
+	brct	%r1, L(top)
+	j	L(end)
+
+L(b2):	lm	%r5, %r6, 0(up)
+	la	up, 8(up)
+	ADSB	%r5, 0(vp)
+	ADSBC	%r6, 4(vp)
+	la	vp, 8(vp)
+	stm	%r5, %r6, 0(rp)
+	la	rp, 8(rp)
+	brct	%r1, L(top)
+	j	L(end)
+
+L(top):	lm	%r5, %r8, 0(up)
+	la	up, 16(up)
+	ADSBC	%r5, 0(vp)
+	ADSBC	%r6, 4(vp)
+L(m0):	ADSBC	%r7, 8(vp)
+	ADSBC	%r8, 12(vp)
+	la	vp, 16(vp)
+	stm	%r5, %r8, 0(rp)
+	la	rp, 16(rp)
+	brct	%r1, L(top)
+
+L(end):	RETVAL
+	lm	%r6, %r8, 24(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/aorslsh1_n.asm b/third_party/gmp/mpn/s390_32/esame/aorslsh1_n.asm
new file mode 100644
index 0000000..f2b222b
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/aorslsh1_n.asm
@@ -0,0 +1,173 @@
+dnl  S/390-32 mpn_addlsh1_n
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 9.25
+C z990		 5
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  * Optimise for small n
+C  * Compute RETVAL for sublsh1_n less stupidly
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`n',	`%r5')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADDSUBC,       alr)
+  define(ADDSUBE,       alcr)
+  define(INITCY,        `lhi	%r13, -1')
+  define(RETVAL,        `alr	%r1, %r13
+			lhi	%r2, 2
+			alr	%r2, %r1')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADDSUBC,       slr)
+  define(ADDSUBE,       slbr)
+  define(INITCY,        `lhi	%r13, 0')
+  define(RETVAL,        `slr	%r1, %r13
+			lhi	%r2, 1
+			alr	%r2, %r1')
+  define(func, mpn_sublsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	stm	%r6, %r13, 24(%r15)
+
+	la	%r0, 3(n)
+	lhi	%r7, 3
+	srl	%r0, 2
+	nr	%r7, n			C n mod 4
+	je	L(b0)
+	chi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	lm	%r5, %r7, 0(up)
+	la	up, 12(up)
+	lm	%r9, %r11, 0(vp)
+	la	vp, 12(vp)
+
+	alr	%r9, %r9
+	alcr	%r10, %r10
+	alcr	%r11, %r11
+	slbr	%r1, %r1
+
+	ADDSUBC	%r5, %r9
+	ADDSUBE	%r6, %r10
+	ADDSUBE	%r7, %r11
+	slbr	%r13, %r13
+
+	stm	%r5, %r7, 0(rp)
+	la	rp, 12(rp)
+	brct	%r0, L(top)
+	j	L(end)
+
+L(b0):	lhi	%r1, -1
+	INITCY
+	j	L(top)
+
+L(b1):	l	%r5, 0(up)
+	la	up, 4(up)
+	l	%r9, 0(vp)
+	la	vp, 4(vp)
+
+	alr	%r9, %r9
+	slbr	%r1, %r1
+	ADDSUBC	%r5, %r9
+	slbr	%r13, %r13
+
+	st	%r5, 0(rp)
+	la	rp, 4(rp)
+	brct	%r0, L(top)
+	j	L(end)
+
+L(b2):	lm	%r5, %r6, 0(up)
+	la	up, 8(up)
+	lm	%r9, %r10, 0(vp)
+	la	vp, 8(vp)
+
+	alr	%r9, %r9
+	alcr	%r10, %r10
+	slbr	%r1, %r1
+
+	ADDSUBC	%r5, %r9
+	ADDSUBE	%r6, %r10
+	slbr	%r13, %r13
+
+	stm	%r5, %r6, 0(rp)
+	la	rp, 8(rp)
+	brct	%r0, L(top)
+	j	L(end)
+
+L(top):	lm	%r9, %r12, 0(vp)
+	la	vp, 16(vp)
+
+	ahi	%r1, 1			C restore carry
+
+	alcr	%r9, %r9
+	alcr	%r10, %r10
+	alcr	%r11, %r11
+	alcr	%r12, %r12
+
+	slbr	%r1, %r1		C save carry
+
+	lm	%r5, %r8, 0(up)
+	la	up, 16(up)
+
+	ahi	%r13, 1			C restore carry
+
+	ADDSUBE	%r5, %r9
+	ADDSUBE	%r6, %r10
+	ADDSUBE	%r7, %r11
+	ADDSUBE	%r8, %r12
+
+	slbr	%r13, %r13
+
+	stm	%r5, %r8, 0(rp)
+	la	rp, 16(rp)
+	brct	%r0, L(top)
+
+L(end):
+	RETVAL
+	lm	%r6, %r13, 24(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/bdiv_dbm1c.asm b/third_party/gmp/mpn/s390_32/esame/bdiv_dbm1c.asm
new file mode 100644
index 0000000..568a2a4
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/bdiv_dbm1c.asm
@@ -0,0 +1,65 @@
+dnl  S/390-32 mpn_bdiv_dbm1c for systems with MLR instruction.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		14
+C z990		10
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`qp',	  `%r2')
+define(`up',	  `%r3')
+define(`n',	  `%r4')
+define(`bd',	  `%r5')
+define(`cy',	  `%r6')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	stm	%r6, %r7, 24(%r15)
+	lhi	%r7, 0			C zero index register
+
+L(top):	l	%r1, 0(%r7,up)
+	mlr	%r0, bd
+	slr	%r6, %r1
+	st	%r6, 0(%r7,qp)
+	slbr	%r6, %r0
+	la	%r7, 4(%r7)
+	brct	n, L(top)
+
+	lr	%r2, %r6
+	lm	%r6, %r7, 24(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/gmp-mparam.h b/third_party/gmp/mpn/s390_32/esame/gmp-mparam.h
new file mode 100644
index 0000000..c0e5046
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/gmp-mparam.h
@@ -0,0 +1,177 @@
+/* S/390-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 4400 MHz IBM z196 running in 32-bit mode */
+/* FFT tuning limit = 0.5M */
+/* Generated by tuneup.c, 2017-01-02, gcc 4.9 */
+
+#define DIVREM_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIVREM_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD             MP_SIZE_T_MAX  /* never */
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         45
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         18
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      3
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 6
+#define BMOD_1_TO_MOD_1_THRESHOLD            0  /* always */
+
+#define DIV_1_VS_MUL_1_PERCENT             320
+
+#define MUL_TOOM22_THRESHOLD                12
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               130
+#define MUL_TOOM6H_THRESHOLD               173
+#define MUL_TOOM8H_THRESHOLD               260
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      83
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      86
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     112
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                 69
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                254
+#define SQR_TOOM8_THRESHOLD                406
+
+#define MULMID_TOOM42_THRESHOLD             30
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD                7
+
+#define MUL_FFT_MODF_THRESHOLD             276  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    276, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     13, 7}, {      7, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 8}, {     15, 7}, {     31, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 9}, {     15, 8}, \
+    {     39, 9}, {     23,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47,10}, \
+    {     31, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 7}, {    511, 9}, {    143,10}, {     79, 9}, \
+    {    159, 8}, {    319, 9}, {    175, 8}, {    351,10}, \
+    {     95, 9}, {    191, 8}, {    383,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543, 8}, {   1087,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    351, 9}, \
+    {    703, 8}, {   1407,11}, {    191,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    479, 9}, {    959, 8}, \
+    {   1919,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD                 2688
+
+#define SQR_FFT_MODF_THRESHOLD             240  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    240, 5}, {     17, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 9}, {     15, 8}, {     39, 9}, {     23,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     47,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     71, 8}, \
+    {    143,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255, 7}, {    511, 9}, {    143,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    175, 8}, \
+    {    351, 7}, {    703,10}, {     95, 9}, {    191, 8}, \
+    {    383, 9}, {    207, 8}, {    415,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319,10}, \
+    {    175, 9}, {    351, 8}, {    703, 7}, {   1407,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207, 9}, \
+    {    415,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    351, 9}, {    703, 8}, {   1407,11}, {    191,10}, \
+    {    415, 9}, {    831,11}, {    223,10}, {    479,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 84
+#define SQR_FFT_THRESHOLD                 1856
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  27
+#define MULLO_MUL_N_THRESHOLD             5240
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  65
+#define SQRLO_SQR_THRESHOLD               3470
+
+#define DC_DIV_QR_THRESHOLD                 32
+#define DC_DIVAPPR_Q_THRESHOLD             135
+#define DC_BDIV_QR_THRESHOLD                32
+#define DC_BDIV_Q_THRESHOLD                 80
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               177
+#define INV_APPR_THRESHOLD                 139
+
+#define BINV_NEWTON_THRESHOLD              179
+#define REDC_1_TO_REDC_N_THRESHOLD          39
+
+#define MU_DIV_QR_THRESHOLD                872
+#define MU_DIVAPPR_Q_THRESHOLD             998
+#define MUPI_DIV_QR_THRESHOLD               66
+#define MU_BDIV_QR_THRESHOLD               748
+#define MU_BDIV_Q_THRESHOLD                906
+
+#define POWM_SEC_TABLE  9,34,257,946,2913
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD              1045
+#define SET_STR_PRECOMPUTE_THRESHOLD      1800
+
+#define FAC_DSC_THRESHOLD                   77
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     121
+#define HGCD_APPR_THRESHOLD                142
+#define HGCD_REDUCE_THRESHOLD             1679
+#define GCD_DC_THRESHOLD                   389
+#define GCDEXT_DC_THRESHOLD                285
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/s390_32/esame/mul_1.asm b/third_party/gmp/mpn/s390_32/esame/mul_1.asm
new file mode 100644
index 0000000..04be963
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/mul_1.asm
@@ -0,0 +1,66 @@
+dnl  S/390-32 mpn_mul_1 for systems with MLR instruction
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		14
+C z990		 9
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`v0',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stm	%r11, %r12, 44(%r15)
+	lhi	%r12, 0			C zero index register
+	ahi	%r12, 0			C clear carry flag
+	lhi	%r11, 0			C clear carry limb
+
+L(top):	l	%r1, 0(%r12,up)
+	mlr	%r0, v0
+	alcr	%r1, %r11
+	lr	%r11, %r0		C copy high part to carry limb
+	st	%r1, 0(%r12,rp)
+	la	%r12, 4(%r12)
+	brct	n, L(top)
+
+	lhi	%r2, 0
+	alcr	%r2, %r11
+
+	lm	%r11, %r12, 44(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/mul_basecase.asm b/third_party/gmp/mpn/s390_32/esame/mul_basecase.asm
new file mode 100644
index 0000000..2c8138d
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/mul_basecase.asm
@@ -0,0 +1,130 @@
+dnl  S/390-32/esame mpn_mul_basecase.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 ?
+C z990		 ?
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  * Perhaps add special case for un <= 2.
+C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
+C    up by about 10%.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`un',	`%r4')
+define(`vp',	`%r5')
+define(`vn',	`%r6')
+
+define(`zero',	`%r8')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+	chi	un, 2
+	jhe	L(ge2)
+
+C un = vn = 1
+	l	%r1, 0(vp)
+	ml	%r0, 0(up)
+	st	%r1, 0(rp)
+	st	%r0, 4(rp)
+	br	%r14
+
+L(ge2):	C jne	L(gen)
+
+
+L(gen):
+C mul_1 =======================================================================
+
+	stm	%r6, %r12, 24(%r15)
+	lhi	zero, 0
+	ahi	un, -1
+
+	l	%r7, 0(vp)
+	l	%r11, 0(up)
+	lhi	%r12, 4			C init index register
+	mlr	%r10, %r7
+	lr	%r9, un
+	st	%r11, 0(rp)
+	cr	%r15, %r15		C clear carry flag
+
+L(tm):	l	%r1, 0(%r12,up)
+	mlr	%r0, %r7
+	alcr	%r1, %r10
+	lr	%r10, %r0		C copy high part to carry limb
+	st	%r1, 0(%r12,rp)
+	la	%r12, 4(%r12)
+	brct	%r9, L(tm)
+
+	alcr	%r0, zero
+	st	%r0, 0(%r12,rp)
+
+C addmul_1 loop ===============================================================
+
+	ahi	vn, -1
+	je	L(outer_end)
+L(outer_loop):
+
+	la	rp, 4(rp)		C rp += 1
+	la	vp, 4(vp)		C vp += 1
+	l	%r7, 0(vp)
+	l	%r11, 0(up)
+	lhi	%r12, 4			C init index register
+	mlr	%r10, %r7
+	lr	%r9, un
+	al	%r11, 0(rp)
+	st	%r11, 0(rp)
+
+L(tam):	l	%r1, 0(%r12,up)
+	l	%r11, 0(%r12,rp)
+	mlr	%r0, %r7
+	alcr	%r1, %r11
+	alcr	%r0, zero
+	alr	%r1, %r10
+	lr	%r10, %r0
+	st	%r1, 0(%r12,rp)
+	la	%r12, 4(%r12)
+	brct	%r9, L(tam)
+
+	alcr	%r0, zero
+	st	%r0, 0(%r12,rp)
+
+	brct	vn, L(outer_loop)
+L(outer_end):
+
+	lm	%r6, %r12, 24(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/sqr_basecase.asm b/third_party/gmp/mpn/s390_32/esame/sqr_basecase.asm
new file mode 100644
index 0000000..f45f87a
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/sqr_basecase.asm
@@ -0,0 +1,203 @@
+dnl  S/390-32 mpn_sqr_basecase.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 ?
+C z990		23
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  * Clean up.
+C  * Stop iterating the addmul_1 loop at n = 2 at the latest; implement a
+C    longer tail.  This will call for basecase handling of n = 3.
+C  * Update counters and pointers more straightforwardly, possibly lowering
+C    register usage.
+C  * Should we use this allocation-free style for more sqr_basecase asm
+C    implementations?  The only disadvantage is that it requires R != U.
+C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
+C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
+C    more.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+
+define(`zero',	`%r8')
+define(`rp_saved',	`%r9')
+define(`up_saved',	`%r13')
+define(`n_saved',	`%r14')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	ahi	n, -2
+	jhe	L(ge2)
+
+C n = 1
+	l	%r5, 0(up)
+	mlr	%r4, %r5
+	st	%r5, 0(rp)
+	st	%r4, 4(rp)
+	br	%r14
+
+L(ge2):	jne	L(gen)
+
+C n = 2
+	stm	%r6, %r8, 24(%r15)
+	lhi	zero, 0
+
+	l	%r5, 0(up)
+	mlr	%r4, %r5		C u0 * u0
+	l	%r1, 4(up)
+	mlr	%r0, %r1		C u1 * u1
+	st	%r5, 0(rp)
+
+	l	%r7, 0(up)
+	ml	%r6, 4(up)		C u0 * u1
+	alr	%r7, %r7
+	alcr	%r6, %r6
+	alcr	%r0, zero
+
+	alr	%r4, %r7
+	alcr	%r1, %r6
+	alcr	%r0, zero
+	st	%r4, 4(rp)
+	st	%r1, 8(rp)
+	st	%r0, 12(rp)
+
+	lm	%r6, %r8, 24(%r15)
+	br	%r14
+
+L(gen):
+C mul_1 =======================================================================
+
+	stm	%r6, %r14, 24(%r15)
+	lhi	zero, 0
+	lr	up_saved, up
+	lr	rp_saved, rp
+	lr	n_saved, n
+
+	l	%r6, 0(up)
+	l	%r11, 4(up)
+	lhi	%r12, 8		C init index register
+	mlr	%r10, %r6
+	lr	%r5, n
+	st	%r11, 4(rp)
+	cr	%r15, %r15		C clear carry flag
+
+L(tm):	l	%r1, 0(%r12,up)
+	mlr	%r0, %r6
+	alcr	%r1, %r10
+	lr	%r10, %r0		C copy high part to carry limb
+	st	%r1, 0(%r12,rp)
+	la	%r12, 4(%r12)
+	brct	%r5, L(tm)
+
+	alcr	%r0, zero
+	st	%r0, 0(%r12,rp)
+
+C addmul_1 loop ===============================================================
+
+	ahi	n, -1
+	je	L(outer_end)
+L(outer_loop):
+
+	la	rp, 8(rp)		C rp += 2
+	la	up, 4(up)		C up += 1
+	l	%r6, 0(up)
+	l	%r11, 4(up)
+	lhi	%r12, 8		C init index register
+	mlr	%r10, %r6
+	lr	%r5, n
+	al	%r11, 4(rp)
+	st	%r11, 4(rp)
+
+L(tam):	l	%r1, 0(%r12,up)
+	l	%r7, 0(%r12,rp)
+	mlr	%r0, %r6
+	alcr	%r1, %r7
+	alcr	%r0, zero
+	alr	%r1, %r10
+	lr	%r10, %r0
+	st	%r1, 0(%r12,rp)
+	la	%r12, 4(%r12)
+	brct	%r5, L(tam)
+
+	alcr	%r0, zero
+	st	%r0, 0(%r12,rp)
+
+	brct	n, L(outer_loop)
+L(outer_end):
+
+	l	%r6, 4(up)
+	l	%r1, 8(up)
+	lr	%r7, %r0		C Same as: l %r7, 12(,rp)
+	mlr	%r0, %r6
+	alr	%r1, %r7
+	alcr	%r0, zero
+	st	%r1, 12(rp)
+	st	%r0, 16(rp)
+
+C sqr_diag_addlsh1 ============================================================
+
+define(`up', `up_saved')
+define(`rp', `rp_saved')
+	la	n, 1(n_saved)
+
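+C Descriptive note (not from the original): each iteration doubles two
+C limbs of the accumulated cross products (the ALCR x,x pairs shift left
+C through the carry), forms the next square with MLR, and adds in the
+C square's low limb plus the carry limb saved from the previous square.
+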
+	l	%r1, 0(up)
+	mlr	%r0, %r1
+	st	%r1, 0(rp)
+C	clr	%r15, %r15		C clear carry (already clear per above)
+
+L(top):	l	%r11, 4(up)
+	la	up, 4(up)
+	l	%r6, 4(rp)
+	l	%r7, 8(rp)
+	mlr	%r10, %r11
+	alcr	%r6, %r6
+	alcr	%r7, %r7
+	alcr	%r10, zero		C propagate carry to high product limb
+	alr	%r6, %r0
+	alcr	%r7, %r11
+	stm	%r6, %r7, 4(rp)
+	la	rp, 8(rp)
+	lr	%r0, %r10		C copy carry limb
+	brct	n, L(top)
+
+	alcr	%r0, zero
+	st	%r0, 4(rp)
+
+	lm	%r6, %r14, 24(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/esame/submul_1.asm b/third_party/gmp/mpn/s390_32/esame/submul_1.asm
new file mode 100644
index 0000000..a71e57e
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/esame/submul_1.asm
@@ -0,0 +1,70 @@
+dnl  S/390-32 mpn_submul_1 for systems with MLR instruction.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		20
+C z990		11
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`v0',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	stm	%r9, %r12, 36(%r15)
+	lhi	%r12, 0
+	slr	%r11, %r11
+
+L(top):	l	%r1, 0(%r12, up)
+	l	%r10, 0(%r12, rp)
+	mlr	%r0, v0
+	slbr	%r10, %r1
+	slbr	%r9, %r9
+	slr	%r0, %r9		C conditional incr
+	slr	%r10, %r11
+	lr	%r11, %r0
+	st	%r10, 0(%r12, rp)
+	la	%r12, 4(%r12)
+	brct	%r4,  L(top)
+
+	lr	%r2, %r11
+	slbr	%r9, %r9
+	slr	%r2, %r9
+
+	lm	%r9, %r12, 36(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_32/gmp-mparam.h b/third_party/gmp/mpn/s390_32/gmp-mparam.h
new file mode 100644
index 0000000..1aca74a
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/gmp-mparam.h
@@ -0,0 +1,138 @@
+/* S/390-32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 770 MHz IBM z900 running in 32-bit mode, using just traditional insns */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            5
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         15
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        30
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                19
+#define MUL_TOOM33_THRESHOLD               114
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               226
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     106
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+
+#define SQR_BASECASE_THRESHOLD               7
+#define SQR_TOOM2_THRESHOLD                 40
+#define SQR_TOOM3_THRESHOLD                126
+#define SQR_TOOM4_THRESHOLD                192
+#define SQR_TOOM6_THRESHOLD                246
+#define SQR_TOOM8_THRESHOLD                357
+
+#define MULMID_TOOM42_THRESHOLD             28
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    244, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {      8, 5}, {     17, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     19, 8}, \
+    {     11, 7}, {     25, 9}, {      7, 8}, {     15, 7}, \
+    {     33, 8}, {     19, 7}, {     39, 8}, {     23, 7}, \
+    {     47, 8}, {     27, 9}, {     15, 8}, {     39, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     39, 8}, {     79, 9}, {     47,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     71, 8}, \
+    {    143, 9}, {     79,10}, {     47,11}, {   2048,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 48
+#define MUL_FFT_THRESHOLD                 2688
+
+#define SQR_FFT_MODF_THRESHOLD             216  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    216, 5}, {      7, 4}, {     15, 5}, {     17, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     20, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 9}, \
+    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
+    {     39, 8}, {     23, 9}, {     15, 8}, {     39, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     39, 8}, {     79, 9}, {     47,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     71, 8}, \
+    {    143, 9}, {     79,10}, {     47,11}, {   2048,12}, \
+    {   4096,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 44
+#define SQR_FFT_THRESHOLD                 1856
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  61
+#define MULLO_MUL_N_THRESHOLD             5240
+
+#define DC_DIV_QR_THRESHOLD                 70
+#define DC_DIVAPPR_Q_THRESHOLD             234
+#define DC_BDIV_QR_THRESHOLD                59
+#define DC_BDIV_Q_THRESHOLD                137
+
+#define INV_MULMOD_BNM1_THRESHOLD           36
+#define INV_NEWTON_THRESHOLD               327
+#define INV_APPR_THRESHOLD                 268
+
+#define BINV_NEWTON_THRESHOLD              324
+#define REDC_1_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1099
+#define MU_DIVAPPR_Q_THRESHOLD            1360
+#define MUPI_DIV_QR_THRESHOLD              138
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD               1234
+
+#define MATRIX22_STRASSEN_THRESHOLD         18
+#define HGCD_THRESHOLD                     167
+#define GCD_DC_THRESHOLD                   518
+#define GCDEXT_DC_THRESHOLD                378
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               577
+#define SET_STR_PRECOMPUTE_THRESHOLD      1217
diff --git a/third_party/gmp/mpn/s390_32/logops_n.asm b/third_party/gmp/mpn/s390_32/logops_n.asm
new file mode 100644
index 0000000..1f2cd2a
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/logops_n.asm
@@ -0,0 +1,295 @@
+dnl  S/390-32 logops.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb     variant 1           variant 2       variant 3
+C	        rp!=up  rp=up
+C z900		 ?	 ?		 ?		 ?
+C z990		 2.5	 1		 2.75		 2.75
+C z9		 ?			 ?		 ?
+C z10		 ?			 ?		 ?
+C z196		 ?			 ?		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`nn',	`%r5')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',`mpn_and_n')
+  define(`VARIANT_1')
+  define(`LOGOPC',`nc')
+  define(`LOGOP',`n')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',`mpn_andn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`n')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',`mpn_nand_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`n')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',`mpn_ior_n')
+  define(`VARIANT_1')
+  define(`LOGOPC',`oc')
+  define(`LOGOP',`o')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',`mpn_iorn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`o')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',`mpn_nior_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`o')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',`mpn_xor_n')
+  define(`VARIANT_1')
+  define(`LOGOPC',`xc')
+  define(`LOGOP',`x')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',`mpn_xnor_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`x')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+ifdef(`VARIANT_1',`
+	cr	rp, up
+	jne	L(normal)
+
+	sll	nn, 2
+	ahi	nn, -1
+	lr	%r1, nn
+	srl	%r1, 8
+	ltr	%r1, %r1		C < 256 bytes to copy?
+	je	L(1)
+
+L(tp):	LOGOPC	0(256, rp), 0(vp)
+	la	rp, 256(rp)
+	la	vp, 256(vp)
+	brct	%r1, L(tp)
+
+L(1):	bras	%r1, L(2)		C make r1 point to the logop insn
+	LOGOPC	0(1, rp), 0(vp)
+L(2):	ex	nn, 0(%r1)		C execute logop with length ((nn-1) mod 256)+1
+L(rtn):	br	%r14
+
+
+L(normal):
+	stm	%r6, %r8, 12(%r15)
+	ahi	nn, 3
+	lhi	%r7, 3
+	lr	%r0, nn
+	srl	%r0, 2
+	nr	%r7, nn			C nn mod 4
+	je	L(b1)
+	chi	%r7, 2
+	jl	L(b2)
+	jne	L(top)
+
+L(b3):	lm	%r5, %r7, 0(up)
+	la	up, 12(up)
+	LOGOP	%r5, 0(vp)
+	LOGOP	%r6, 4(vp)
+	LOGOP	%r7, 8(vp)
+	stm	%r5, %r7, 0(rp)
+	la	rp, 12(rp)
+	la	vp, 12(vp)
+	j	L(mid)
+
+L(b1):	l	%r5, 0(up)
+	la	up, 4(up)
+	LOGOP	%r5, 0(vp)
+	st	%r5, 0(rp)
+	la	rp, 4(rp)
+	la	vp, 4(vp)
+	j	L(mid)
+
+L(b2):	lm	%r5, %r6, 0(up)
+	la	up, 8(up)
+	LOGOP	%r5, 0(vp)
+	LOGOP	%r6, 4(vp)
+	stm	%r5, %r6, 0(rp)
+	la	rp, 8(rp)
+	la	vp, 8(vp)
+	j	L(mid)
+
+L(top):	lm	%r5, %r8, 0(up)
+	la	up, 16(up)
+	LOGOP	%r5, 0(vp)
+	LOGOP	%r6, 4(vp)
+	LOGOP	%r7, 8(vp)
+	LOGOP	%r8, 12(vp)
+	stm	%r5, %r8, 0(rp)
+	la	rp, 16(rp)
+	la	vp, 16(vp)
+L(mid):	brct	%r0, L(top)
+
+	lm	%r6, %r8, 12(%r15)
+	br	%r14
+')
+
+ifdef(`VARIANT_2',`
+	stm	%r6, %r8, 12(%r15)
+	lhi	%r1, -1
+
+	ahi	nn, 3
+	lhi	%r7, 3
+	lr	%r0, nn
+	srl	%r0, 2
+	nr	%r7, nn			C nn mod 4
+	je	L(b1)
+	chi	%r7, 2
+	jl	L(b2)
+	jne	L(top)
+
+L(b3):	lm	%r5, %r7, 0(vp)
+	la	vp, 12(vp)
+	xr	%r5, %r1
+	xr	%r6, %r1
+	xr	%r7, %r1
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 4(up)
+	LOGOP	%r7, 8(up)
+	stm	%r5, %r7, 0(rp)
+	la	rp, 12(rp)
+	la	up, 12(up)
+	j	L(mid)
+
+L(b1):	l	%r5, 0(vp)
+	la	vp, 4(vp)
+	xr	%r5, %r1
+	LOGOP	%r5, 0(up)
+	st	%r5, 0(rp)
+	la	rp, 4(rp)
+	la	up, 4(up)
+	j	L(mid)
+
+L(b2):	lm	%r5, %r6, 0(vp)
+	la	vp, 8(vp)
+	xr	%r5, %r1
+	xr	%r6, %r1
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 4(up)
+	stm	%r5, %r6, 0(rp)
+	la	rp, 8(rp)
+	la	up, 8(up)
+	j	L(mid)
+
+L(top):	lm	%r5, %r8, 0(vp)
+	la	vp, 16(vp)
+	xr	%r5, %r1
+	xr	%r6, %r1
+	xr	%r7, %r1
+	xr	%r8, %r1
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 4(up)
+	LOGOP	%r7, 8(up)
+	LOGOP	%r8, 12(up)
+	la	up, 16(up)
+	stm	%r5, %r8, 0(rp)
+	la	rp, 16(rp)
+L(mid):	brct	%r0, L(top)
+
+	lm	%r6, %r8, 12(%r15)
+	br	%r14
+')
+
+ifdef(`VARIANT_3',`
+	stm	%r6, %r8, 12(%r15)
+	lhi	%r1, -1
+
+	ahi	nn, 3
+	lhi	%r7, 3
+	lr	%r0, nn
+	srl	%r0, 2
+	nr	%r7, nn			C nn mod 4
+	je	L(b1)
+	chi	%r7, 2
+	jl	L(b2)
+	jne	L(top)
+
+L(b3):	lm	%r5, %r7, 0(vp)
+	la	vp, 12(vp)
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 4(up)
+	xr	%r5, %r1
+	xr	%r6, %r1
+	LOGOP	%r7, 8(up)
+	xr	%r7, %r1
+	stm	%r5, %r7, 0(rp)
+	la	rp, 12(rp)
+	la	up, 12(up)
+	j	L(mid)
+
+L(b1):	l	%r5, 0(vp)
+	la	vp, 4(vp)
+	LOGOP	%r5, 0(up)
+	xr	%r5, %r1
+	st	%r5, 0(rp)
+	la	rp, 4(rp)
+	la	up, 4(up)
+	j	L(mid)
+
+L(b2):	lm	%r5, %r6, 0(vp)
+	la	vp, 8(vp)
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 4(up)
+	xr	%r5, %r1
+	xr	%r6, %r1
+	stm	%r5, %r6, 0(rp)
+	la	rp, 8(rp)
+	la	up, 8(up)
+	j	L(mid)
+
+L(top):	lm	%r5, %r8, 0(vp)
+	la	vp, 16(vp)
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 4(up)
+	xr	%r5, %r1
+	xr	%r6, %r1
+	LOGOP	%r7, 8(up)
+	LOGOP	%r8, 12(up)
+	xr	%r7, %r1
+	xr	%r8, %r1
+	stm	%r5, %r8, 0(rp)
+	la	up, 16(up)
+	la	rp, 16(rp)
+L(mid):	brct	%r0, L(top)
+
+	lm	%r6, %r8, 12(%r15)
+	br	%r14
+')
+
+EPILOGUE()
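In C terms the three variants compute the following (a reference sketch of the documented mpn logop semantics, with 32-bit limbs to match this s390_32 file; the ref_* names are illustrative, not GMP's):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb;                /* stands in for mp_limb_t here */

/* Variant 1 (and_n, ior_n, xor_n): plain limb-wise logical op. */
void ref_and_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = up[i] & vp[i];
}

/* Variant 2 (andn_n, iorn_n, xnor_n): complement the vp operand first,
   which the asm does with the xr-against-minus-one instructions. */
void ref_andn_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = up[i] & ~vp[i];
}

/* Variant 3 (nand_n, nior_n): complement the combined result. */
void ref_nand_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = ~(up[i] & vp[i]);
}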
diff --git a/third_party/gmp/mpn/s390_32/lshift.asm b/third_party/gmp/mpn/s390_32/lshift.asm
new file mode 100644
index 0000000..da7d76e
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/lshift.asm
@@ -0,0 +1,144 @@
+dnl  S/390-32 mpn_lshift.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 6
+C z990	         3
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  *
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	lr	%r1, n
+	sll	%r1, 2
+	stm	%r6, %r12, 24(%r15)
+	la	up, 0(%r1,up)		C put up near end of U
+	la	rp, 0(%r1,rp)		C put rp near end of R
+	ahi	up, -20
+	ahi	rp, -16
+	lhi	%r8, 32
+	sr	%r8, cnt
+	l	%r12, 16(up)
+	srl	%r12, 0(%r8)		C return value
+	lhi	%r7, 3
+	nr	%r7, n
+	srl	n, 2
+	je	L(b0)
+	chi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	l	%r10, 16(up)
+	l	%r11, 12(up)
+	l	%r9,   8(up)
+	ahi	up, -8
+	lr	%r8, %r11
+	sldl	%r10, 0(cnt)
+	sldl	%r8,  0(cnt)
+	st	%r10, 12(rp)
+	st	%r8,   8(rp)
+	ahi	rp, -8
+	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b2):	l	%r10, 16(up)
+	l	%r11, 12(up)
+	ahi	up, -4
+	sldl	%r10, 0(cnt)
+	st	%r10, 12(rp)
+	ahi	rp, -4
+	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b1):	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b0):	l	%r10,16(up)
+	l	%r8, 12(up)
+	l	%r6,  8(up)
+	l	%r0,  4(up)
+	ahi	up, -12
+	lr	%r11, %r8
+	lr	%r9,  %r6
+	lr	%r7,  %r0
+	sldl	%r10,0(cnt)
+	sldl	%r8, 0(cnt)
+	sldl	%r6, 0(cnt)
+	st	%r10, 12(rp)
+	st	%r8,   8(rp)
+	st	%r6,   4(rp)
+	ahi	rp, -12
+	ahi	n, -1
+	je	L(end)
+
+	ALIGN(8)
+L(top):	l	%r10, 16(up)
+	l	%r8,  12(up)
+	l	%r6,   8(up)
+	l	%r0,   4(up)
+	l	%r1,   0(up)
+	lr	%r11, %r8
+	lr	%r9,  %r6
+	lr	%r7,  %r0
+	ahi	up, -16
+	sldl	%r10, 0(cnt)
+	sldl	%r8,  0(cnt)
+	sldl	%r6,  0(cnt)
+	sldl	%r0,  0(cnt)
+	st	%r10, 12(rp)
+	st	%r8,   8(rp)
+	st	%r6,   4(rp)
+	st	%r0,   0(rp)
+	ahi	rp, -16
+	brct	n, L(top)
+
+L(end):	l	%r10, 16(up)
+	sll	%r10, 0(cnt)
+	st	%r10, 12(rp)
+
+	lr	%r2, %r12
+	lm	%r6, %r12, 24(%r15)
+	br	%r14
+EPILOGUE()
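For reference, the operation this file implements (a sketch mirroring mpn/generic/lshift.c, with 32-bit limbs and the precondition 1 <= cnt <= 31):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb;

/* Shift {up,n} left by cnt bits, store the low n limbs at rp, and return
   the bits shifted out at the top (what the asm keeps in %r12).  Working
   from the most significant limb down allows overlap with rp >= up. */
limb ref_lshift (limb *rp, const limb *up, size_t n, unsigned cnt)
{
  limb retval = up[n - 1] >> (32 - cnt);
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> (32 - cnt));
  rp[0] = up[0] << cnt;
  return retval;
}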
diff --git a/third_party/gmp/mpn/s390_32/lshiftc.asm b/third_party/gmp/mpn/s390_32/lshiftc.asm
new file mode 100644
index 0000000..f601673
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/lshiftc.asm
@@ -0,0 +1,156 @@
+dnl  S/390-32 mpn_lshiftc.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 7
+C z990	         3.375
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  *
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	lr	%r1, n
+	sll	%r1, 2
+	stm	%r6, %r13, 24(%r15)
+	la	up, 0(%r1,up)		C put up near end of U
+	la	rp, 0(%r1,rp)		C put rp near end of R
+	ahi	up, -20
+	ahi	rp, -16
+	lhi	%r8, 32
+	sr	%r8, cnt
+	l	%r12, 16(up)
+	srl	%r12, 0(%r8)		C return value
+	lhi	%r13, -1
+	lhi	%r7, 3
+	nr	%r7, n
+	srl	n, 2
+	je	L(b0)
+	chi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	l	%r10, 16(up)
+	l	%r11, 12(up)
+	l	%r9,   8(up)
+	ahi	up, -8
+	lr	%r8, %r11
+	sldl	%r10, 0(cnt)
+	sldl	%r8,  0(cnt)
+	xr	%r10, %r13
+	xr	%r8, %r13
+	st	%r10, 12(rp)
+	st	%r8,   8(rp)
+	ahi	rp, -8
+	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b2):	l	%r10, 16(up)
+	l	%r11, 12(up)
+	ahi	up, -4
+	sldl	%r10, 0(cnt)
+	xr	%r10, %r13
+	st	%r10, 12(rp)
+	ahi	rp, -4
+	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b1):	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b0):	l	%r10,16(up)
+	l	%r8, 12(up)
+	l	%r6,  8(up)
+	l	%r0,  4(up)
+	ahi	up, -12
+	lr	%r11, %r8
+	lr	%r9,  %r6
+	lr	%r7,  %r0
+	sldl	%r10,0(cnt)
+	sldl	%r8, 0(cnt)
+	sldl	%r6, 0(cnt)
+	xr	%r10, %r13
+	xr	%r8, %r13
+	xr	%r6, %r13
+	st	%r10, 12(rp)
+	st	%r8,   8(rp)
+	st	%r6,   4(rp)
+	ahi	rp, -12
+	ahi	n, -1
+	je	L(end)
+
+	ALIGN(8)
+L(top):	l	%r10, 16(up)
+	l	%r8,  12(up)
+	l	%r6,   8(up)
+	l	%r0,   4(up)
+	l	%r1,   0(up)
+	lr	%r11, %r8
+	lr	%r9,  %r6
+	lr	%r7,  %r0
+	ahi	up, -16
+	sldl	%r10, 0(cnt)
+	sldl	%r8,  0(cnt)
+	sldl	%r6,  0(cnt)
+	sldl	%r0,  0(cnt)
+	xr	%r10, %r13
+	xr	%r8, %r13
+	xr	%r6, %r13
+	xr	%r0, %r13
+	st	%r10, 12(rp)
+	st	%r8,   8(rp)
+	st	%r6,   4(rp)
+	st	%r0,   0(rp)
+	ahi	rp, -16
+	brct	n, L(top)
+
+L(end):	l	%r10, 16(up)
+	sll	%r10, 0(cnt)
+	xr	%r10, %r13
+	st	%r10, 12(rp)
+
+	lr	%r2, %r12
+	lm	%r6, %r13, 24(%r15)
+	br	%r14
+EPILOGUE()
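mpn_lshiftc differs from mpn_lshift only in storing the one's complement of each shifted limb (the xr against %r13 = -1 above); the returned out-shifted bits are not complemented.  A sketch under the same conventions as the lshift reference:

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb;

limb ref_lshiftc (limb *rp, const limb *up, size_t n, unsigned cnt)
{
  limb retval = up[n - 1] >> (32 - cnt);   /* not complemented */
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (32 - cnt)));
  rp[0] = ~(up[0] << cnt);
  return retval;
}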
diff --git a/third_party/gmp/mpn/s390_32/mul_1.asm b/third_party/gmp/mpn/s390_32/mul_1.asm
new file mode 100644
index 0000000..e3ad0c5
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/mul_1.asm
@@ -0,0 +1,85 @@
+dnl  S/390 mpn_mul_1 -- Multiply a limb vector with a limb and store the
+dnl  result in a second limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(`rp',2)
+define(`up',3)
+define(`n',4)
+define(`vlimb',5)
+define(`cylimb',7)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stm	6,7,24(15)
+	slr	cylimb,cylimb	# clear cylimb
+	ltr	vlimb,vlimb
+	jnl	.Loopp
+
+.Loopn:	l	1,0(up)		# load from u
+	lr	6,1		#
+	mr	0,vlimb		# multiply signed
+	alr	0,6		# add vlimb to phi
+	sra	6,31		# make mask
+	nr	6,vlimb		# 0 or vlimb
+	alr	0,6		# conditionally add vlimb to phi
+	alr	1,cylimb	# add carry limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	lr	cylimb,0	# new cylimb
+	st	1,0(rp)		# store
+	la	up,4(,up)
+	la	rp,4(,rp)
+	brct	n,.Loopn
+
+	lr	2,cylimb
+	lm	6,7,24(15)
+	br	14
+
+.Loopp:	l	1,0(up)		# load from u
+	lr	6,1		#
+	mr	0,vlimb		# multiply signed
+	sra	6,31		# make mask
+	nr	6,vlimb		# 0 or vlimb
+	alr	0,6		# conditionally add vlimb to phi
+	alr	1,cylimb	# add carry limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	lr	cylimb,0	# new cylimb
+	st	1,0(rp)		# store
+	la	up,4(,up)
+	la	rp,4(,rp)
+	brct	n,.Loopp
+
+	lr	2,cylimb
+	lm	6,7,24(15)
+	br	14
+EPILOGUE(mpn_mul_1)
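The fix-up dance around MR exists because MR multiplies signed: with s(x) = x - 2^32*[x >= 2^31], one has u*v = s(u)*s(v) + 2^32*(v*[u >= 2^31] + u*[v >= 2^31]) (mod 2^64), and the mask-and-add instructions supply the two correction terms.  The net effect is the usual mul_1 semantics (a sketch mirroring mpn/generic/mul_1.c, 32-bit limbs):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb;

/* {rp,n} = {up,n} * v; returns the carry-out limb. */
limb ref_mul_1 (limb *rp, const limb *up, size_t n, limb v)
{
  limb cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) up[i] * v + cy;   /* cannot overflow 64 bits */
      rp[i] = (limb) p;
      cy = (limb) (p >> 32);
    }
  return cy;
}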
diff --git a/third_party/gmp/mpn/s390_32/rshift.asm b/third_party/gmp/mpn/s390_32/rshift.asm
new file mode 100644
index 0000000..5f2cf37
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/rshift.asm
@@ -0,0 +1,138 @@
+dnl  S/390-32 mpn_rshift.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 6
+C z990	         3
+C z9		 ?
+C z10		 ?
+C z196		 ?
+
+C TODO
+C  *
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	stm	%r6, %r12, 24(%r15)
+	lhi	%r8, 32
+	sr	%r8, cnt
+	l	%r12, 0(up)
+	sll	%r12, 0(%r8)		C return value
+	lhi	%r7, 3
+	nr	%r7, n
+	srl	n, 2
+	je	L(b0)
+	chi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	l	%r11, 0(up)
+	l	%r10, 4(up)
+	l	%r8,  8(up)
+	ahi	up, 8
+	lr	%r9, %r10
+	srdl	%r10, 0(cnt)
+	srdl	%r8,  0(cnt)
+	st	%r11, 0(rp)
+	st	%r9,  4(rp)
+	ahi	rp, 8
+	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b2):	l	%r11, 0(up)
+	l	%r10, 4(up)
+	ahi	up, 4
+	srdl	%r10, 0(cnt)
+	st	%r11, 0(rp)
+	ahi	rp, 4
+	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b1):	ltr	n, n
+	je	L(end)
+	j	L(top)
+
+L(b0):	l	%r11, 0(up)
+	l	%r9,  4(up)
+	l	%r7,  8(up)
+	l	%r1, 12(up)
+	ahi	up, 12
+	lr	%r10, %r9
+	lr	%r8,  %r7
+	lr	%r6,  %r1
+	srdl	%r10, 0(cnt)
+	srdl	%r8,  0(cnt)
+	srdl	%r6,  0(cnt)
+	st	%r11, 0(rp)
+	st	%r9,  4(rp)
+	st	%r7,  8(rp)
+	ahi	rp, 12
+	ahi	n, -1
+	je	L(end)
+
+	ALIGN(8)
+L(top):	l	%r11, 0(up)
+	l	%r9,  4(up)
+	l	%r7,  8(up)
+	l	%r1, 12(up)
+	l	%r0, 16(up)
+	lr	%r10, %r9
+	lr	%r8,  %r7
+	lr	%r6,  %r1
+	ahi	up, 16
+	srdl	%r10, 0(cnt)
+	srdl	%r8,  0(cnt)
+	srdl	%r6,  0(cnt)
+	srdl	%r0,  0(cnt)
+	st	%r11, 0(rp)
+	st	%r9,  4(rp)
+	st	%r7,  8(rp)
+	st	%r1, 12(rp)
+	ahi	rp, 16
+	brct	n, L(top)
+
+L(end):	l	%r11, 0(up)
+	srl	%r11, 0(cnt)
+	st	%r11, 0(rp)
+
+	lr	%r2, %r12
+	lm	%r6, %r12, 24(%r15)
+	br	%r14
+EPILOGUE()
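The corresponding reference (low-to-high traversal, so overlap with rp <= up is fine; precondition 1 <= cnt <= 31; the return value holds the out-shifted bits, left-justified, as computed into %r12 above):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb;

limb ref_rshift (limb *rp, const limb *up, size_t n, unsigned cnt)
{
  limb retval = up[0] << (32 - cnt);
  for (size_t i = 0; i < n - 1; i++)
    rp[i] = (up[i] >> cnt) | (up[i + 1] << (32 - cnt));
  rp[n - 1] = up[n - 1] >> cnt;
  return retval;
}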
diff --git a/third_party/gmp/mpn/s390_32/submul_1.asm b/third_party/gmp/mpn/s390_32/submul_1.asm
new file mode 100644
index 0000000..da7d849
--- /dev/null
+++ b/third_party/gmp/mpn/s390_32/submul_1.asm
@@ -0,0 +1,93 @@
+dnl  S/390 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl  result from a second limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(`rp',2)
+define(`up',3)
+define(`n',4)
+define(`vlimb',5)
+define(`cylimb',7)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	stm	6,7,24(15)
+	slr	cylimb,cylimb	# clear cylimb
+	ltr	vlimb,vlimb
+	jnl	.Loopp
+
+.Loopn:	l	1,0(up)		# load from u
+	lr	6,1		#
+	mr	0,vlimb		# multiply signed
+	alr	0,6		# add vlimb to phi
+	sra	6,31		# make mask
+	nr	6,vlimb		# 0 or vlimb
+	alr	0,6		# conditionally add vlimb to phi
+	alr	1,cylimb	# add carry limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	l	6,0(rp)		# load r limb
+	slr	6,1		# add u limb to plo
+	brc	2+1,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	lr	cylimb,0	# new cylimb
+	st	6,0(rp)		# store
+	la	up,4(,up)
+	la	rp,4(,rp)
+	brct	n,.Loopn
+
+	lr	2,cylimb
+	lm	6,7,24(15)
+	br	14
+
+.Loopp:	l	1,0(up)		# load from u
+	lr	6,1		#
+	mr	0,vlimb		# multiply signed
+	sra	6,31		# make mask
+	nr	6,vlimb		# 0 or vlimb
+	alr	0,6		# conditionally add vlimb to phi
+	alr	1,cylimb	# add carry limb to plo
+	brc	8+4,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	l	6,0(rp)		# load r limb
+	slr	6,1		# add u limb to plo
+	brc	2+1,+8		# branch if not carry
+	ahi	0,1		# increment phi
+	lr	cylimb,0	# new cylimb
+	st	6,0(rp)		# store
+	la	up,4(,up)
+	la	rp,4(,rp)
+	brct	n,.Loopp
+
+	lr	2,cylimb
+	lm	6,7,24(15)
+	br	14
+EPILOGUE(mpn_submul_1)
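The same signed-multiply corrections as in mul_1.asm appear here, followed by a subtract-with-borrow against the r limb.  The net semantics (a sketch mirroring mpn/generic/submul_1.c, 32-bit limbs):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t limb;

/* {rp,n} -= {up,n} * v; returns the borrow-out limb. */
limb ref_submul_1 (limb *rp, const limb *up, size_t n, limb v)
{
  limb cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t p = (uint64_t) up[i] * v;
      limb lo = (limb) p;
      limb hi = (limb) (p >> 32);
      lo += cy;
      hi += lo < cy;            /* carry from folding in the old carry */
      limb r = rp[i];
      rp[i] = r - lo;
      hi += r < lo;             /* borrow from the subtraction */
      cy = hi;
    }
  return cy;
}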
diff --git a/third_party/gmp/mpn/s390_64/README b/third_party/gmp/mpn/s390_64/README
new file mode 100644
index 0000000..53702db
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/README
@@ -0,0 +1,88 @@
+Copyright 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+There are 5 generations of 64-bit s390 processors, z900, z990, z9,
+z10, and z196.  The current GMP code was optimised for the two oldest,
+z900 and z990.
+
+
+mpn_copyi
+
+This code makes use of a loop around MVC.  It almost surely runs very
+close to optimal speed.  A small improvement could be made by using one
+MVC for a size of 256 bytes; we currently use two (an extra MVC is
+issued when copying any multiple of 256 bytes).
+
+
+mpn_copyd
+
+We have tried several feed-in variants here, branch tree, jump table
+and computed goto.  The fastest (on z990) turned out to be computed
+goto.
+
+An approach not tried is EX of LMG and STMG, modifying the register set
+on-the-fly.  Using that trick, we could completely avoid using
+separate feed-in paths.
+
+
+mpn_lshift, mpn_rshift
+
+The current code runs at pipeline decode bandwidth on z990.
+
+
+mpn_add_n, mpn_sub_n
+
+The current code is 4-way unrolled.  It should be unrolled more, at
+least 8x, in order to reach 2.5 c/l.
+
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1
+
+The current code is very naive, but due to the non-pipelined nature of
+MLGR on z900 and z990, more sophisticated code would not gain much.
+
+On z10 one would need to cluster at least 4 MLGR instructions together
+in order to reduce stalling.
+
+On z196, one surely wants to use unrolling and pipelining, to perhaps
+reach around 12 c/l.  A major issue here and on z10 is ALCGR's 3-cycle
+stall.
+
+
+mpn_mul_2, mpn_addmul_2
+
+At least for older machines (z900, z990) with very slow MLGR, we
+should use Karatsuba's algorithm on 2-limb units, making mul_2 and
+addmul_2 the main multiplication primitives.  The newer machines might
+benefit less from this approach, perhaps in particular z10, where MLGR
+clustering is more important.
+
+With Karatsuba, one could hope for around 16 cycles per accumulated
+128-bit cross product on z990.
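
For reference, the identity behind that estimate (with B = 2^64 the limb
base, a = a1*B + a0, b = b1*B + b0):

  a*b = a1*b1*B^2 + ((a1+a0)*(b1+b0) - a1*b1 - a0*b0)*B + a0*b0

i.e. three MLGR products (a1*b1, a0*b0 and (a1+a0)*(b1+b0)) instead of
four, at the cost of extra additions and of handling the carries out of
the two sums.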
diff --git a/third_party/gmp/mpn/s390_64/addmul_1.asm b/third_party/gmp/mpn/s390_64/addmul_1.asm
new file mode 100644
index 0000000..84cca12
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/addmul_1.asm
@@ -0,0 +1,72 @@
+dnl  S/390-64 mpn_addmul_1
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		34
+C z990		23
+C z9		 ?
+C z10		28
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`v0',	`%r5')
+
+define(`z',	`%r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stmg	%r9, %r12, 72(%r15)
+	lghi	%r12, 0			C zero index register
+	aghi	%r12, 0			C clear carry flag
+	lghi	%r11, 0			C clear carry limb
+	lghi	z, 0			C keep register zero
+
+L(top):	lg	%r1, 0(%r12,up)
+	lg	%r10, 0(%r12,rp)
+	mlgr	%r0, v0
+	alcgr	%r1, %r10
+	alcgr	%r0, z
+	algr	%r1, %r11
+	lgr	%r11, %r0
+	stg	%r1, 0(%r12,rp)
+	la	%r12, 8(%r12)
+	brctg	n, L(top)
+
+	lghi	%r2, 0
+	alcgr	%r2, %r11
+
+	lmg	%r9, %r12, 72(%r15)
+	br	%r14
+EPILOGUE()
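For reference, what the loop computes (a sketch assuming GNU C's unsigned __int128 as a stand-in for the MLGR/ALCGR pair; 64-bit limbs):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* {rp,n} += {up,n} * v0; returns the carry-out limb. */
limb ref_addmul_1 (limb *rp, const limb *up, size_t n, limb v0)
{
  limb cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (limb) p;                 /* low product limb */
      cy = (limb) (p >> 64);            /* high product limb becomes carry */
    }
  return cy;
}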
diff --git a/third_party/gmp/mpn/s390_64/aorrlsh1_n.asm b/third_party/gmp/mpn/s390_64/aorrlsh1_n.asm
new file mode 100644
index 0000000..697259e
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/aorrlsh1_n.asm
@@ -0,0 +1,168 @@
+dnl  S/390-64 mpn_addlsh1_n and mpn_rsblsh1_n.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 9
+C z990		 4.75
+C z9		 ?
+C z10		11
+C z196		 ?
+
+C TODO
+C  * Optimise for small n, avoid 'la' like in aors_n.asm.
+C  * Tune to reach 3.5 c/l.  For addlsh1, we could let the main alcgr propagate
+C    carry to the lsh1 alcgr.
+C  * Compute RETVAL for sublsh1_n less stupidly.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`n',	`%r5')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADSB,		alg)
+  define(ADSBC,		alcg)
+  define(INITCY,	`lghi	%r9, -1')
+  define(RETVAL,	`la	%r2, 2(%r1,%r9)')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_rsblsh1_n',`
+  define(ADSB,		slg)
+  define(ADSBC,		slbg)
+  define(INITCY,	`lghi	%r9, 0')
+  define(RETVAL,`dnl
+	algr	%r1, %r9
+	lghi	%r2, 1
+	algr	%r2, %r1')
+  define(func, mpn_rsblsh1_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+	stmg	%r6, %r9, 48(%r15)
+
+	aghi	n, 3
+	lghi	%r7, 3
+	srlg	%r0, n, 2
+	ngr	%r7, n			C n mod 4
+	je	L(b1)
+	cghi	%r7, 2
+	jl	L(b2)
+	jne	L(b0)
+
+L(b3):	lmg	%r5, %r7, 0(vp)
+	la	vp, 24(vp)
+
+	algr	%r5, %r5
+	alcgr	%r6, %r6
+	alcgr	%r7, %r7
+	slbgr	%r1, %r1
+
+	ADSB	%r5, 0(up)
+	ADSBC	%r6, 8(up)
+	ADSBC	%r7, 16(up)
+	la	up, 24(up)
+	slbgr	%r9, %r9
+
+	stmg	%r5, %r7, 0(rp)
+	la	rp, 24(rp)
+	brctg	%r0, L(top)
+	j	L(end)
+
+L(b0):	lghi	%r1, -1
+	INITCY
+	j	L(top)
+
+L(b1):	lg	%r5, 0(vp)
+	la	vp, 8(vp)
+
+	algr	%r5, %r5
+	slbgr	%r1, %r1
+	ADSB	%r5, 0(up)
+	la	up, 8(up)
+	slbgr	%r9, %r9
+
+	stg	%r5, 0(rp)
+	la	rp, 8(rp)
+	brctg	%r0, L(top)
+	j	L(end)
+
+L(b2):	lmg	%r5, %r6, 0(vp)
+	la	vp, 16(vp)
+
+	algr	%r5, %r5
+	alcgr	%r6, %r6
+	slbgr	%r1, %r1
+
+	ADSB	%r5, 0(up)
+	ADSBC	%r6, 8(up)
+	la	up, 16(up)
+	slbgr	%r9, %r9
+
+	stmg	%r5, %r6, 0(rp)
+	la	rp, 16(rp)
+	brctg	%r0, L(top)
+	j	L(end)
+
+L(top):	lmg	%r5, %r8, 0(vp)
+	la	vp, 32(vp)
+
+	aghi	%r1, 1			C restore carry
+
+	alcgr	%r5, %r5
+	alcgr	%r6, %r6
+	alcgr	%r7, %r7
+	alcgr	%r8, %r8
+
+	slbgr	%r1, %r1		C save carry
+
+	aghi	%r9, 1			C restore carry
+
+	ADSBC	%r5, 0(up)
+	ADSBC	%r6, 8(up)
+	ADSBC	%r7, 16(up)
+	ADSBC	%r8, 24(up)
+	la	up, 32(up)
+
+	slbgr	%r9, %r9		C save carry
+
+	stmg	%r5, %r8, 0(rp)
+	la	rp, 32(rp)
+	brctg	%r0, L(top)
+
+L(end):	RETVAL
+	lmg	%r6, %r9, 48(%r15)
+	br	%r14
+EPILOGUE()
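The addlsh1 semantics, for reference (a sketch assuming GNU C's unsigned __int128; rsblsh1_n computes 2*{vp,n} - {up,n} analogously):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* {rp,n} = {up,n} + 2*{vp,n}; returns the carry, which can be 0, 1 or 2. */
limb ref_addlsh1_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  limb shl_cy = 0, add_cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      limb s = (vp[i] << 1) | shl_cy;   /* 2*{vp,n}, limb by limb */
      shl_cy = vp[i] >> 63;
      unsigned __int128 t = (unsigned __int128) up[i] + s + add_cy;
      rp[i] = (limb) t;
      add_cy = (limb) (t >> 64);
    }
  return shl_cy + add_cy;
}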
diff --git a/third_party/gmp/mpn/s390_64/aors_n.asm b/third_party/gmp/mpn/s390_64/aors_n.asm
new file mode 100644
index 0000000..a3c3ca7
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/aors_n.asm
@@ -0,0 +1,136 @@
+dnl  S/390-64 mpn_add_n and mpn_sub_n.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 5.5
+C z990		 3
+C z9		 ?
+C z10		 6
+C z196		 ?
+
+C TODO
+C  * Optimise for small n
+C  * Use r0 and save/restore one less register
+C  * Using logops_n's v1 inner loop operand order makes the loop about 20%
+C    faster, at the expense of highly alignment-dependent performance.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`n',	`%r5')
+
+ifdef(`OPERATION_add_n', `
+  define(ADSB,		alg)
+  define(ADSBCR,	alcgr)
+  define(ADSBC,		alcg)
+  define(RETVAL,`dnl
+	lghi	%r2, 0
+	alcgr	%r2, %r2')
+  define(func,		mpn_add_n)
+  define(func_nc,	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(ADSB,		slg)
+  define(ADSBCR,	slbgr)
+  define(ADSBC,		slbg)
+  define(RETVAL,`dnl
+	slbgr	%r2, %r2
+	lcgr	%r2, %r2')
+  define(func,		mpn_sub_n)
+  define(func_nc,	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	stmg	%r6, %r8, 48(%r15)
+
+	aghi	n, 3
+	lghi	%r7, 3
+	srlg	%r1, n, 2
+	ngr	%r7, n			C n mod 4
+	je	L(b1)
+	cghi	%r7, 2
+	jl	L(b2)
+	jne	L(b0)
+
+L(b3):	lmg	%r5, %r7, 0(up)
+	la	up, 24(up)
+	ADSB	%r5, 0(vp)
+	ADSBC	%r6, 8(vp)
+	ADSBC	%r7, 16(vp)
+	la	vp, 24(vp)
+	stmg	%r5, %r7, 0(rp)
+	la	rp, 24(rp)
+	brctg	%r1, L(top)
+	j	L(end)
+
+L(b0):	lmg	%r5, %r8, 0(up)		C This redundant insn is no mistake;
+	la	up, 32(up)		C it is needed to make the main loop run
+	ADSB	%r5, 0(vp)		C fast for n = 0 (mod 4).
+	ADSBC	%r6, 8(vp)
+	j	L(m0)
+
+L(b1):	lg	%r5, 0(up)
+	la	up, 8(up)
+	ADSB	%r5, 0(vp)
+	la	vp, 8(vp)
+	stg	%r5, 0(rp)
+	la	rp, 8(rp)
+	brctg	%r1, L(top)
+	j	L(end)
+
+L(b2):	lmg	%r5, %r6, 0(up)
+	la	up, 16(up)
+	ADSB	%r5, 0(vp)
+	ADSBC	%r6, 8(vp)
+	la	vp, 16(vp)
+	stmg	%r5, %r6, 0(rp)
+	la	rp, 16(rp)
+	brctg	%r1, L(top)
+	j	L(end)
+
+L(top):	lmg	%r5, %r8, 0(up)
+	la	up, 32(up)
+	ADSBC	%r5, 0(vp)
+	ADSBC	%r6, 8(vp)
+L(m0):	ADSBC	%r7, 16(vp)
+	ADSBC	%r8, 24(vp)
+	la	vp, 32(vp)
+	stmg	%r5, %r8, 0(rp)
+	la	rp, 32(rp)
+	brctg	%r1, L(top)
+
+L(end):	RETVAL
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+EPILOGUE()
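The reference semantics, for add_n (sub_n is the same with subtraction and borrow; sketch assuming GNU C's unsigned __int128):

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

/* {rp,n} = {up,n} + {vp,n}; returns the carry-out (0 or 1). */
limb ref_add_n (limb *rp, const limb *up, const limb *vp, size_t n)
{
  limb cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] + vp[i] + cy;
      rp[i] = (limb) t;
      cy = (limb) (t >> 64);
    }
  return cy;
}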
diff --git a/third_party/gmp/mpn/s390_64/bdiv_dbm1c.asm b/third_party/gmp/mpn/s390_64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..35e900a
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/bdiv_dbm1c.asm
@@ -0,0 +1,65 @@
+dnl  S/390-64 mpn_bdiv_dbm1c
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		29
+C z990		22
+C z9		 ?
+C z10		19
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`qp',	  `%r2')
+define(`up',	  `%r3')
+define(`n',	  `%r4')
+define(`bd',	  `%r5')
+define(`cy',	  `%r6')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	stmg	%r6, %r7, 48(%r15)
+	lghi	%r7, 0			C zero index register
+
+L(top):	lg	%r1, 0(%r7,up)
+	mlgr	%r0, bd
+	slgr	%r6, %r1
+	stg	%r6, 0(%r7,qp)
+	la	%r7, 8(%r7)
+	slbgr	%r6, %r0
+	brctg	n, L(top)
+
+	lgr	%r2, %r6
+	lmg	%r6, %r7, 48(%r15)
+	br	%r14
+EPILOGUE()
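A C transcription of the loop above (cf. mpn/generic/bdiv_dbm1c.c, the kernel GMP uses for exact division by divisors of B-1): each iteration subtracts the double-limb product up[i]*bd from the running value h and emits the low limb as a quotient limb.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb;

limb ref_bdiv_dbm1c (limb *qp, const limb *up, size_t n, limb bd, limb h)
{
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * bd;  /* mlgr */
      limb lo = (limb) p, hi = (limb) (p >> 64);
      limb q = h - lo;              /* slgr, remembering the borrow */
      limb borrow = h < lo;
      qp[i] = q;                    /* stg */
      h = q - hi - borrow;          /* slbgr */
    }
  return h;
}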
diff --git a/third_party/gmp/mpn/s390_64/copyd.asm b/third_party/gmp/mpn/s390_64/copyd.asm
new file mode 100644
index 0000000..8631e19
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/copyd.asm
@@ -0,0 +1,144 @@
+dnl  S/390-64 mpn_copyd
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 2.67
+C z990           1.5
+C z9		 ?
+C z10		 1.8
+C z196		 ?
+
+C FIXME:
+C  * Avoid saving/restoring callee-saves registers for n < 3.  This could be
+C    done by setting rp=r1, up=r2, i=r0 and r3,r4,r5 for clock regs.
+C    We could then use r3...r10 in main loop.
+C  * Could we use some EX trick, modifying lmg/stmg, for the feed-in code?
+
+C INPUT PARAMETERS
+define(`rp_param',	`%r2')
+define(`up_param',	`%r3')
+define(`n',		`%r4')
+
+define(`rp',	`%r8')
+define(`up',	`%r9')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	stmg	%r6, %r11, 48(%r15)
+
+	sllg	%r1, n, 3
+	la	%r10, 8(n)
+	aghi	%r1, -64
+	srlg	%r10, %r10, 3
+	lghi	%r11, -64
+
+	la	rp, 0(%r1,rp_param)	C FIXME use lay on z990 and later
+	la	up, 0(%r1,up_param)	C FIXME use lay on z990 and later
+
+	lghi	%r7, 7
+	ngr	%r7, n			C n mod 8
+	cghi	%r7, 2
+	jh	L(b34567)
+	cghi	%r7, 1
+	je	L(b1)
+	jh	L(b2)
+
+L(b0):	brctg	%r10, L(top)
+	j	L(end)
+
+L(b1):	lg	%r0, 56(up)
+	aghi	up, -8
+	stg	%r0, 56(rp)
+	aghi	rp, -8
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(b2):	lmg	%r0, %r1, 48(up)
+	aghi	up, -16
+	stmg	%r0, %r1, 48(rp)
+	aghi	rp, -16
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(b34567):
+	cghi	%r7, 4
+	jl	L(b3)
+	je	L(b4)
+	cghi	%r7, 6
+	je	L(b6)
+	jh	L(b7)
+
+L(b5):	lmg	%r0, %r4, 24(up)
+	aghi	up, -40
+	stmg	%r0, %r4, 24(rp)
+	aghi	rp, -40
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(b3):	lmg	%r0, %r2, 40(up)
+	aghi	up, -24
+	stmg	%r0, %r2, 40(rp)
+	aghi	rp, -24
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(b4):	lmg	%r0, %r3, 32(up)
+	aghi	up, -32
+	stmg	%r0, %r3, 32(rp)
+	aghi	rp, -32
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(b6):	lmg	%r0, %r5, 16(up)
+	aghi	up, -48
+	stmg	%r0, %r5, 16(rp)
+	aghi	rp, -48
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(b7):	lmg	%r0, %r6, 8(up)
+	aghi	up, -56
+	stmg	%r0, %r6, 8(rp)
+	aghi	rp, -56
+	brctg	%r10, L(top)
+	j	L(end)
+
+L(top):	lmg	%r0, %r7, 0(up)
+	la	up, 0(%r11,up)
+	stmg	%r0, %r7, 0(rp)
+	la	rp, 0(%r11,rp)
+	brctg	%r10, L(top)
+
+L(end):	lmg	%r6, %r11, 48(%r15)
+	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_64/copyi.asm b/third_party/gmp/mpn/s390_64/copyi.asm
new file mode 100644
index 0000000..bfb8881
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/copyi.asm
@@ -0,0 +1,68 @@
+dnl  S/390-64 mpn_copyi
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 1.25
+C z990           0.75
+C z9		 ?
+C z10		 1
+C z196		 ?
+
+C NOTE
+C  * This is based on GNU libc memcpy which was written by Martin Schwidefsky.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	ltgr	%r4, %r4
+	sllg	%r4, %r4, 3
+	je	L(rtn)
+	aghi	%r4, -1
+	srlg	%r5, %r4, 8
+	ltgr	%r5, %r5		C < 256 bytes to copy?
+	je	L(1)
+
+L(top):	mvc	0(256, rp), 0(up)
+	la	rp, 256(rp)
+	la	up, 256(up)
+	brctg	%r5, L(top)
+
+L(1):	bras	%r5, L(2)		C make r5 point to mvc insn
+	mvc	0(1, rp), 0(up)
+L(2):	ex	%r4, 0(%r5)		C execute mvc with length ((n-1) mod 256)+1
+L(rtn):	br	%r14
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_64/gmp-mparam.h b/third_party/gmp/mpn/s390_64/gmp-mparam.h
new file mode 100644
index 0000000..062c3d2
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/gmp-mparam.h
@@ -0,0 +1,181 @@
+/* S/390-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 4400 MHz z196 */
+/* Generated by tuneup.c, 2017-01-02, gcc 4.9 */
+
+#define DIVREM_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIVREM_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD             MP_SIZE_T_MAX  /* never */
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         14
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         15
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        31
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      2
+#define USE_PREINV_DIVREM_1                  0
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              10
+#define DIVEXACT_1_THRESHOLD                 4
+#define BMOD_1_TO_MOD_1_THRESHOLD            0  /* always */
+
+#define DIV_1_VS_MUL_1_PERCENT             317
+
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                45
+#define MUL_TOOM44_THRESHOLD               121
+#define MUL_TOOM6H_THRESHOLD               177
+#define MUL_TOOM8H_THRESHOLD               260
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      78
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      81
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     118
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 13
+#define SQR_TOOM3_THRESHOLD                 89
+#define SQR_TOOM4_THRESHOLD                242
+#define SQR_TOOM6_THRESHOLD                363
+#define SQR_TOOM8_THRESHOLD                482
+
+#define MULMID_TOOM42_THRESHOLD             38
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define MUL_FFT_MODF_THRESHOLD             236  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    236, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {     17, 8}, {      9, 7}, {     19, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     25,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     23,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287, 7}, \
+    {    575, 9}, {    159,11}, {     47,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,10}, {    207, 9}, {    415, 8}, {    831,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575, 8}, \
+    {   1151,11}, {    159,10}, {    319, 9}, {    639,11}, \
+    {    175,10}, {    351, 9}, {    703, 8}, {   1407,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831,11}, {    223,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 99
+#define MUL_FFT_THRESHOLD                 2240
+
+#define SQR_FFT_MODF_THRESHOLD             220  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    220, 5}, {      7, 4}, {     15, 5}, {     13, 6}, \
+    {      7, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {      9, 6}, {     19, 7}, {     13, 8}, \
+    {      7, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     23, 9}, {     15, 8}, \
+    {     31, 9}, {     19, 8}, {     39, 9}, {     23,10}, \
+    {     15, 9}, {     39,10}, {     23,11}, {     15,10}, \
+    {     31, 9}, {     63,10}, {     47,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255,10}, {     71, 9}, \
+    {    143, 8}, {    287,11}, {     47,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287, 8}, {    575, 7}, {   1151,10}, \
+    {    159, 9}, {    319, 8}, {    639,10}, {    175, 9}, \
+    {    351, 8}, {    703,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575, 8}, \
+    {   1151,11}, {    159,10}, {    319, 9}, {    639,11}, \
+    {    175,10}, {    351, 9}, {    703,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 94
+#define SQR_FFT_THRESHOLD                 1728
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD             4392
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  54
+#define SQRLO_SQR_THRESHOLD               3176
+
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             148
+#define DC_BDIV_QR_THRESHOLD                46
+#define DC_BDIV_Q_THRESHOLD                107
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               163
+#define INV_APPR_THRESHOLD                 131
+
+#define BINV_NEWTON_THRESHOLD              183
+#define REDC_1_TO_REDC_N_THRESHOLD          43
+
+#define MU_DIV_QR_THRESHOLD                807
+#define MU_DIVAPPR_Q_THRESHOLD             942
+#define MUPI_DIV_QR_THRESHOLD               78
+#define MU_BDIV_QR_THRESHOLD               680
+#define MU_BDIV_Q_THRESHOLD                828
+
+#define POWM_SEC_TABLE  3,35,285,1603
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD              1391
+#define SET_STR_PRECOMPUTE_THRESHOLD      2872
+
+#define FAC_DSC_THRESHOLD                  151
+#define FAC_ODD_THRESHOLD                   23
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     135
+#define HGCD_APPR_THRESHOLD                169
+#define HGCD_REDUCE_THRESHOLD             1437
+#define GCD_DC_THRESHOLD                   469
+#define GCDEXT_DC_THRESHOLD                342
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/s390_64/invert_limb.asm b/third_party/gmp/mpn/s390_64/invert_limb.asm
new file mode 100644
index 0000000..edcebdd
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/invert_limb.asm
@@ -0,0 +1,94 @@
+dnl  S/390-64 mpn_invert_limb
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900	       142
+C z990          86
+C z9		 ?
+C z10	       120
+C z196		 ?
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_invert_limb)
+	stg	%r9, 72(%r15)
+	srlg	%r9, %r2, 55
+	agr	%r9, %r9
+	larl	%r4, approx_tab-512
+	srlg	%r3, %r2, 24
+	aghi	%r3, 1
+	lghi	%r5, 1
+	llgh	%r4, 0(%r9, %r4)
+	sllg	%r9, %r4, 11
+	msgr	%r4, %r4
+	msgr	%r4, %r3
+	srlg	%r4, %r4, 40
+	aghi	%r9, -1
+	sgr	%r9, %r4
+	sllg	%r0, %r9, 60
+	sllg	%r1, %r9, 13
+	msgr	%r9, %r9
+	msgr	%r9, %r3
+	sgr	%r0, %r9
+	ngr	%r5, %r2
+	srlg	%r4, %r2, 1
+	srlg	%r3, %r0, 47
+	agr	%r3, %r1
+	agr	%r4, %r5
+	msgr	%r4, %r3
+	srlg	%r1, %r3, 1
+	lcgr	%r5, %r5
+	ngr	%r1, %r5
+	sgr	%r1, %r4
+	mlgr	%r0, %r3
+	srlg	%r9, %r0, 1
+	sllg	%r4, %r3, 31
+	agr	%r4, %r9
+	lgr	%r1, %r4
+	mlgr	%r0, %r2
+	algr	%r1, %r2
+	alcgr	%r0, %r2
+	lgr	%r2, %r4
+	sgr	%r2, %r0
+	lg	%r9, 72(%r15)
+	br	%r14
+EPILOGUE()
+	RODATA
+	ALIGN(2)
+approx_tab:
+forloop(i,256,512-1,dnl
+`	.word	eval(0x7fd00/i)
+')dnl
+ASM_END()
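The value being computed, for reference (cf. invert_limb in gmp-impl.h): for a normalized divisor d (top bit set), the reciprocal floor((B^2 - 1)/d) - B with B = 2^64.  The asm reaches it through the approx_tab lookup plus Newton refinement steps; a direct sketch assuming GNU C's unsigned __int128:

#include <stdint.h>

typedef uint64_t limb;

limb ref_invert_limb (limb d)       /* requires d >= 2^63 */
{
  unsigned __int128 num = ~(unsigned __int128) 0;   /* B^2 - 1 */
  return (limb) (num / d - ((unsigned __int128) 1 << 64));
}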
diff --git a/third_party/gmp/mpn/s390_64/logops_n.asm b/third_party/gmp/mpn/s390_64/logops_n.asm
new file mode 100644
index 0000000..914cfb6
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/logops_n.asm
@@ -0,0 +1,291 @@
+dnl  S/390-64 logops.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb     variant 1           variant 2       variant 3
+C	        rp!=up  rp=up
+C z900		 4.5	 2.25		 5.5		 5.5
+C z990		 2.75	 2		 3.25		 3.25
+C z9		 ?			 ?		 ?
+C z10		 3.25			 3.75		 3.75
+C z196		 ?			 ?		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`n',	`%r5')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',`mpn_and_n')
+  define(`VARIANT_1')
+  define(`LOGOPC',`nc')
+  define(`LOGOP',`ng')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',`mpn_andn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`ng')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',`mpn_nand_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`ng')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',`mpn_ior_n')
+  define(`VARIANT_1')
+  define(`LOGOPC',`oc')
+  define(`LOGOP',`og')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',`mpn_iorn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`og')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',`mpn_nior_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`og')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',`mpn_xor_n')
+  define(`VARIANT_1')
+  define(`LOGOPC',`xc')
+  define(`LOGOP',`xg')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',`mpn_xnor_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`xg')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+ifdef(`VARIANT_1',`
+	cgr	rp, up
+	jne	L(normal)
+
+	sllg	n, n, 3
+	aghi	n, -1
+	srlg	%r1, n, 8
+	ltgr	%r1, %r1		C < 256 bytes to copy?
+	je	L(1)
+
+L(tp):	LOGOPC	0(256, rp), 0(vp)
+	la	rp, 256(rp)
+	la	vp, 256(vp)
+	brctg	%r1, L(tp)
+
+L(1):	bras	%r1, L(2)		C make r1 point to the logop insn
+	LOGOPC	0(1, rp), 0(vp)
+L(2):	ex	n, 0(%r1)		C execute logop with length ((n-1) mod 256)+1
+L(rtn):	br	%r14
+
+
+L(normal):
+	stmg	%r6, %r8, 48(%r15)
+	aghi	n, 3
+	lghi	%r7, 3
+	srlg	%r0, n, 2
+	ngr	%r7, n			C n mod 4
+	je	L(b1)
+	cghi	%r7, 2
+	jl	L(b2)
+	jne	L(top)
+
+L(b3):	lmg	%r5, %r7, 0(up)
+	la	up, 24(up)
+	LOGOP	%r5, 0(vp)
+	LOGOP	%r6, 8(vp)
+	LOGOP	%r7, 16(vp)
+	stmg	%r5, %r7, 0(rp)
+	la	rp, 24(rp)
+	la	vp, 24(vp)
+	j	L(mid)
+
+L(b1):	lg	%r5, 0(up)
+	la	up, 8(up)
+	LOGOP	%r5, 0(vp)
+	stg	%r5, 0(rp)
+	la	rp, 8(rp)
+	la	vp, 8(vp)
+	j	L(mid)
+
+L(b2):	lmg	%r5, %r6, 0(up)
+	la	up, 16(up)
+	LOGOP	%r5, 0(vp)
+	LOGOP	%r6, 8(vp)
+	stmg	%r5, %r6, 0(rp)
+	la	rp, 16(rp)
+	la	vp, 16(vp)
+	j	L(mid)
+
+L(top):	lmg	%r5, %r8, 0(up)
+	la	up, 32(up)
+	LOGOP	%r5, 0(vp)
+	LOGOP	%r6, 8(vp)
+	LOGOP	%r7, 16(vp)
+	LOGOP	%r8, 24(vp)
+	stmg	%r5, %r8, 0(rp)
+	la	rp, 32(rp)
+	la	vp, 32(vp)
+L(mid):	brctg	%r0, L(top)
+
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+')
+
+ifdef(`VARIANT_2',`
+	stmg	%r6, %r8, 48(%r15)
+	lghi	%r1, -1
+
+	aghi	n, 3
+	lghi	%r7, 3
+	srlg	%r0, n, 2
+	ngr	%r7, n			C n mod 4
+	je	L(b1)
+	cghi	%r7, 2
+	jl	L(b2)
+	jne	L(top)
+
+L(b3):	lmg	%r5, %r7, 0(vp)
+	la	vp, 24(vp)
+	xgr	%r5, %r1
+	xgr	%r6, %r1
+	xgr	%r7, %r1
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 8(up)
+	LOGOP	%r7, 16(up)
+	stmg	%r5, %r7, 0(rp)
+	la	rp, 24(rp)
+	la	up, 24(up)
+	j	L(mid)
+
+L(b1):	lg	%r5, 0(vp)
+	la	vp, 8(vp)
+	xgr	%r5, %r1
+	LOGOP	%r5, 0(up)
+	stg	%r5, 0(rp)
+	la	rp, 8(rp)
+	la	up, 8(up)
+	j	L(mid)
+
+L(b2):	lmg	%r5, %r6, 0(vp)
+	la	vp, 16(vp)
+	xgr	%r5, %r1
+	xgr	%r6, %r1
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 8(up)
+	stmg	%r5, %r6, 0(rp)
+	la	rp, 16(rp)
+	la	up, 16(up)
+	j	L(mid)
+
+L(top):	lmg	%r5, %r8, 0(vp)
+	la	vp, 32(vp)
+	xgr	%r5, %r1
+	xgr	%r6, %r1
+	xgr	%r7, %r1
+	xgr	%r8, %r1
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 8(up)
+	LOGOP	%r7, 16(up)
+	LOGOP	%r8, 24(up)
+	la	up, 32(up)
+	stmg	%r5, %r8, 0(rp)
+	la	rp, 32(rp)
+L(mid):	brctg	%r0, L(top)
+
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+')
+
+ifdef(`VARIANT_3',`
+	stmg	%r6, %r8, 48(%r15)
+	lghi	%r1, -1
+
+	aghi	n, 3
+	lghi	%r7, 3
+	srlg	%r0, n, 2
+	ngr	%r7, n			C n mod 4
+	je	L(b1)
+	cghi	%r7, 2
+	jl	L(b2)
+	jne	L(top)
+
+L(b3):	lmg	%r5, %r7, 0(vp)
+	la	vp, 24(vp)
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 8(up)
+	xgr	%r5, %r1
+	xgr	%r6, %r1
+	LOGOP	%r7, 16(up)
+	xgr	%r7, %r1
+	stmg	%r5, %r7, 0(rp)
+	la	rp, 24(rp)
+	la	up, 24(up)
+	j	L(mid)
+
+L(b1):	lg	%r5, 0(vp)
+	la	vp, 8(vp)
+	LOGOP	%r5, 0(up)
+	xgr	%r5, %r1
+	stg	%r5, 0(rp)
+	la	rp, 8(rp)
+	la	up, 8(up)
+	j	L(mid)
+
+L(b2):	lmg	%r5, %r6, 0(vp)
+	la	vp, 16(vp)
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 8(up)
+	xgr	%r5, %r1
+	xgr	%r6, %r1
+	stmg	%r5, %r6, 0(rp)
+	la	rp, 16(rp)
+	la	up, 16(up)
+	j	L(mid)
+
+L(top):	lmg	%r5, %r8, 0(vp)
+	la	vp, 32(vp)
+	LOGOP	%r5, 0(up)
+	LOGOP	%r6, 8(up)
+	xgr	%r5, %r1
+	xgr	%r6, %r1
+	LOGOP	%r7, 16(up)
+	LOGOP	%r8, 24(up)
+	xgr	%r7, %r1
+	xgr	%r8, %r1
+	stmg	%r5, %r8, 0(rp)
+	la	up, 32(up)
+	la	rp, 32(rp)
+L(mid):	brctg	%r0, L(top)
+
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+')
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/s390_64/lshift.asm b/third_party/gmp/mpn/s390_64/lshift.asm
new file mode 100644
index 0000000..4dae035
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/lshift.asm
@@ -0,0 +1,196 @@
+dnl  S/390-64 mpn_lshift.
+
+dnl  Copyright 2011, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 7
+C z990           3
+C z9		 ?
+C z10		 6
+C z196		 ?
+
+C NOTES
+C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
+C    stmg is not faster.
+C  * One might expect deeper pipelining to approach 2.5 c/l, but we have not
+C    found any 8-way loop that runs better than the current 4-way loop.
+C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
+C    similarly to the x86_64 sqr_basecase feed-in.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+define(`tnc',	`%r6')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	cghi	n, 3
+	jh	L(gt1)
+
+	stmg	%r6, %r7, 48(%r15)
+	larl	%r1, L(tab)-4
+	lcgr	tnc, cnt
+	sllg	n, n, 2
+	b	0(n,%r1)
+L(tab):	j	L(n1)
+	j	L(n2)
+	j	L(n3)
+
+L(n1):	lg	%r1, 0(up)
+	sllg	%r0, %r1, 0(cnt)
+	stg	%r0, 0(rp)
+	srlg	%r2, %r1, 0(tnc)
+	lg	%r6, 48(%r15)		C restoring r7 not needed
+	br	%r14
+
+L(n2):	lg	%r1, 8(up)
+	srlg	%r4, %r1, 0(tnc)
+	sllg	%r0, %r1, 0(cnt)
+	j	L(cj)
+
+L(n3):	lg	%r1, 16(up)
+	srlg	%r4, %r1, 0(tnc)
+	sllg	%r0, %r1, 0(cnt)
+	lg	%r1, 8(up)
+	srlg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	sllg	%r0, %r1, 0(cnt)
+	stg	%r7, 16(rp)
+L(cj):	lg	%r1, 0(up)
+	srlg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	sllg	%r0, %r1, 0(cnt)
+	stg	%r7, 8(rp)
+	stg	%r0, 0(rp)
+	lgr	%r2, %r4
+	lmg	%r6, %r7, 48(%r15)
+	br	%r14
+
+L(gt1):	stmg	%r6, %r13, 48(%r15)
+	lcgr	tnc, cnt		C tnc = -cnt
+
+	sllg	%r1, n, 3
+	srlg	%r0, n, 2		C loop count
+
+	agr	up, %r1			C point up at end of U
+	agr	rp, %r1			C point rp at end of R
+	aghi	up, -56
+	aghi	rp, -40
+
+	lghi	%r7, 3
+	ngr	%r7, n
+	je	L(b0)
+	cghi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	lg	%r7, 48(up)
+	srlg	%r9, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 40(up)
+	lg	%r7, 32(up)
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	rp, 16(rp)
+	j	L(lm3)
+
+L(b2):	lg	%r8, 48(up)
+	lg	%r7, 40(up)
+	srlg	%r9, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	la	rp, 24(rp)
+	la	up, 8(up)
+	j	L(lm2)
+
+L(b1):	lg	%r7, 48(up)
+	srlg	%r9, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 40(up)
+	lg	%r7, 32(up)
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	rp, 32(rp)
+	la	up, 16(up)
+	j	L(lm1)
+
+L(b0):	lg	%r8, 48(up)
+	lg	%r7, 40(up)
+	srlg	%r9, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	la	rp, 40(rp)
+	la	up, 24(up)
+	j	L(lm0)
+
+	ALIGN(8)
+L(top):	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r10, 24(rp)
+L(lm3):	stg	%r11, 16(rp)
+L(lm2):	srlg	%r12, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 24(up)
+	lg	%r7, 16(up)
+	ogr	%r13, %r12
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r13, 8(rp)
+L(lm1):	stg	%r11, 0(rp)
+L(lm0):	srlg	%r12, %r7, 0(tnc)
+	aghi	rp, -32
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 8(up)
+	lg	%r7, 0(up)
+	aghi	up, -32
+	ogr	%r10, %r12
+	brctg	%r0, L(top)
+
+L(end):	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r10, 24(rp)
+	stg	%r11, 16(rp)
+	srlg	%r12, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	ogr	%r13, %r12
+	stg	%r13, 8(rp)
+	stg	%r11, 0(rp)
+	lgr	%r2, %r9
+
+	lmg	%r6, %r13, 48(%r15)
+	br	%r14
+EPILOGUE()
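
The table dispatch handles n <= 3 and the 4-way software-pipelined loop handles the rest; the contract itself is simple. A minimal C restatement, assuming 64-bit limbs and 1 <= cnt <= 63 (a sketch, not GMP source):

#include <stddef.h>
#include <stdint.h>

/* Shift {up,n} left by cnt bits into {rp,n}, returning the bits
   shifted out of the most significant limb. */
static uint64_t
lshift_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  uint64_t retval = up[n - 1] >> tnc;
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = (up[i] << cnt) | (up[i - 1] >> tnc);
  rp[0] = up[0] << cnt;
  return retval;
}

Like the asm, this walks from the most significant limb down, which is what allows rp to overlap up from above.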
diff --git a/third_party/gmp/mpn/s390_64/lshiftc.asm b/third_party/gmp/mpn/s390_64/lshiftc.asm
new file mode 100644
index 0000000..92552d5
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/lshiftc.asm
@@ -0,0 +1,207 @@
+dnl  S/390-64 mpn_lshiftc.
+
+dnl  Copyright 2011, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 9
+C z990           3.5
+C z9		 ?
+C z10		 7
+C z196		 ?
+
+C NOTES
+C  * See notes in lshift.asm.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+define(`tnc',	`%r6')
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	cghi	n, 3
+	jh	L(gt1)
+
+	stmg	%r6, %r8, 48(%r15)
+	larl	%r1, L(tab)-4
+	lcgr	tnc, cnt
+	sllg	n, n, 2
+	lghi	%r8, -1
+	b	0(n,%r1)
+L(tab):	j	L(n1)
+	j	L(n2)
+	j	L(n3)
+
+L(n1):	lg	%r1, 0(up)
+	sllg	%r0, %r1, 0(cnt)
+	xgr	%r0, %r8
+	stg	%r0, 0(rp)
+	srlg	%r2, %r1, 0(tnc)
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+
+L(n2):	lg	%r1, 8(up)
+	srlg	%r4, %r1, 0(tnc)
+	sllg	%r0, %r1, 0(cnt)
+	j	L(cj)
+
+L(n3):	lg	%r1, 16(up)
+	srlg	%r4, %r1, 0(tnc)
+	sllg	%r0, %r1, 0(cnt)
+	lg	%r1, 8(up)
+	srlg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	sllg	%r0, %r1, 0(cnt)
+	xgr	%r7, %r8
+	stg	%r7, 16(rp)
+L(cj):	lg	%r1, 0(up)
+	srlg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	sllg	%r0, %r1, 0(cnt)
+	xgr	%r7, %r8
+	xgr	%r0, %r8
+	stg	%r7, 8(rp)
+	stg	%r0, 0(rp)
+	lgr	%r2, %r4
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+
+L(gt1):	stmg	%r6, %r14, 48(%r15)
+	lcgr	tnc, cnt		C tnc = -cnt
+
+	sllg	%r1, n, 3
+	srlg	%r0, n, 2		C loop count
+
+	agr	up, %r1			C point up at end of U
+	agr	rp, %r1			C point rp at end of R
+	aghi	up, -56
+	aghi	rp, -40
+
+	lghi	%r7, 3
+	lghi	%r14, -1
+	ngr	%r7, n
+	je	L(b0)
+	cghi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	lg	%r7, 48(up)
+	srlg	%r9, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 40(up)
+	lg	%r7, 32(up)
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	rp, 16(rp)
+	xgr	%r11, %r14
+	j	L(lm3)
+
+L(b2):	lg	%r8, 48(up)
+	lg	%r7, 40(up)
+	srlg	%r9, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	la	rp, 24(rp)
+	la	up, 8(up)
+	j	L(lm2)
+
+L(b1):	lg	%r7, 48(up)
+	srlg	%r9, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 40(up)
+	lg	%r7, 32(up)
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	rp, 32(rp)
+	la	up, 16(up)
+	xgr	%r11, %r14
+	j	L(lm1)
+
+L(b0):	lg	%r8, 48(up)
+	lg	%r7, 40(up)
+	srlg	%r9, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	la	rp, 40(rp)
+	la	up, 24(up)
+	j	L(lm0)
+
+	ALIGN(8)
+L(top):	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	xgr	%r10, %r14
+	xgr	%r11, %r14
+	stg	%r10, 24(rp)
+L(lm3):	stg	%r11, 16(rp)
+L(lm2):	srlg	%r12, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 24(up)
+	lg	%r7, 16(up)
+	ogr	%r13, %r12
+	srlg	%r4, %r8, 0(tnc)
+	sllg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	xgr	%r13, %r14
+	xgr	%r11, %r14
+	stg	%r13, 8(rp)
+L(lm1):	stg	%r11, 0(rp)
+L(lm0):	srlg	%r12, %r7, 0(tnc)
+	aghi	rp, -32
+	sllg	%r11, %r7, 0(cnt)
+	lg	%r8, 8(up)
+	lg	%r7, 0(up)
+	aghi	up, -32
+	ogr	%r10, %r12
+	brctg	%r0, L(top)
+
+L(end):	srlg	%r4, %r8, 0(tnc)
+	sllg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	xgr	%r10, %r14
+	xgr	%r11, %r14
+	stg	%r10, 24(rp)
+	stg	%r11, 16(rp)
+	srlg	%r12, %r7, 0(tnc)
+	sllg	%r11, %r7, 0(cnt)
+	ogr	%r13, %r12
+	xgr	%r13, %r14
+	xgr	%r11, %r14
+	stg	%r13, 8(rp)
+	stg	%r11, 0(rp)
+	lgr	%r2, %r9
+
+	lmg	%r6, %r14, 48(%r15)
+	br	%r14
+EPILOGUE()
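
mpn_lshiftc differs from mpn_lshift only in that each stored limb is one's-complemented; the returned out-shifted bits are not complemented (note the xgr before each stg, and its absence on the %r2 return path). A hedged C sketch under the same assumptions as the lshift one:

#include <stddef.h>
#include <stdint.h>

/* Like lshift_ref, but store the complement of each result limb. */
static uint64_t
lshiftc_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  uint64_t retval = up[n - 1] >> tnc;
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = ~((up[i] << cnt) | (up[i - 1] >> tnc));
  rp[0] = ~(up[0] << cnt);
  return retval;
}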
diff --git a/third_party/gmp/mpn/s390_64/mod_34lsub1.asm b/third_party/gmp/mpn/s390_64/mod_34lsub1.asm
new file mode 100644
index 0000000..fd40011
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/mod_34lsub1.asm
@@ -0,0 +1,109 @@
+dnl  S/390-64 mpn_mod_34lsub1
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 5.8
+C z990           2
+C z9		 ?
+C z10		 4.5
+C z196		 ?
+
+C TODO
+C  * Optimise summation code, see x86_64.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`n',	`%r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+	stmg	%r7, %r12, 56(%r15)
+	lghi	%r11, 0
+	lghi	%r12, 0
+	lghi	%r0, 0
+	lghi	%r8, 0
+	lghi	%r9, 0
+	lghi	%r10, 0
+	lghi	%r7, 0
+	aghi	%r3, -3
+	jl	.L3
+
+L(top):	alg	%r0, 0(%r2)
+	alcg	%r12, 8(%r2)
+	alcg	%r11, 16(%r2)
+	alcgr	%r8, %r7
+	la	%r2, 24(%r2)
+	aghi	%r3, -3
+	jnl	L(top)
+
+	lgr	%r7, %r8
+	srlg	%r1, %r11, 16
+	nihh	%r7, 0			C 0xffffffffffff
+	agr	%r7, %r1
+	srlg	%r8, %r8, 48
+	agr	%r7, %r8
+	sllg	%r11, %r11, 32
+	nihh	%r11, 0
+	agr	%r7, %r11
+.L3:
+	cghi	%r3, -3
+	je	.L6
+	alg	%r0, 0(%r2)
+	alcgr	%r10, %r10
+	cghi	%r3, -2
+	je	.L6
+	alg	%r12, 8(%r2)
+	alcgr	%r9, %r9
+.L6:
+	srlg	%r1, %r0, 48
+	nihh	%r0, 0			C 0xffffffffffff
+	agr	%r0, %r1
+	agr	%r0, %r7
+	srlg	%r1, %r12, 32
+	agr	%r0, %r1
+	srlg	%r1, %r10, 32
+	agr	%r0, %r1
+	llgfr	%r12, %r12
+	srlg	%r1, %r9, 16
+	sllg	%r12, %r12, 16
+	llgfr	%r10, %r10
+	agr	%r0, %r1
+	llill	%r2, 65535
+	agr	%r0, %r12
+	sllg	%r10, %r10, 16
+	ngr	%r2, %r9
+	agr	%r0, %r10
+	sllg	%r2, %r2, 32
+	agr	%r2, %r0
+	lmg	%r7, %r12, 56(%r15)
+	br	%r14
+EPILOGUE()
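
The function returns a value congruent to {ap,n} modulo 2^48-1 (three quarters of a 64-bit limb, hence the name); callers only need congruence, so the asm never fully reduces. An illustrative C cross-check, not the asm's algorithm, exploiting 2^64 == 2^16 (mod 2^48-1) and the GCC/Clang unsigned __int128 extension:

#include <stddef.h>
#include <stdint.h>

/* Fully reduced residue of {ap,n} mod 2^48-1, by Horner evaluation
   from the top limb with base 2^64 replaced by 2^16. */
static uint64_t
mod_34lsub1_ref (const uint64_t *ap, size_t n)
{
  const uint64_t m = ((uint64_t) 1 << 48) - 1;
  uint64_t r = 0;
  for (size_t i = n; i-- > 0;)
    r = (uint64_t) ((((unsigned __int128) r << 16) + ap[i]) % m);
  return r;
}

The asm instead sums limbs three at a time into separate accumulators with the carries collected on the side, then folds the 48- and 16-bit fields at the end.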
diff --git a/third_party/gmp/mpn/s390_64/mul_1.asm b/third_party/gmp/mpn/s390_64/mul_1.asm
new file mode 100644
index 0000000..a8f6da9
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/mul_1.asm
@@ -0,0 +1,66 @@
+dnl  S/390-64 mpn_mul_1
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		29
+C z990		22
+C z9		 ?
+C z10		20
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`v0',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmg	%r11, %r12, 88(%r15)
+	lghi	%r12, 0			C zero index register
+	aghi	%r12, 0			C clear carry flag
+	lghi	%r11, 0			C clear carry limb
+
+L(top):	lg	%r1, 0(%r12,up)
+	mlgr	%r0, v0
+	alcgr	%r1, %r11
+	lgr	%r11, %r0		C copy high part to carry limb
+	stg	%r1, 0(%r12,rp)
+	la	%r12, 8(%r12)
+	brctg	n, L(top)
+
+	lghi	%r2, 0
+	alcgr	%r2, %r11
+
+	lmg	%r11, %r12, 88(%r15)
+	br	%r14
+EPILOGUE()
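
The loop keeps the high product limb as the incoming carry via alcgr. In C the contract is as follows (a sketch, using the GCC/Clang unsigned __int128 extension for the 64x64->128 product that mlgr computes in hardware):

#include <stddef.h>
#include <stdint.h>

/* {rp,n} = {up,n} * v0, returning the high carry limb. */
static uint64_t
mul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
      rp[i] = (uint64_t) p;
      cy = (uint64_t) (p >> 64);
    }
  return cy;
}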
diff --git a/third_party/gmp/mpn/s390_64/mul_basecase.asm b/third_party/gmp/mpn/s390_64/mul_basecase.asm
new file mode 100644
index 0000000..7d14ea9
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/mul_basecase.asm
@@ -0,0 +1,130 @@
+dnl  S/390-64 mpn_mul_basecase.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 ?
+C z990		23
+C z9		 ?
+C z10		28
+C z196		 ?
+
+C TODO
+C  * Perhaps add a special case for un <= 2.
+C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
+C    up by about 10%.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`un',	`%r4')
+define(`vp',	`%r5')
+define(`vn',	`%r6')
+
+define(`zero',	`%r8')
+
+ASM_START()
+PROLOGUE(mpn_mul_basecase)
+	cghi	un, 2
+	jhe	L(ge2)
+
+C un = vn = 1
+	lg	%r1, 0(vp)
+	mlg	%r0, 0(up)
+	stg	%r1, 0(rp)
+	stg	%r0, 8(rp)
+	br	%r14
+
+L(ge2):	C jne	L(gen)
+
+
+L(gen):
+C mul_1 =======================================================================
+
+	stmg	%r6, %r12, 48(%r15)
+	lghi	zero, 0
+	aghi	un, -1
+
+	lg	%r7, 0(vp)
+	lg	%r11, 0(up)
+	lghi	%r12, 8			C init index register
+	mlgr	%r10, %r7
+	lgr	%r9, un
+	stg	%r11, 0(rp)
+	cr	%r15, %r15		C clear carry flag
+
+L(tm):	lg	%r1, 0(%r12,up)
+	mlgr	%r0, %r7
+	alcgr	%r1, %r10
+	lgr	%r10, %r0		C copy high part to carry limb
+	stg	%r1, 0(%r12,rp)
+	la	%r12, 8(%r12)
+	brctg	%r9, L(tm)
+
+	alcgr	%r0, zero
+	stg	%r0, 0(%r12,rp)
+
+C addmul_1 loop ===============================================================
+
+	aghi	vn, -1
+	je	L(outer_end)
+L(outer_loop):
+
+	la	rp, 8(rp)		C rp += 1
+	la	vp, 8(vp)		C vp += 1
+	lg	%r7, 0(vp)
+	lg	%r11, 0(up)
+	lghi	%r12, 8			C init index register
+	mlgr	%r10, %r7
+	lgr	%r9, un
+	alg	%r11, 0(rp)
+	stg	%r11, 0(rp)
+
+L(tam):	lg	%r1, 0(%r12,up)
+	lg	%r11, 0(%r12,rp)
+	mlgr	%r0, %r7
+	alcgr	%r1, %r11
+	alcgr	%r0, zero
+	algr	%r1, %r10
+	lgr	%r10, %r0
+	stg	%r1, 0(%r12,rp)
+	la	%r12, 8(%r12)
+	brctg	%r9, L(tam)
+
+	alcgr	%r0, zero
+	stg	%r0, 0(%r12,rp)
+
+	brctg	vn, L(outer_loop)
+L(outer_end):
+
+	lmg	%r6, %r12, 48(%r15)
+	br	%r14
+EPILOGUE()
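
Structurally this is one mul_1 pass for vp[0] followed by one addmul_1 pass per remaining V limb, exactly as the section markers say. A hedged sketch of that composition, reusing mul_1_ref from the mul_1 sketch above (mpn convention: rp has un+vn limbs and un >= vn >= 1):

/* Add-in variant of mul_1_ref: {rp,n} += {up,n} * v0, return carry. */
static uint64_t
addmul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + rp[i] + cy;
      rp[i] = (uint64_t) p;
      cy = (uint64_t) (p >> 64);
    }
  return cy;
}

/* Schoolbook multiplication: {rp,un+vn} = {up,un} * {vp,vn}. */
static void
mul_basecase_ref (uint64_t *rp, const uint64_t *up, size_t un,
                  const uint64_t *vp, size_t vn)
{
  rp[un] = mul_1_ref (rp, up, un, vp[0]);
  for (size_t j = 1; j < vn; j++)
    rp[un + j] = addmul_1_ref (rp + j, up, un, vp[j]);
}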
diff --git a/third_party/gmp/mpn/s390_64/rshift.asm b/third_party/gmp/mpn/s390_64/rshift.asm
new file mode 100644
index 0000000..e870971
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/rshift.asm
@@ -0,0 +1,195 @@
+dnl  S/390-64 mpn_rshift.
+
+dnl  Copyright 2011, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 7
+C z990           3
+C z9		 ?
+C z10		 6
+C z196		 ?
+
+C NOTES
+C  * See notes in lshift.asm.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`cnt',	`%r5')
+
+define(`tnc',	`%r6')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	cghi	n, 3
+	jh	L(gt1)
+
+	stmg	%r6, %r7, 48(%r15)
+	larl	%r1, L(tab)-4
+	lcgr	tnc, cnt
+	sllg	n, n, 2
+	b	0(n,%r1)
+L(tab):	j	L(n1)
+	j	L(n2)
+	j	L(n3)
+
+L(n1):	lg	%r1, 0(up)
+	srlg	%r0, %r1, 0(cnt)
+	stg	%r0, 0(rp)
+	sllg	%r2, %r1, 0(tnc)
+	lg	%r6, 48(%r15)		C restoring r7 not needed
+	br	%r14
+
+L(n2):	lg	%r1, 0(up)
+	sllg	%r4, %r1, 0(tnc)
+	srlg	%r0, %r1, 0(cnt)
+	lg	%r1, 8(up)
+	sllg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	srlg	%r0, %r1, 0(cnt)
+	stg	%r7, 0(rp)
+	stg	%r0, 8(rp)
+	lgr	%r2, %r4
+	lmg	%r6, %r7, 48(%r15)
+	br	%r14
+
+
+L(n3):	lg	%r1, 0(up)
+	sllg	%r4, %r1, 0(tnc)
+	srlg	%r0, %r1, 0(cnt)
+	lg	%r1, 8(up)
+	sllg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	srlg	%r0, %r1, 0(cnt)
+	stg	%r7, 0(rp)
+	lg	%r1, 16(up)
+	sllg	%r7, %r1, 0(tnc)
+	ogr	%r7, %r0
+	srlg	%r0, %r1, 0(cnt)
+	stg	%r7, 8(rp)
+	stg	%r0, 16(rp)
+	lgr	%r2, %r4
+	lmg	%r6, %r7, 48(%r15)
+	br	%r14
+
+L(gt1):	stmg	%r6, %r13, 48(%r15)
+	lcgr	tnc, cnt		C tnc = -cnt
+
+	sllg	%r1, n, 3
+	srlg	%r0, n, 2		C loop count
+
+	lghi	%r7, 3
+	ngr	%r7, n
+	je	L(b0)
+	cghi	%r7, 2
+	jl	L(b1)
+	je	L(b2)
+
+L(b3):	aghi	rp, -8
+	lg	%r7, 0(up)
+	sllg	%r9, %r7, 0(tnc)
+	srlg	%r11, %r7, 0(cnt)
+	lg	%r8, 8(up)
+	lg	%r7, 16(up)
+	sllg	%r4, %r8, 0(tnc)
+	srlg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	up, 24(up)
+	j	L(lm3)
+
+L(b2):	aghi	rp, -16
+	lg	%r8, 0(up)
+	lg	%r7, 8(up)
+	sllg	%r9, %r8, 0(tnc)
+	srlg	%r13, %r8, 0(cnt)
+	la	up, 16(up)
+	j	L(lm2)
+
+L(b1):	aghi	rp, -24
+	lg	%r7, 0(up)
+	sllg	%r9, %r7, 0(tnc)
+	srlg	%r11, %r7, 0(cnt)
+	lg	%r8, 8(up)
+	lg	%r7, 16(up)
+	sllg	%r4, %r8, 0(tnc)
+	srlg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	la	up, 8(up)
+	j	L(lm1)
+
+L(b0):	aghi	rp, -32
+	lg	%r8, 0(up)
+	lg	%r7, 8(up)
+	sllg	%r9, %r8, 0(tnc)
+	srlg	%r10, %r8, 0(cnt)
+	j	L(lm0)
+
+	ALIGN(8)
+L(top):	sllg	%r4, %r8, 0(tnc)
+	srlg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r10, 0(rp)
+L(lm3):	stg	%r11, 8(rp)
+L(lm2):	sllg	%r12, %r7, 0(tnc)
+	srlg	%r11, %r7, 0(cnt)
+	lg	%r8, 0(up)
+	lg	%r7, 8(up)
+	ogr	%r13, %r12
+	sllg	%r4, %r8, 0(tnc)
+	srlg	%r10, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r13, 16(rp)
+L(lm1):	stg	%r11, 24(rp)
+L(lm0):	sllg	%r12, %r7, 0(tnc)
+	aghi	rp, 32
+	srlg	%r11, %r7, 0(cnt)
+	lg	%r8, 16(up)
+	lg	%r7, 24(up)
+	aghi	up, 32
+	ogr	%r10, %r12
+	brctg	%r0, L(top)
+
+L(end):	sllg	%r4, %r8, 0(tnc)
+	srlg	%r13, %r8, 0(cnt)
+	ogr	%r11, %r4
+	stg	%r10, 0(rp)
+	stg	%r11, 8(rp)
+	sllg	%r12, %r7, 0(tnc)
+	srlg	%r11, %r7, 0(cnt)
+	ogr	%r13, %r12
+	stg	%r13, 16(rp)
+	stg	%r11, 24(rp)
+	lgr	%r2, %r9
+
+	lmg	%r6, %r13, 48(%r15)
+	br	%r14
+EPILOGUE()
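
The mirror image of mpn_lshift: limbs move toward the least significant end, and the return value carries the bits shifted out of the bottom limb, left-justified. A minimal C sketch under the same assumptions as the lshift one:

#include <stddef.h>
#include <stdint.h>

/* Shift {up,n} right by cnt bits into {rp,n}, returning the
   out-shifted bits in the high end of a limb. */
static uint64_t
rshift_ref (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
{
  unsigned tnc = 64 - cnt;
  uint64_t retval = up[0] << tnc;
  for (size_t i = 0; i + 1 < n; i++)
    rp[i] = (up[i] >> cnt) | (up[i + 1] << tnc);
  rp[n - 1] = up[n - 1] >> cnt;
  return retval;
}

Working from the bottom up is what allows rp to overlap up from below, the opposite overlap direction of mpn_lshift.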
diff --git a/third_party/gmp/mpn/s390_64/sqr_basecase.asm b/third_party/gmp/mpn/s390_64/sqr_basecase.asm
new file mode 100644
index 0000000..bf31bd5
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/sqr_basecase.asm
@@ -0,0 +1,203 @@
+dnl  S/390-64 mpn_sqr_basecase.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		 ?
+C z990		23
+C z9		 ?
+C z10		28
+C z196		 ?
+
+C TODO
+C  * Clean up.
+C  * Stop iterating the addmul_1 loop no later than at n = 2, and implement a
+C    longer tail.  This will require basecase handling of n = 3.
+C  * Update counters and pointers more straightforwardly, possibly lowering
+C    register usage.
+C  * Should we use this allocation-free style for more sqr_basecase asm
+C    implementations?  The only disadvantage is that it requires R != U.
+C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
+C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
+C    more.
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+
+define(`zero',	`%r8')
+define(`rp_saved',	`%r9')
+define(`up_saved',	`%r13')
+define(`n_saved',	`%r14')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	aghi	n, -2
+	jhe	L(ge2)
+
+C n = 1
+	lg	%r5, 0(up)
+	mlgr	%r4, %r5
+	stg	%r5, 0(rp)
+	stg	%r4, 8(rp)
+	br	%r14
+
+L(ge2):	jne	L(gen)
+
+C n = 2
+	stmg	%r6, %r8, 48(%r15)
+	lghi	zero, 0
+
+	lg	%r5, 0(up)
+	mlgr	%r4, %r5		C u0 * u0
+	lg	%r1, 8(up)
+	mlgr	%r0, %r1		C u1 * u1
+	stg	%r5, 0(rp)
+
+	lg	%r7, 0(up)
+	mlg	%r6, 8(up)		C u0 * u1
+	algr	%r7, %r7
+	alcgr	%r6, %r6
+	alcgr	%r0, zero
+
+	algr	%r4, %r7
+	alcgr	%r1, %r6
+	alcgr	%r0, zero
+	stg	%r4, 8(rp)
+	stg	%r1, 16(rp)
+	stg	%r0, 24(rp)
+
+	lmg	%r6, %r8, 48(%r15)
+	br	%r14
+
+L(gen):
+C mul_1 =======================================================================
+
+	stmg	%r6, %r14, 48(%r15)
+	lghi	zero, 0
+	lgr	up_saved, up
+	lgr	rp_saved, rp
+	lgr	n_saved, n
+
+	lg	%r6, 0(up)
+	lg	%r11, 8(up)
+	lghi	%r12, 16		C init index register
+	mlgr	%r10, %r6
+	lgr	%r5, n
+	stg	%r11, 8(rp)
+	cr	%r15, %r15		C clear carry flag
+
+L(tm):	lg	%r1, 0(%r12,up)
+	mlgr	%r0, %r6
+	alcgr	%r1, %r10
+	lgr	%r10, %r0		C copy high part to carry limb
+	stg	%r1, 0(%r12,rp)
+	la	%r12, 8(%r12)
+	brctg	%r5, L(tm)
+
+	alcgr	%r0, zero
+	stg	%r0, 0(%r12,rp)
+
+C addmul_1 loop ===============================================================
+
+	aghi	n, -1
+	je	L(outer_end)
+L(outer_loop):
+
+	la	rp, 16(rp)		C rp += 2
+	la	up, 8(up)		C up += 1
+	lg	%r6, 0(up)
+	lg	%r11, 8(up)
+	lghi	%r12, 16		C init index register
+	mlgr	%r10, %r6
+	lgr	%r5, n
+	alg	%r11, 8(rp)
+	stg	%r11, 8(rp)
+
+L(tam):	lg	%r1, 0(%r12,up)
+	lg	%r7, 0(%r12,rp)
+	mlgr	%r0, %r6
+	alcgr	%r1, %r7
+	alcgr	%r0, zero
+	algr	%r1, %r10
+	lgr	%r10, %r0
+	stg	%r1, 0(%r12,rp)
+	la	%r12, 8(%r12)
+	brctg	%r5, L(tam)
+
+	alcgr	%r0, zero
+	stg	%r0, 0(%r12,rp)
+
+	brctg	n, L(outer_loop)
+L(outer_end):
+
+	lg	%r6, 8(up)
+	lg	%r1, 16(up)
+	lgr	%r7, %r0		C Same as: lg %r7, 24(,rp)
+	mlgr	%r0, %r6
+	algr	%r1, %r7
+	alcgr	%r0, zero
+	stg	%r1, 24(rp)
+	stg	%r0, 32(rp)
+
+C sqr_diag_addlsh1 ============================================================
+
+define(`up', `up_saved')
+define(`rp', `rp_saved')
+	la	n, 1(n_saved)
+
+	lg	%r1, 0(up)
+	mlgr	%r0, %r1
+	stg	%r1, 0(rp)
+C	clr	%r15, %r15		C clear carry (already clear per above)
+
+L(top):	lg	%r11, 8(up)
+	la	up, 8(up)
+	lg	%r6, 8(rp)
+	lg	%r7, 16(rp)
+	mlgr	%r10, %r11
+	alcgr	%r6, %r6
+	alcgr	%r7, %r7
+	alcgr	%r10, zero		C propagate carry to high product limb
+	algr	%r6, %r0
+	alcgr	%r7, %r11
+	stmg	%r6, %r7, 8(rp)
+	la	rp, 16(rp)
+	lgr	%r0, %r10		C copy carry limb
+	brctg	n, L(top)
+
+	alcgr	%r0, zero
+	stg	%r0, 8(rp)
+
+	lmg	%r6, %r14, 48(%r15)
+	br	%r14
+EPILOGUE()
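
The three phases (mul_1, addmul_1 loop, sqr_diag_addlsh1) compute each cross product u[i]*u[j] once, then double the triangle and add in the diagonal squares. A C restatement of that strategy, reusing addmul_1_ref from the mul_basecase sketch above; like the asm it requires rp and up to be disjoint (the "R != U" note in the TODO):

/* {rp,2n} = {up,n}^2 via one triangle of cross products, doubled,
   plus the diagonal squares. */
static void
sqr_basecase_ref (uint64_t *rp, const uint64_t *up, size_t n)
{
  for (size_t i = 0; i < 2 * n; i++)
    rp[i] = 0;

  /* off-diagonal products u[i]*u[j], i < j, each computed once */
  for (size_t i = 0; i + 1 < n; i++)
    rp[n + i] = addmul_1_ref (rp + 2 * i + 1, up + i + 1, n - 1 - i, up[i]);

  /* double the cross products and add the diagonal squares */
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
      unsigned __int128 t;

      t = ((unsigned __int128) rp[2 * i] << 1) + (uint64_t) sq + cy;
      rp[2 * i] = (uint64_t) t;
      t = ((unsigned __int128) rp[2 * i + 1] << 1)
	  + (uint64_t) (sq >> 64) + (uint64_t) (t >> 64);
      rp[2 * i + 1] = (uint64_t) t;
      cy = (uint64_t) (t >> 64);
    }
}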
diff --git a/third_party/gmp/mpn/s390_64/sublsh1_n.asm b/third_party/gmp/mpn/s390_64/sublsh1_n.asm
new file mode 100644
index 0000000..50f127a
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/sublsh1_n.asm
@@ -0,0 +1,169 @@
+dnl  S/390-64 mpn_sublsh1_n
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		10
+C z990		 5
+C z9		 ?
+C z10		12
+C z196		 ?
+
+C TODO
+C  * Optimise for small n
+C  * Compute RETVAL for sublsh1_n less stupidly
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`vp',	`%r4')
+define(`n',	`%r5')
+
+ifdef(`OPERATION_addlsh1_n',`
+  define(ADSBR,		algr)
+  define(ADSBCR,	alcgr)
+  define(INITCY,	`lghi	%r13, -1')
+  define(RETVAL,	`la	%r2, 2(%r1,%r13)')
+  define(func, mpn_addlsh1_n)
+')
+ifdef(`OPERATION_sublsh1_n',`
+  define(ADSBR,		slgr)
+  define(ADSBCR,	slbgr)
+  define(INITCY,	`lghi	%r13, 0')
+  define(RETVAL,`dnl
+	slgr	%r1, %r13
+	lghi	%r2, 1
+	algr	%r2, %r1')
+  define(func, mpn_sublsh1_n)
+')
+
+ASM_START()
+PROLOGUE(mpn_sublsh1_n)
+	stmg	%r6, %r13, 48(%r15)
+
+	aghi	n, 3
+	lghi	%r7, 3
+	srlg	%r0, n, 2
+	ngr	%r7, n			C n mod 4
+	je	L(b1)
+	cghi	%r7, 2
+	jl	L(b2)
+	jne	L(b0)
+
+L(b3):	lmg	%r5, %r7, 0(up)
+	la	up, 24(up)
+	lmg	%r9, %r11, 0(vp)
+	la	vp, 24(vp)
+
+	algr	%r9, %r9
+	alcgr	%r10, %r10
+	alcgr	%r11, %r11
+	slbgr	%r1, %r1
+
+	ADSBR	%r5, %r9
+	ADSBCR	%r6, %r10
+	ADSBCR	%r7, %r11
+	slbgr	%r13, %r13
+
+	stmg	%r5, %r7, 0(rp)
+	la	rp, 24(rp)
+	brctg	%r0, L(top)
+	j	L(end)
+
+L(b0):	lghi	%r1, -1
+	INITCY
+	j	L(top)
+
+L(b1):	lg	%r5, 0(up)
+	la	up, 8(up)
+	lg	%r9, 0(vp)
+	la	vp, 8(vp)
+
+	algr	%r9, %r9
+	slbgr	%r1, %r1
+	ADSBR	%r5, %r9
+	slbgr	%r13, %r13
+
+	stg	%r5, 0(rp)
+	la	rp, 8(rp)
+	brctg	%r0, L(top)
+	j	L(end)
+
+L(b2):	lmg	%r5, %r6, 0(up)
+	la	up, 16(up)
+	lmg	%r9, %r10, 0(vp)
+	la	vp, 16(vp)
+
+	algr	%r9, %r9
+	alcgr	%r10, %r10
+	slbgr	%r1, %r1
+
+	ADSBR	%r5, %r9
+	ADSBCR	%r6, %r10
+	slbgr	%r13, %r13
+
+	stmg	%r5, %r6, 0(rp)
+	la	rp, 16(rp)
+	brctg	%r0, L(top)
+	j	L(end)
+
+L(top):	lmg	%r9, %r12, 0(vp)
+	la	vp, 32(vp)
+
+	aghi	%r1, 1			C restore carry
+
+	alcgr	%r9, %r9
+	alcgr	%r10, %r10
+	alcgr	%r11, %r11
+	alcgr	%r12, %r12
+
+	slbgr	%r1, %r1		C save carry
+
+	lmg	%r5, %r8, 0(up)
+	la	up, 32(up)
+
+	aghi	%r13, 1			C restore carry
+
+	ADSBCR	%r5, %r9
+	ADSBCR	%r6, %r10
+	ADSBCR	%r7, %r11
+	ADSBCR	%r8, %r12
+
+	slbgr	%r13, %r13		C save carry
+
+	stmg	%r5, %r8, 0(rp)
+	la	rp, 32(rp)
+	brctg	%r0, L(top)
+
+L(end):	RETVAL
+	lmg	%r6, %r13, 48(%r15)
+	br	%r14
+EPILOGUE()
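
As a hedged C reference for the contract: {rp,n} = {up,n} - 2*{vp,n}, returning the borrow, which can be 0, 1 or 2 because both the doubling and the subtraction can overflow the top limb. The asm keeps the two carry chains in %r1 and %r13 and combines them in RETVAL; the sketch keeps them in plain variables:

#include <stddef.h>
#include <stdint.h>

static uint64_t
sublsh1_n_ref (uint64_t *rp, const uint64_t *up,
               const uint64_t *vp, size_t n)
{
  uint64_t shift_cy = 0, borrow = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t twice = (vp[i] << 1) | shift_cy;  /* low limb of 2*v */
      shift_cy = vp[i] >> 63;                    /* bit shifted out */
      uint64_t diff = up[i] - twice;
      uint64_t b1 = up[i] < twice;
      uint64_t res = diff - borrow;
      uint64_t b2 = diff < borrow;
      rp[i] = res;
      borrow = b1 + b2;                          /* 0 or 1; never both */
    }
  return borrow + shift_cy;
}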
diff --git a/third_party/gmp/mpn/s390_64/submul_1.asm b/third_party/gmp/mpn/s390_64/submul_1.asm
new file mode 100644
index 0000000..91c4b06
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/submul_1.asm
@@ -0,0 +1,70 @@
+dnl  S/390-64 mpn_submul_1
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C z900		35
+C z990		24
+C z9		 ?
+C z10		28
+C z196		 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%r2')
+define(`up',	`%r3')
+define(`n',	`%r4')
+define(`v0',	`%r5')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	stmg	%r9, %r12, 72(%r15)
+	lghi	%r12, 0
+	slgr	%r11, %r11
+
+L(top):	lg	%r1, 0(%r12, up)
+	lg	%r10, 0(%r12, rp)
+	mlgr	%r0, v0
+	slbgr	%r10, %r1
+	slbgr	%r9, %r9
+	slgr	%r0, %r9		C conditional incr
+	slgr	%r10, %r11
+	lgr	%r11, %r0
+	stg	%r10, 0(%r12, rp)
+	la	%r12, 8(%r12)
+	brctg	%r4,  L(top)
+
+	lgr	%r2, %r11
+	slbgr	%r9, %r9
+	slgr	%r2, %r9
+
+	lmg	%r9, %r12, 72(%r15)
+	br	%r14
+EPILOGUE()
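
The slbgr/slgr pair above folds the running borrow into the next high product limb; the plain-C contract is simply this (a sketch, unsigned __int128 as before):

#include <stddef.h>
#include <stdint.h>

/* {rp,n} -= {up,n} * v0, returning the borrow limb, so that
   {rp,n} - ret*2^(64n) equals the true result. */
static uint64_t
submul_1_ref (uint64_t *rp, const uint64_t *up, size_t n, uint64_t v0)
{
  uint64_t cy = 0;
  for (size_t i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0 + cy;
      uint64_t plo = (uint64_t) p;
      uint64_t r = rp[i];
      rp[i] = r - plo;
      cy = (uint64_t) (p >> 64) + (r < plo);
    }
  return cy;
}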
diff --git a/third_party/gmp/mpn/s390_64/z10/gmp-mparam.h b/third_party/gmp/mpn/s390_64/z10/gmp-mparam.h
new file mode 100644
index 0000000..c3a9416
--- /dev/null
+++ b/third_party/gmp/mpn/s390_64/z10/gmp-mparam.h
@@ -0,0 +1,233 @@
+/* S/390-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014, 2015 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 4400 MHz IBM z10 */
+/* FFT tuning limit = 30 M */
+/* Generated by tuneup.c, 2015-10-09, gcc 4.8 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            3
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        17
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     24
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           48
+
+#define MUL_TOOM22_THRESHOLD                 9
+#define MUL_TOOM33_THRESHOLD                65
+#define MUL_TOOM44_THRESHOLD                94
+#define MUL_TOOM6H_THRESHOLD               129
+#define MUL_TOOM8H_THRESHOLD               187
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      61
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      62
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      64
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      85
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 11
+#define SQR_TOOM3_THRESHOLD                 80
+#define SQR_TOOM4_THRESHOLD                118
+#define SQR_TOOM6_THRESHOLD                189
+#define SQR_TOOM8_THRESHOLD                236
+
+#define MULMID_TOOM42_THRESHOLD             24
+
+#define MULMOD_BNM1_THRESHOLD                7
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define MUL_FFT_MODF_THRESHOLD             252  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    252, 5}, {      9, 6}, {      5, 5}, {     11, 6}, \
+    {      6, 5}, {     13, 6}, {      7, 5}, {     15, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {     13, 8}, \
+    {      7, 7}, {     15, 8}, {      9, 7}, {     19, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     15, 7}, {     31, 8}, {     19, 9}, {     11, 8}, \
+    {     27,10}, {      7, 9}, {     15, 8}, {     31, 9}, \
+    {     19, 8}, {     41, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287, 7}, \
+    {    575, 6}, {   1151,10}, {     79,11}, {     47,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    143,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 8}, {    703,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    767, 9}, \
+    {    415, 8}, {    831, 7}, {   1663,10}, {    239, 9}, \
+    {    479,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575, 8}, \
+    {   1151,10}, {    319, 9}, {    639,11}, {    175,10}, \
+    {    351, 9}, {    703, 8}, {   1407, 7}, {   2815,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    415,11}, \
+    {    223,10}, {    447, 9}, {    895,13}, {     63,11}, \
+    {    255,10}, {    575, 9}, {   1151,12}, {    159,11}, \
+    {    319,10}, {    639, 9}, {   1279,10}, {    703, 9}, \
+    {   1407,12}, {    191,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,10}, {    895,11}, {    479,13}, \
+    {    127,12}, {    255,11}, {    511,12}, {    287,10}, \
+    {   1151,12}, {    319,11}, {    703,10}, {   1407, 9}, \
+    {   2815,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,10}, {   1663,12}, {    447,11}, {    895,10}, \
+    {   1791, 9}, {   3583,12}, {    479,11}, {    959,10}, \
+    {   1919, 9}, {   3839,12}, {    511, 9}, {   4095, 6}, \
+    {  32767, 8}, {   8447,11}, {   1151,13}, {    319,12}, \
+    {    639,10}, {   2559,12}, {    703,10}, {   2815,12}, \
+    {    831,11}, {   1663,12}, {    895,11}, {   1791,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,11}, \
+    {   2047,12}, {   1215,10}, {   4863,11}, {   2559,14}, \
+    {    383,12}, {   1535,13}, {    831,12}, {   1663,13}, \
+    {    895,12}, {   1791,11}, {   3583,15}, {    255,14}, \
+    {    511,13}, {   1151,14}, {    639,13}, {   1279,12}, \
+    {   2559,13}, {   1407,12}, {   2815,14}, {    767,13}, \
+    {   1663,10}, {  13311,14}, {    895,13}, {   1791,12}, \
+    {   3583,13}, {   1919,12}, {   3839,10}, {  15359,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2559,14}, {   1407,13}, {   2815,15}, {    767,14}, \
+    {   1791,13}, {   8192,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 200
+#define MUL_FFT_THRESHOLD                 1728
+
+#define SQR_FFT_MODF_THRESHOLD             212  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    212, 5}, {      7, 4}, {     15, 5}, {      9, 4}, \
+    {     19, 6}, {      5, 5}, {     11, 6}, {      6, 5}, \
+    {     13, 6}, {      7, 5}, {     15, 6}, {      9, 5}, \
+    {     19, 6}, {     13, 7}, {      7, 6}, {     15, 7}, \
+    {      9, 6}, {     19, 7}, {     13, 8}, {      7, 7}, \
+    {     16, 8}, {      9, 7}, {     19, 8}, {     11, 7}, \
+    {     23, 8}, {     13, 9}, {      7, 8}, {     19, 9}, \
+    {     11, 8}, {     25,10}, {      7, 9}, {     15, 8}, \
+    {     31, 9}, {     23,10}, {     15, 9}, {     39,10}, \
+    {     23,11}, {     15,10}, {     31, 9}, {     63,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287, 7}, \
+    {    575,11}, {     47,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575, 7}, {   1151,11}, {     79,10}, \
+    {    159, 9}, {    319,10}, {    175, 9}, {    351, 8}, \
+    {    703, 7}, {   1407,10}, {    191, 9}, {    383,10}, \
+    {    207,11}, {    111,10}, {    223,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,11}, {    143,10}, \
+    {    287, 9}, {    575, 8}, {   1151,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,10}, {    351, 9}, \
+    {    703, 8}, {   1407,11}, {    191,10}, {    383,11}, \
+    {    207,10}, {    415,11}, {    223,10}, {    447,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    287,10}, {    575, 9}, {   1151,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703, 9}, \
+    {   1407,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,11}, {    447,10}, {    895, 9}, \
+    {   1791,13}, {    127,12}, {    255,11}, {    511,12}, \
+    {    287,11}, {    575,10}, {   1151,11}, {    607,12}, \
+    {    319,11}, {    639,12}, {    351,11}, {    703,10}, \
+    {   1407,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,10}, {   1663,12}, {    447,11}, \
+    {    895,10}, {   1791,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,10}, {   2047,11}, {   1151,12}, \
+    {    607,13}, {    319,11}, {   1279, 9}, {   5119, 8}, \
+    {  10751, 4}, { 172031, 7}, {  22015,11}, {   1407,10}, \
+    {   2943, 8}, {  11775, 9}, {   6143,12}, {    831, 8}, \
+    {  13311,11}, {   1791,14}, {    255,11}, {   2047,13}, \
+    {    575,12}, {   1151,13}, {    639,12}, {   1279,13}, \
+    {    703,12}, {   1407,11}, {   2815,12}, {   1471, 9}, \
+    {  11775,13}, {    767,12}, {   1535,13}, {    831,12}, \
+    {   1663,13}, {    895,11}, {   3583,13}, {    959,12}, \
+    {   1919,10}, {   7679, 9}, {  15359,11}, {   3967,14}, \
+    {    511,13}, {   1151,12}, {   2303,13}, {   1215,14}, \
+    {    639,13}, {   1279,12}, {   2559,14}, {    767,13}, \
+    {   1663,14}, {    895,15}, {    511,13}, {   2047,14}, \
+    {   1279,13}, {   2815,15}, {    767,14}, {   1791,13}, \
+    {   3583,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 201
+#define SQR_FFT_THRESHOLD                 1344
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD             2586
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  63
+#define SQRLO_SQR_THRESHOLD               2663
+
+#define DC_DIV_QR_THRESHOLD                 37
+#define DC_DIVAPPR_Q_THRESHOLD             143
+#define DC_BDIV_QR_THRESHOLD                37
+#define DC_BDIV_Q_THRESHOLD                 86
+
+#define INV_MULMOD_BNM1_THRESHOLD           16
+#define INV_NEWTON_THRESHOLD               147
+#define INV_APPR_THRESHOLD                 141
+
+#define BINV_NEWTON_THRESHOLD              141
+#define REDC_1_TO_REDC_N_THRESHOLD          39
+
+#define MU_DIV_QR_THRESHOLD                807
+#define MU_DIVAPPR_Q_THRESHOLD             807
+#define MUPI_DIV_QR_THRESHOLD               81
+#define MU_BDIV_QR_THRESHOLD               654
+#define MU_BDIV_Q_THRESHOLD                792
+
+#define POWM_SEC_TABLE  1,28,163,1083,2111
+
+#define GET_STR_DC_THRESHOLD                19
+#define GET_STR_PRECOMPUTE_THRESHOLD        33
+#define SET_STR_DC_THRESHOLD               898
+#define SET_STR_PRECOMPUTE_THRESHOLD      2031
+
+#define FAC_DSC_THRESHOLD                  372
+#define FAC_ODD_THRESHOLD                   23
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD_THRESHOLD                     105
+#define HGCD_APPR_THRESHOLD                111
+#define HGCD_REDUCE_THRESHOLD             1137
+#define GCD_DC_THRESHOLD                   285
+#define GCDEXT_DC_THRESHOLD                210
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/sh/add_n.asm b/third_party/gmp/mpn/sh/add_n.asm
new file mode 100644
index 0000000..79d17d0
--- /dev/null
+++ b/third_party/gmp/mpn/sh/add_n.asm
@@ -0,0 +1,59 @@
+dnl  SH mpn_add_n -- Add two limb vectors of the same length > 0 and store sum
+dnl  in a third limb vector.
+
+dnl  Copyright 1995, 1997, 2000, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp		r4
+C up		r5
+C vp		r6
+C n		r7
+
+changecom(blah)			C disable # to make all C comments below work
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	mov	#0,r3		C clear cy save reg
+
+L(top):	mov.l	@r5+,r1
+	mov.l	@r6+,r2
+	shlr	r3		C restore cy
+	addc	r2,r1
+	movt	r3		C save cy
+	mov.l	r1,@r4
+	dt	r7
+	bf.s	L(top)
+	 add	#4,r4
+
+	rts
+	mov	r3,r0		C return carry-out from most significant limb
+EPILOGUE()
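
The SH trick here is that the T flag does not survive the dt decrement, so the loop parks the carry in r3 with movt and restores it with shlr. In C the loop amounts to the following (32-bit limbs on SH; a sketch, not GMP source):

#include <stddef.h>
#include <stdint.h>

static uint32_t
add_n_ref (uint32_t *rp, const uint32_t *up, const uint32_t *vp, size_t n)
{
  uint32_t cy = 0;                 /* plays the role of r3 */
  for (size_t i = 0; i < n; i++)
    {
      uint32_t s = up[i] + cy;
      uint32_t c1 = s < cy;        /* carry from adding cy */
      uint32_t t = s + vp[i];
      uint32_t c2 = t < s;         /* carry from adding vp[i] */
      rp[i] = t;
      cy = c1 | c2;                /* at most one can be set */
    }
  return cy;
}

mpn_sub_n below is the same loop with subc in place of addc.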
diff --git a/third_party/gmp/mpn/sh/sh2/addmul_1.asm b/third_party/gmp/mpn/sh/sh2/addmul_1.asm
new file mode 100644
index 0000000..c914b29
--- /dev/null
+++ b/third_party/gmp/mpn/sh/sh2/addmul_1.asm
@@ -0,0 +1,65 @@
+dnl  SH2 mpn_addmul_1 -- Multiply a limb vector with a limb and add the result
+dnl  to a second limb vector.
+
+dnl  Copyright 1995, 2000, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r4
+C s1_ptr	r5
+C size		r6
+C s2_limb	r7
+
+changecom(blah)			C disable # to make all C comments below work
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	mov	#0,r2		C cy_limb = 0
+	mov	#0,r0		C Keep r0 = 0 for entire loop
+	clrt
+
+L(top):	mov.l	@r5+,r3
+	dmulu.l	r3,r7
+	sts	macl,r1
+	addc	r2,r1		C lo_prod += old cy_limb
+	sts	mach,r2		C new cy_limb = hi_prod
+	mov.l	@r4,r3
+	addc	r0,r2		C cy_limb += T, T = 0
+	addc	r3,r1
+	addc	r0,r2		C cy_limb += T, T = 0
+	dt	r6
+	mov.l	r1,@r4
+	bf.s	L(top)
+	add	#4,r4
+
+	rts
+	mov	r2,r0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sh/sh2/mul_1.asm b/third_party/gmp/mpn/sh/sh2/mul_1.asm
new file mode 100644
index 0000000..83548a6
--- /dev/null
+++ b/third_party/gmp/mpn/sh/sh2/mul_1.asm
@@ -0,0 +1,62 @@
+dnl  SH2 mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+dnl  in a second limb vector.
+
+dnl  Copyright 1995, 2000, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r4
+C s1_ptr	r5
+C size		r6
+C s2_limb	r7
+
+changecom(blah)			C disable # to make all C comments below work
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	mov	#0,r2		C cy_limb = 0
+	mov	#0,r0		C Keep r0 = 0 for entire loop
+	clrt
+
+L(top):	mov.l	@r5+,r3
+	dmulu.l	r3,r7
+	sts	macl,r1
+	addc	r2,r1
+	sts	mach,r2
+	addc	r0,r2		C propagate carry to cy_limb (dt clobbers T)
+	dt	r6
+	mov.l	r1,@r4
+	bf.s	L(top)
+	add	#4,r4
+
+	rts
+	mov	r2,r0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sh/sh2/submul_1.asm b/third_party/gmp/mpn/sh/sh2/submul_1.asm
new file mode 100644
index 0000000..bef2abd
--- /dev/null
+++ b/third_party/gmp/mpn/sh/sh2/submul_1.asm
@@ -0,0 +1,65 @@
+dnl  SH2 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl  result from a second limb vector.
+
+dnl  Copyright 1995, 2000, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	r4
+C s1_ptr	r5
+C size		r6
+C s2_limb	r7
+
+changecom(blah)			C disable # to make all C comments below work
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	mov	#0,r2		C cy_limb = 0
+	mov	#0,r0		C Keep r0 = 0 for entire loop
+	clrt
+
+L(top):	mov.l	@r5+,r3
+	dmulu.l	r3,r7
+	sts	macl,r1
+	addc	r2,r1		C lo_prod += old cy_limb
+	sts	mach,r2		C new cy_limb = hi_prod
+	mov.l	@r4,r3
+	addc	r0,r2		C cy_limb += T, T = 0
+	subc	r1,r3
+	addc	r0,r2		C cy_limb += T, T = 0
+	dt	r6
+	mov.l	r3,@r4
+	bf.s	L(top)
+	add	#4,r4
+
+	rts
+	mov	r2,r0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sh/sub_n.asm b/third_party/gmp/mpn/sh/sub_n.asm
new file mode 100644
index 0000000..465bc80
--- /dev/null
+++ b/third_party/gmp/mpn/sh/sub_n.asm
@@ -0,0 +1,59 @@
+dnl  SH mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
+dnl  difference in a third limb vector.
+
+dnl  Copyright 1995, 1997, 2000, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp		r4
+C up		r5
+C vp		r6
+C n		r7
+
+changecom(blah)			C disable # to make all C comments below work
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	mov	#0,r3		C clear cy save reg
+
+L(top):	mov.l	@r5+,r1
+	mov.l	@r6+,r2
+	shlr	r3		C restore cy
+	subc	r2,r1
+	movt	r3		C save cy
+	mov.l	r1,@r4
+	dt	r7
+	bf.s	L(top)
+	 add	#4,r4
+
+	rts
+	mov	r3,r0		C return carry-out from most significant limb
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/README b/third_party/gmp/mpn/sparc32/README
new file mode 100644
index 0000000..f2dd116
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/README
@@ -0,0 +1,71 @@
+Copyright 1996, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for various SPARC chips.  Code that
+runs only on version 8 SPARC implementations is in the v8 subdirectory.
+
+RELEVANT OPTIMIZATION ISSUES
+
+  Load and Store timing
+
+On most early SPARC implementations, the ST instruction takes multiple
+cycles, while a STD takes just a single cycle more than an ST.  For the CPUs
+in SPARCstation I and II, the times are 3 and 4 cycles, respectively.
+Therefore, combining two ST instructions into a STD when possible is a
+significant optimization.
+
+Later SPARC implementations have single cycle ST.
+
+For SuperSPARC, we can perform just one memory instruction per cycle, even
+though up to two integer instructions can be executed in its pipeline.  When a
+program performs so many memory operations that there are not enough
+non-memory operations to issue in parallel with them, using LDD and STD where
+possible helps.
+
+UltraSPARC-1/2 has very slow integer multiplication.  In the v9 subdirectory,
+we therefore use floating-point multiplication.
+
+STATUS
+
+1. On a SuperSPARC, mpn_lshift and mpn_rshift run at 3 cycles/limb, or 2.5
+   cycles/limb asymptotically.  We could optimize speed for special counts
+   by using ADDXCC.
+
+2. On a SuperSPARC, mpn_add_n and mpn_sub_n run at 2.5 cycles/limb, or 2
+   cycles/limb asymptotically.
+
+3. mpn_mul_1 runs at what is believed to be optimal speed.
+
+4. On SuperSPARC, mpn_addmul_1 and mpn_submul_1 could both be improved by a
+   cycle by avoiding one of the add instructions.  See a29k/addmul_1.
+
+The speed of the code for other SPARC implementations is uncertain.
diff --git a/third_party/gmp/mpn/sparc32/add_n.asm b/third_party/gmp/mpn/sparc32/add_n.asm
new file mode 100644
index 0000000..8549195
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/add_n.asm
@@ -0,0 +1,245 @@
+dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	xor	s2_ptr,res_ptr,%g1
+	andcc	%g1,4,%g0
+	bne	L(1)			C branch if alignment differs
+	nop
+C **  V1a  **
+L(0):	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
+	be	L(v1)			C if no, branch
+	nop
+C Add least significant limb separately to align res_ptr and s2_ptr
+	ld	[s1_ptr],%g4
+	add	s1_ptr,4,s1_ptr
+	ld	[s2_ptr],%g2
+	add	s2_ptr,4,s2_ptr
+	add	n,-1,n
+	addcc	%g4,%g2,%o4
+	st	%o4,[res_ptr]
+	add	res_ptr,4,res_ptr
+L(v1):	addx	%g0,%g0,%o4		C save cy in register
+	cmp	n,2			C if n < 2 ...
+	bl	L(end2)			C ... branch to tail code
+	subcc	%g0,%o4,%g0		C restore cy
+
+	ld	[s1_ptr+0],%g4
+	addcc	n,-10,n
+	ld	[s1_ptr+4],%g1
+	ldd	[s2_ptr+0],%g2
+	blt	L(fin1)
+	subcc	%g0,%o4,%g0		C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+	addxcc	%g4,%g2,%o4
+	ld	[s1_ptr+8],%g4
+	addxcc	%g1,%g3,%o5
+	ld	[s1_ptr+12],%g1
+	ldd	[s2_ptr+8],%g2
+	std	%o4,[res_ptr+0]
+	addxcc	%g4,%g2,%o4
+	ld	[s1_ptr+16],%g4
+	addxcc	%g1,%g3,%o5
+	ld	[s1_ptr+20],%g1
+	ldd	[s2_ptr+16],%g2
+	std	%o4,[res_ptr+8]
+	addxcc	%g4,%g2,%o4
+	ld	[s1_ptr+24],%g4
+	addxcc	%g1,%g3,%o5
+	ld	[s1_ptr+28],%g1
+	ldd	[s2_ptr+24],%g2
+	std	%o4,[res_ptr+16]
+	addxcc	%g4,%g2,%o4
+	ld	[s1_ptr+32],%g4
+	addxcc	%g1,%g3,%o5
+	ld	[s1_ptr+36],%g1
+	ldd	[s2_ptr+32],%g2
+	std	%o4,[res_ptr+24]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	add	s1_ptr,32,s1_ptr
+	add	s2_ptr,32,s2_ptr
+	add	res_ptr,32,res_ptr
+	bge	L(loop1)
+	subcc	%g0,%o4,%g0		C restore cy
+
+L(fin1):
+	addcc	n,8-2,n
+	blt	L(end1)
+	subcc	%g0,%o4,%g0		C restore cy
+C Add blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+	addxcc	%g4,%g2,%o4
+	ld	[s1_ptr+8],%g4
+	addxcc	%g1,%g3,%o5
+	ld	[s1_ptr+12],%g1
+	ldd	[s2_ptr+8],%g2
+	std	%o4,[res_ptr+0]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-2,n
+	add	s1_ptr,8,s1_ptr
+	add	s2_ptr,8,s2_ptr
+	add	res_ptr,8,res_ptr
+	bge	L(loope1)
+	subcc	%g0,%o4,%g0		C restore cy
+L(end1):
+	addxcc	%g4,%g2,%o4
+	addxcc	%g1,%g3,%o5
+	std	%o4,[res_ptr+0]
+	addx	%g0,%g0,%o4		C save cy in register
+
+	andcc	n,1,%g0
+	be	L(ret1)
+	subcc	%g0,%o4,%g0		C restore cy
+C Add last limb
+	ld	[s1_ptr+8],%g4
+	ld	[s2_ptr+8],%g2
+	addxcc	%g4,%g2,%o4
+	st	%o4,[res_ptr+8]
+
+L(ret1):
+	retl
+	addx	%g0,%g0,%o0	C return carry-out from most sign. limb
+
+L(1):	xor	s1_ptr,res_ptr,%g1
+	andcc	%g1,4,%g0
+	bne	L(2)
+	nop
+C **  V1b  **
+	mov	s2_ptr,%g1
+	mov	s1_ptr,s2_ptr
+	b	L(0)
+	mov	%g1,s1_ptr
+
+C **  V2  **
+C If we come here, the alignment of s1_ptr and res_ptr differs, and so
+C does the alignment of s2_ptr and res_ptr.  Since there are only two
+C alignments we care about, the alignment of s1_ptr and s2_ptr must be
+C the same.
+
+L(2):	cmp	n,1
+	be	L(jone)
+	nop
+	andcc	s1_ptr,4,%g0		C s1_ptr unaligned? Side effect: cy=0
+	be	L(v2)			C if no, branch
+	nop
+C Add least significant limb separately to align s1_ptr and s2_ptr
+	ld	[s1_ptr],%g4
+	add	s1_ptr,4,s1_ptr
+	ld	[s2_ptr],%g2
+	add	s2_ptr,4,s2_ptr
+	add	n,-1,n
+	addcc	%g4,%g2,%o4
+	st	%o4,[res_ptr]
+	add	res_ptr,4,res_ptr
+
+L(v2):	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	blt	L(fin2)
+	subcc	%g0,%o4,%g0		C restore cy
+C Add blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+	ldd	[s1_ptr+0],%g2
+	ldd	[s2_ptr+0],%o4
+	addxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+0]
+	addxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+4]
+	ldd	[s1_ptr+8],%g2
+	ldd	[s2_ptr+8],%o4
+	addxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+8]
+	addxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+12]
+	ldd	[s1_ptr+16],%g2
+	ldd	[s2_ptr+16],%o4
+	addxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+16]
+	addxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+20]
+	ldd	[s1_ptr+24],%g2
+	ldd	[s2_ptr+24],%o4
+	addxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+24]
+	addxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+28]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	add	s1_ptr,32,s1_ptr
+	add	s2_ptr,32,s2_ptr
+	add	res_ptr,32,res_ptr
+	bge	L(loop2)
+	subcc	%g0,%o4,%g0		C restore cy
+
+L(fin2):
+	addcc	n,8-2,n
+	blt	L(end2)
+	subcc	%g0,%o4,%g0		C restore cy
+L(loope2):
+	ldd	[s1_ptr+0],%g2
+	ldd	[s2_ptr+0],%o4
+	addxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+0]
+	addxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+4]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-2,n
+	add	s1_ptr,8,s1_ptr
+	add	s2_ptr,8,s2_ptr
+	add	res_ptr,8,res_ptr
+	bge	L(loope2)
+	subcc	%g0,%o4,%g0		C restore cy
+L(end2):
+	andcc	n,1,%g0
+	be	L(ret2)
+	subcc	%g0,%o4,%g0		C restore cy
+C Add last limb
+L(jone):
+	ld	[s1_ptr],%g4
+	ld	[s2_ptr],%g2
+	addxcc	%g4,%g2,%o4
+	st	%o4,[res_ptr]
+
+L(ret2):
+	retl
+	addx	%g0,%g0,%o0	C return carry-out from most sign. limb
+EPILOGUE(mpn_add_n)
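
For readers tracing the V1a/V1b/V2 paths above: all of them implement the
same operation, limb-wise addition with carry propagation, and differ only
in how they pair limbs into LDD/STD accesses.  A minimal C sketch of the
semantics (not of the unrolling or alignment handling), with uint32_t
standing in for the 32-bit limb type, might be:

  #include <stdint.h>

  /* Reference-only sketch: rp[] = ap[] + bp[], returning the carry-out,
     as mpn_add_n does for 32-bit limbs. */
  static uint32_t ref_add_n(uint32_t *rp, const uint32_t *ap,
                            const uint32_t *bp, long n)
  {
      uint64_t acc = 0;                       /* carry lives in bit 32 */
      for (long i = 0; i < n; i++) {
          acc += (uint64_t) ap[i] + bp[i];
          rp[i] = (uint32_t) acc;             /* low 32 bits */
          acc >>= 32;                         /* keep only the carry */
      }
      return (uint32_t) acc;
  }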
diff --git a/third_party/gmp/mpn/sparc32/addmul_1.asm b/third_party/gmp/mpn/sparc32/addmul_1.asm
new file mode 100644
index 0000000..92d5d78
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/addmul_1.asm
@@ -0,0 +1,155 @@
+dnl  SPARC mpn_addmul_1 -- Multiply a limb vector with a limb and add the
+dnl  result to a second limb vector.
+
+dnl  Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	o0
+C s1_ptr	o1
+C size		o2
+C s2_limb	o3
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	C Make S1_PTR and RES_PTR point at the end of their blocks
+	C and put (- 4 x SIZE) in index/loop counter.
+	sll	%o2,2,%o2
+	add	%o0,%o2,%o4	C RES_PTR in o4 since o0 is retval
+	add	%o1,%o2,%o1
+	sub	%g0,%o2,%o2
+
+	cmp	%o3,0xfff
+	bgu	L(large)
+	nop
+
+	ld	[%o1+%o2],%o5
+	mov	0,%o0
+	b	L(0)
+	 add	%o4,-4,%o4
+L(loop0):
+	addcc	%o5,%g1,%g1
+	ld	[%o1+%o2],%o5
+	addx	%o0,%g0,%o0
+	st	%g1,[%o4+%o2]
+L(0):	wr	%g0,%o3,%y
+	sra	%o5,31,%g2
+	and	%o3,%g2,%g2
+	andcc	%g1,0,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,0,%g1
+	sra	%g1,20,%g4
+	sll	%g1,12,%g1
+	rd	%y,%g3
+	srl	%g3,20,%g3
+	or	%g1,%g3,%g1
+
+	addcc	%g1,%o0,%g1
+	addx	%g2,%g4,%o0	C add sign-compensation and cy to hi limb
+	addcc	%o2,4,%o2	C loop counter
+	bne	L(loop0)
+	 ld	[%o4+%o2],%o5
+
+	addcc	%o5,%g1,%g1
+	addx	%o0,%g0,%o0
+	retl
+	st	%g1,[%o4+%o2]
+
+L(large):
+	ld	[%o1+%o2],%o5
+	mov	0,%o0
+	sra	%o3,31,%g4	C g4 = mask of ones iff S2_LIMB < 0
+	b	L(1)
+	 add	%o4,-4,%o4
+L(loop):
+	addcc	%o5,%g3,%g3
+	ld	[%o1+%o2],%o5
+	addx	%o0,%g0,%o0
+	st	%g3,[%o4+%o2]
+L(1):	wr	%g0,%o5,%y
+	and	%o5,%g4,%g2
+	andcc	%g0,%g0,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%g0,%g1
+	rd	%y,%g3
+	addcc	%g3,%o0,%g3
+	addx	%g2,%g1,%o0
+	addcc	%o2,4,%o2
+	bne	L(loop)
+	 ld	[%o4+%o2],%o5
+
+	addcc	%o5,%g3,%g3
+	addx	%o0,%g0,%o0
+	retl
+	st	%g3,[%o4+%o2]
+EPILOGUE(mpn_addmul_1)
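
The mulscc ladder above computes one 32x32->64 product per limb; the
surrounding addcc/addx chain folds it into the destination limb plus the
running carry.  A hedged C sketch of the overall semantics:

  #include <stdint.h>

  /* Reference-only sketch: rp[] += up[] * v, returning the carry-out
     limb, as mpn_addmul_1 does for 32-bit limbs. */
  static uint32_t ref_addmul_1(uint32_t *rp, const uint32_t *up,
                               long n, uint32_t v)
  {
      uint32_t cy = 0;
      for (long i = 0; i < n; i++) {
          uint64_t t = (uint64_t) up[i] * v + rp[i] + cy;
          rp[i] = (uint32_t) t;
          cy = (uint32_t) (t >> 32);
      }
      return cy;
  }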
diff --git a/third_party/gmp/mpn/sparc32/gmp-mparam.h b/third_party/gmp/mpn/sparc32/gmp-mparam.h
new file mode 100644
index 0000000..a3bc612
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/gmp-mparam.h
@@ -0,0 +1,67 @@
+/* SPARC v7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* Generated by tuneup.c, 2002-03-13, gcc 2.95, Weitek 8701 */
+
+#define MUL_TOOM22_THRESHOLD              8
+#define MUL_TOOM33_THRESHOLD            466
+
+#define SQR_BASECASE_THRESHOLD            4
+#define SQR_TOOM2_THRESHOLD              16
+#define SQR_TOOM3_THRESHOLD             258
+
+#define DIV_SB_PREINV_THRESHOLD           4
+#define DIV_DC_THRESHOLD                 28
+#define POWM_THRESHOLD                   28
+
+#define GCD_ACCEL_THRESHOLD               3
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           3
+#define DIVREM_1_UNNORM_THRESHOLD         4
+#define MOD_1_NORM_THRESHOLD              3
+#define MOD_1_UNNORM_THRESHOLD            4
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD            120
+#define MODEXACT_1_ODD_THRESHOLD      MP_SIZE_T_MAX  /* never */
+
+#define GET_STR_DC_THRESHOLD             21
+#define GET_STR_PRECOMPUTE_THRESHOLD     25
+#define SET_STR_THRESHOLD              1012
+
+#define MUL_FFT_TABLE  { 272, 672, 1152, 3584, 10240, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD          264
+#define MUL_FFT_THRESHOLD              2304
+
+#define SQR_FFT_TABLE  { 304, 736, 1152, 3584, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD          248
+#define SQR_FFT_THRESHOLD              2304
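
These thresholds are consumed by GMP's generic mpn code to pick an
algorithm from the operand size in limbs.  A hypothetical dispatcher
sketching how the multiply thresholds above are read (the function and
string names are illustrative, not GMP internals):

  #include <stdio.h>

  /* Below MUL_TOOM22_THRESHOLD limbs the O(n^2) basecase wins; between
     the two thresholds, Karatsuba (toom22); above that, Toom-3. */
  static const char *pick_mul(size_t n)
  {
      if (n < 8)   return "basecase";   /* MUL_TOOM22_THRESHOLD */
      if (n < 466) return "toom22";     /* MUL_TOOM33_THRESHOLD */
      return "toom33";
  }

  int main(void) { printf("%s\n", pick_mul(100)); }  /* -> toom22 */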
diff --git a/third_party/gmp/mpn/sparc32/lshift.asm b/third_party/gmp/mpn/sparc32/lshift.asm
new file mode 100644
index 0000000..8321343
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/lshift.asm
@@ -0,0 +1,105 @@
+dnl  SPARC mpn_lshift -- Shift a number left.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	%o0
+C src_ptr	%o1
+C size		%o2
+C cnt		%o3
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	sll	%o2,2,%g1
+	add	%o1,%g1,%o1	C make %o1 point at end of src
+	ld	[%o1-4],%g2	C load first limb
+	sub	%g0,%o3,%o5	C negate shift count
+	add	%o0,%g1,%o0	C make %o0 point at end of res
+	add	%o2,-1,%o2
+	andcc	%o2,4-1,%g4	C number of limbs in first loop
+	srl	%g2,%o5,%g1	C compute function result
+	be	L(0)		C if multiple of 4 limbs, skip first loop
+	st	%g1,[%sp+80]
+
+	sub	%o2,%g4,%o2	C adjust count for main loop
+
+L(loop0):
+	ld	[%o1-8],%g3
+	add	%o0,-4,%o0
+	add	%o1,-4,%o1
+	addcc	%g4,-1,%g4
+	sll	%g2,%o3,%o4
+	srl	%g3,%o5,%g1
+	mov	%g3,%g2
+	or	%o4,%g1,%o4
+	bne	L(loop0)
+	 st	%o4,[%o0+0]
+
+L(0):	tst	%o2
+	be	L(end)
+	 nop
+
+L(loop):
+	ld	[%o1-8],%g3
+	add	%o0,-16,%o0
+	addcc	%o2,-4,%o2
+	sll	%g2,%o3,%o4
+	srl	%g3,%o5,%g1
+
+	ld	[%o1-12],%g2
+	sll	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	st	%o4,[%o0+12]
+	srl	%g2,%o5,%g1
+
+	ld	[%o1-16],%g3
+	sll	%g2,%o3,%o4
+	or	%g4,%g1,%g4
+	st	%g4,[%o0+8]
+	srl	%g3,%o5,%g1
+
+	ld	[%o1-20],%g2
+	sll	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	st	%o4,[%o0+4]
+	srl	%g2,%o5,%g1
+
+	add	%o1,-16,%o1
+	or	%g4,%g1,%g4
+	bne	L(loop)
+	 st	%g4,[%o0+0]
+
+L(end):	sll	%g2,%o3,%g2
+	st	%g2,[%o0-4]
+	retl
+	ld	[%sp+80],%o0
+EPILOGUE(mpn_lshift)
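
The loop above works from the most significant limb downward, which lets
the destination overlap the source in the direction mpn_lshift permits.
A C sketch of what it computes, assuming a shift count in 1..31:

  #include <stdint.h>

  /* Reference-only sketch: rp[] = up[] << cnt, returning the bits
     shifted out of the top limb, for 1 <= cnt <= 31. */
  static uint32_t ref_lshift(uint32_t *rp, const uint32_t *up,
                             long n, unsigned cnt)
  {
      uint32_t ret = up[n - 1] >> (32 - cnt);   /* function result */
      for (long i = n - 1; i > 0; i--)
          rp[i] = (up[i] << cnt) | (up[i - 1] >> (32 - cnt));
      rp[0] = up[0] << cnt;
      return ret;
  }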
diff --git a/third_party/gmp/mpn/sparc32/mul_1.asm b/third_party/gmp/mpn/sparc32/mul_1.asm
new file mode 100644
index 0000000..42b4168
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/mul_1.asm
@@ -0,0 +1,146 @@
+dnl  SPARC mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	o0
+C s1_ptr	o1
+C size		o2
+C s2_limb	o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	C Make S1_PTR and RES_PTR point at the end of their blocks
+	C and put (- 4 x SIZE) in index/loop counter.
+	sll	%o2,2,%o2
+	add	%o0,%o2,%o4	C RES_PTR in o4 since o0 is retval
+	add	%o1,%o2,%o1
+	sub	%g0,%o2,%o2
+
+	cmp	%o3,0xfff
+	bgu	L(large)
+	nop
+
+	ld	[%o1+%o2],%o5
+	mov	0,%o0
+	b	L(0)
+	 add	%o4,-4,%o4
+L(loop0):
+	st	%g1,[%o4+%o2]
+L(0):	wr	%g0,%o3,%y
+	sra	%o5,31,%g2
+	and	%o3,%g2,%g2
+	andcc	%g1,0,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,0,%g1
+	sra	%g1,20,%g4
+	sll	%g1,12,%g1
+	rd	%y,%g3
+	srl	%g3,20,%g3
+	or	%g1,%g3,%g1
+
+	addcc	%g1,%o0,%g1
+	addx	%g2,%g4,%o0	C add sign-compensation and cy to hi limb
+	addcc	%o2,4,%o2	C loop counter
+	bne,a	L(loop0)
+	 ld	[%o1+%o2],%o5
+
+	retl
+	st	%g1,[%o4+%o2]
+
+
+L(large):
+	ld	[%o1+%o2],%o5
+	mov	0,%o0
+	sra	%o3,31,%g4	C g4 = mask of ones iff S2_LIMB < 0
+	b	L(1)
+	 add	%o4,-4,%o4
+L(loop):
+	st	%g3,[%o4+%o2]
+L(1):	wr	%g0,%o5,%y
+	and	%o5,%g4,%g2	C g2 = S1_LIMB iff S2_LIMB < 0, else 0
+	andcc	%g0,%g0,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%g0,%g1
+	rd	%y,%g3
+	addcc	%g3,%o0,%g3
+	addx	%g2,%g1,%o0	C add sign-compensation and cy to hi limb
+	addcc	%o2,4,%o2	C loop counter
+	bne,a	L(loop)
+	 ld	[%o1+%o2],%o5
+
+	retl
+	st	%g3,[%o4+%o2]
+EPILOGUE(mpn_mul_1)
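
Two details of the routine above are worth spelling out.  The 0xfff check
selects a short ladder: for a multiplier of at most 12 bits, 12 mulscc
steps (plus the final fix-up step) already consume every multiplier bit.
And since mulscc forms a signed product while limbs are unsigned, the
sra/and "sign-compensation" adds the other operand into the high word
whenever an operand has its top bit set.  A C sketch of that identity,
assuming two's-complement arithmetic:

  #include <stdint.h>

  /* The unsigned 32x32->64 product recovered from the signed one:
     u = s + (a<0 ? b<<32 : 0) + (b<0 ? a<<32 : 0)  (mod 2^64). */
  static uint64_t unsigned_from_signed_mul(uint32_t a, uint32_t b)
  {
      uint64_t s = (uint64_t) ((int64_t) (int32_t) a * (int32_t) b);
      if ((int32_t) a < 0) s += (uint64_t) b << 32;
      if ((int32_t) b < 0) s += (uint64_t) a << 32;
      return s;                     /* == (uint64_t) a * b */
  }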
diff --git a/third_party/gmp/mpn/sparc32/rshift.asm b/third_party/gmp/mpn/sparc32/rshift.asm
new file mode 100644
index 0000000..e155476
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/rshift.asm
@@ -0,0 +1,102 @@
+dnl  SPARC mpn_rshift -- Shift a number right.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	%o0
+C src_ptr	%o1
+C size		%o2
+C cnt		%o3
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	ld	[%o1],%g2	C load first limb
+	sub	%g0,%o3,%o5	C negate shift count
+	add	%o2,-1,%o2
+	andcc	%o2,4-1,%g4	C number of limbs in first loop
+	sll	%g2,%o5,%g1	C compute function result
+	be	L(0)		C if multiple of 4 limbs, skip first loop
+	st	%g1,[%sp+80]
+
+	sub	%o2,%g4,%o2	C adjust count for main loop
+
+L(loop0):
+	ld	[%o1+4],%g3
+	add	%o0,4,%o0
+	add	%o1,4,%o1
+	addcc	%g4,-1,%g4
+	srl	%g2,%o3,%o4
+	sll	%g3,%o5,%g1
+	mov	%g3,%g2
+	or	%o4,%g1,%o4
+	bne	L(loop0)
+	 st	%o4,[%o0-4]
+
+L(0):	tst	%o2
+	be	L(end)
+	 nop
+
+L(loop):
+	ld	[%o1+4],%g3
+	add	%o0,16,%o0
+	addcc	%o2,-4,%o2
+	srl	%g2,%o3,%o4
+	sll	%g3,%o5,%g1
+
+	ld	[%o1+8],%g2
+	srl	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	st	%o4,[%o0-16]
+	sll	%g2,%o5,%g1
+
+	ld	[%o1+12],%g3
+	srl	%g2,%o3,%o4
+	or	%g4,%g1,%g4
+	st	%g4,[%o0-12]
+	sll	%g3,%o5,%g1
+
+	ld	[%o1+16],%g2
+	srl	%g3,%o3,%g4
+	or	%o4,%g1,%o4
+	st	%o4,[%o0-8]
+	sll	%g2,%o5,%g1
+
+	add	%o1,16,%o1
+	or	%g4,%g1,%g4
+	bne	L(loop)
+	 st	%g4,[%o0-4]
+
+L(end):	srl	%g2,%o3,%g2
+	st	%g2,[%o0-0]
+	retl
+	ld	[%sp+80],%o0
+EPILOGUE(mpn_rshift)
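
Symmetrically to mpn_lshift, this loop walks from the least significant
limb upward.  A C sketch of the semantics, again for counts 1..31:

  #include <stdint.h>

  /* Reference-only sketch: rp[] = up[] >> cnt, returning the bits
     shifted out of the low limb (in the high bit positions). */
  static uint32_t ref_rshift(uint32_t *rp, const uint32_t *up,
                             long n, unsigned cnt)
  {
      uint32_t ret = up[0] << (32 - cnt);       /* function result */
      for (long i = 0; i < n - 1; i++)
          rp[i] = (up[i] >> cnt) | (up[i + 1] << (32 - cnt));
      rp[n - 1] = up[n - 1] >> cnt;
      return ret;
  }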
diff --git a/third_party/gmp/mpn/sparc32/sparc-defs.m4 b/third_party/gmp/mpn/sparc32/sparc-defs.m4
new file mode 100644
index 0000000..33a0c53
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/sparc-defs.m4
@@ -0,0 +1,93 @@
+divert(-1)
+
+dnl  m4 macros for SPARC assembler (32 and 64 bit).
+
+
+dnl  Copyright 2002, 2011, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+changecom(;)	dnl cannot use default # since that's used in REGISTER decls
+
+
+dnl  Usage: REGISTER(reg,attr)
+dnl
+dnl  Give a ".register reg,attr" directive, if the assembler supports it.
+dnl  HAVE_REGISTER comes from the GMP_ASM_SPARC_REGISTER configure test.
+
+define(REGISTER,
+m4_assert_numargs(2)
+m4_assert_defined(`HAVE_REGISTER')
+`ifelse(HAVE_REGISTER,yes,
+`.register `$1',`$2'')')
+
+
+C Testing mechanism for running newer code on older processors
+ifdef(`FAKE_T3',`
+  include_mpn(`sparc64/ultrasparct3/missing.m4')
+',`
+  define(`addxccc',	``addxccc'	$1, $2, $3')
+  define(`addxc',	``addxc'	$1, $2, $3')
+  define(`umulxhi',	``umulxhi'	$1, $2, $3')
+  define(`lzcnt',	``lzd'	$1, $2')
+')
+
+dnl  Usage: LEA64(symbol,reg,pic_reg)
+dnl
+dnl  Use whatever 64-bit code sequence is appropriate to load "symbol" into
+dnl  register "reg", potentially using register "pic_reg" to perform the
+dnl  calculations.
+dnl
+dnl  Caveat: We used to use the setx pseudo insn here, but some GNU/Linux
+dnl  releases cause invalid code or relocs for it.
+dnl
+dnl  Optimisation 1: Use a thunk call instead of RDPC, which causes
+dnl  pipeline replay on some SPARCs.
+dnl
+dnl  Optimisation 2: Do the two symbol building sequences in parallel instead
+dnl  of one after the other.  That might need one more scratch register.
+
+define(LEA64,
+m4_assert_numargs(3)
+m4_assert_defined(`HAVE_GOTDATA')
+`ifdef(`PIC',`
+	rd	%pc, %`$2'
+	sethi	%hi(_GLOBAL_OFFSET_TABLE_+4), %`$3'
+	add	%`$3', %lo(_GLOBAL_OFFSET_TABLE_+8), %`$3'
+	add	%`$2', %`$3', %`$3'
+	sethi	%gdop_hix22(`$1'), %`$2'
+	xor	%`$2', %gdop_lox10(`$1'), %`$2'
+	ldx	[%`$3' + %`$2'], %`$2', %gdop(`$1')
+',`
+	sethi	%h44(`$1'), %`$2'
+	or	%`$2', %m44(`$1'), %`$2'
+	sllx	%`$2', 12, %`$2'
+	or	%`$2', %l44(`$1'), %$2
+')')
+
+divert
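
The non-PIC branch of LEA64 assembles a 44-bit absolute address from the
%h44/%m44/%l44 relocation slices.  A C sketch of the bit-slicing the
four-instruction sequence performs (sketch only; the relocations
themselves are resolved by the linker):

  #include <stdint.h>

  /* sethi %h44 fills bits 43..22, or %m44 adds bits 21..12, sllx 12
     shifts them into place, or %l44 adds bits 11..0. */
  static uint64_t lea44(uint64_t sym)       /* assumes sym < 2^44 */
  {
      uint64_t h44 = (sym >> 22) & 0x3fffff;
      uint64_t m44 = (sym >> 12) & 0x3ff;
      uint64_t l44 = sym & 0xfff;
      return (((h44 << 10) | m44) << 12) | l44;
  }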
diff --git a/third_party/gmp/mpn/sparc32/sub_n.asm b/third_party/gmp/mpn/sparc32/sub_n.asm
new file mode 100644
index 0000000..24a576d
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/sub_n.asm
@@ -0,0 +1,335 @@
+dnl  SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(res_ptr,%o0)
+define(s1_ptr,%o1)
+define(s2_ptr,%o2)
+define(n,%o3)
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	xor	s2_ptr,res_ptr,%g1
+	andcc	%g1,4,%g0
+	bne	L(1)			C branch if alignment differs
+	nop
+C **  V1a  **
+	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
+	be	L(v1)			C if no, branch
+	nop
+C Subtract least significant limb separately to align res_ptr and s2_ptr
+	ld	[s1_ptr],%g4
+	add	s1_ptr,4,s1_ptr
+	ld	[s2_ptr],%g2
+	add	s2_ptr,4,s2_ptr
+	add	n,-1,n
+	subcc	%g4,%g2,%o4
+	st	%o4,[res_ptr]
+	add	res_ptr,4,res_ptr
+L(v1):	addx	%g0,%g0,%o4		C save cy in register
+	cmp	n,2			C if n < 2 ...
+	bl	L(end2)			C ... branch to tail code
+	subcc	%g0,%o4,%g0		C restore cy
+
+	ld	[s1_ptr+0],%g4
+	addcc	n,-10,n
+	ld	[s1_ptr+4],%g1
+	ldd	[s2_ptr+0],%g2
+	blt	L(fin1)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract blocks of 8 limbs until less than 8 limbs remain
+L(loop1):
+	subxcc	%g4,%g2,%o4
+	ld	[s1_ptr+8],%g4
+	subxcc	%g1,%g3,%o5
+	ld	[s1_ptr+12],%g1
+	ldd	[s2_ptr+8],%g2
+	std	%o4,[res_ptr+0]
+	subxcc	%g4,%g2,%o4
+	ld	[s1_ptr+16],%g4
+	subxcc	%g1,%g3,%o5
+	ld	[s1_ptr+20],%g1
+	ldd	[s2_ptr+16],%g2
+	std	%o4,[res_ptr+8]
+	subxcc	%g4,%g2,%o4
+	ld	[s1_ptr+24],%g4
+	subxcc	%g1,%g3,%o5
+	ld	[s1_ptr+28],%g1
+	ldd	[s2_ptr+24],%g2
+	std	%o4,[res_ptr+16]
+	subxcc	%g4,%g2,%o4
+	ld	[s1_ptr+32],%g4
+	subxcc	%g1,%g3,%o5
+	ld	[s1_ptr+36],%g1
+	ldd	[s2_ptr+32],%g2
+	std	%o4,[res_ptr+24]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	add	s1_ptr,32,s1_ptr
+	add	s2_ptr,32,s2_ptr
+	add	res_ptr,32,res_ptr
+	bge	L(loop1)
+	subcc	%g0,%o4,%g0		C restore cy
+
+L(fin1):
+	addcc	n,8-2,n
+	blt	L(end1)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract blocks of 2 limbs until less than 2 limbs remain
+L(loope1):
+	subxcc	%g4,%g2,%o4
+	ld	[s1_ptr+8],%g4
+	subxcc	%g1,%g3,%o5
+	ld	[s1_ptr+12],%g1
+	ldd	[s2_ptr+8],%g2
+	std	%o4,[res_ptr+0]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-2,n
+	add	s1_ptr,8,s1_ptr
+	add	s2_ptr,8,s2_ptr
+	add	res_ptr,8,res_ptr
+	bge	L(loope1)
+	subcc	%g0,%o4,%g0		C restore cy
+L(end1):
+	subxcc	%g4,%g2,%o4
+	subxcc	%g1,%g3,%o5
+	std	%o4,[res_ptr+0]
+	addx	%g0,%g0,%o4		C save cy in register
+
+	andcc	n,1,%g0
+	be	L(ret1)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract last limb
+	ld	[s1_ptr+8],%g4
+	ld	[s2_ptr+8],%g2
+	subxcc	%g4,%g2,%o4
+	st	%o4,[res_ptr+8]
+
+L(ret1):
+	retl
+	addx	%g0,%g0,%o0	C return carry-out from most sign. limb
+
+L(1):	xor	s1_ptr,res_ptr,%g1
+	andcc	%g1,4,%g0
+	bne	L(2)
+	nop
+C **  V1b  **
+	andcc	res_ptr,4,%g0		C res_ptr unaligned? Side effect: cy=0
+	be	L(v1b)			C if no, branch
+	nop
+C Subtract least significant limb separately to align res_ptr and s1_ptr
+	ld	[s2_ptr],%g4
+	add	s2_ptr,4,s2_ptr
+	ld	[s1_ptr],%g2
+	add	s1_ptr,4,s1_ptr
+	add	n,-1,n
+	subcc	%g2,%g4,%o4
+	st	%o4,[res_ptr]
+	add	res_ptr,4,res_ptr
+L(v1b):	addx	%g0,%g0,%o4		C save cy in register
+	cmp	n,2			C if n < 2 ...
+	bl	L(end2)			C ... branch to tail code
+	subcc	%g0,%o4,%g0		C restore cy
+
+	ld	[s2_ptr+0],%g4
+	addcc	n,-10,n
+	ld	[s2_ptr+4],%g1
+	ldd	[s1_ptr+0],%g2
+	blt	L(fin1b)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract blocks of 8 limbs until less than 8 limbs remain
+L(loop1b):
+	subxcc	%g2,%g4,%o4
+	ld	[s2_ptr+8],%g4
+	subxcc	%g3,%g1,%o5
+	ld	[s2_ptr+12],%g1
+	ldd	[s1_ptr+8],%g2
+	std	%o4,[res_ptr+0]
+	subxcc	%g2,%g4,%o4
+	ld	[s2_ptr+16],%g4
+	subxcc	%g3,%g1,%o5
+	ld	[s2_ptr+20],%g1
+	ldd	[s1_ptr+16],%g2
+	std	%o4,[res_ptr+8]
+	subxcc	%g2,%g4,%o4
+	ld	[s2_ptr+24],%g4
+	subxcc	%g3,%g1,%o5
+	ld	[s2_ptr+28],%g1
+	ldd	[s1_ptr+24],%g2
+	std	%o4,[res_ptr+16]
+	subxcc	%g2,%g4,%o4
+	ld	[s2_ptr+32],%g4
+	subxcc	%g3,%g1,%o5
+	ld	[s2_ptr+36],%g1
+	ldd	[s1_ptr+32],%g2
+	std	%o4,[res_ptr+24]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	add	s1_ptr,32,s1_ptr
+	add	s2_ptr,32,s2_ptr
+	add	res_ptr,32,res_ptr
+	bge	L(loop1b)
+	subcc	%g0,%o4,%g0		C restore cy
+
+L(fin1b):
+	addcc	n,8-2,n
+	blt	L(end1b)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract blocks of 2 limbs until less than 2 limbs remain
+L(loope1b):
+	subxcc	%g2,%g4,%o4
+	ld	[s2_ptr+8],%g4
+	subxcc	%g3,%g1,%o5
+	ld	[s2_ptr+12],%g1
+	ldd	[s1_ptr+8],%g2
+	std	%o4,[res_ptr+0]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-2,n
+	add	s1_ptr,8,s1_ptr
+	add	s2_ptr,8,s2_ptr
+	add	res_ptr,8,res_ptr
+	bge	L(loope1b)
+	subcc	%g0,%o4,%g0		C restore cy
+L(end1b):
+	subxcc	%g2,%g4,%o4
+	subxcc	%g3,%g1,%o5
+	std	%o4,[res_ptr+0]
+	addx	%g0,%g0,%o4		C save cy in register
+
+	andcc	n,1,%g0
+	be	L(ret1b)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract last limb
+	ld	[s2_ptr+8],%g4
+	ld	[s1_ptr+8],%g2
+	subxcc	%g2,%g4,%o4
+	st	%o4,[res_ptr+8]
+
+L(ret1b):
+	retl
+	addx	%g0,%g0,%o0		C return carry-out from most sign. limb
+
+C **  V2  **
+C If we come here, the alignment of s1_ptr and res_ptr differs, and so
+C does the alignment of s2_ptr and res_ptr.  Since there are only two
+C alignments we care about, the alignment of s1_ptr and s2_ptr must be
+C the same.
+
+L(2):	cmp	n,1
+	be	L(jone)
+	nop
+	andcc	s1_ptr,4,%g0		C s1_ptr unaligned? Side effect: cy=0
+	be	L(v2)			C if no, branch
+	nop
+C Subtract least significant limb separately to align s1_ptr and s2_ptr
+	ld	[s1_ptr],%g4
+	add	s1_ptr,4,s1_ptr
+	ld	[s2_ptr],%g2
+	add	s2_ptr,4,s2_ptr
+	add	n,-1,n
+	subcc	%g4,%g2,%o4
+	st	%o4,[res_ptr]
+	add	res_ptr,4,res_ptr
+
+L(v2):	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	blt	L(fin2)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract blocks of 8 limbs until less than 8 limbs remain
+L(loop2):
+	ldd	[s1_ptr+0],%g2
+	ldd	[s2_ptr+0],%o4
+	subxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+0]
+	subxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+4]
+	ldd	[s1_ptr+8],%g2
+	ldd	[s2_ptr+8],%o4
+	subxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+8]
+	subxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+12]
+	ldd	[s1_ptr+16],%g2
+	ldd	[s2_ptr+16],%o4
+	subxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+16]
+	subxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+20]
+	ldd	[s1_ptr+24],%g2
+	ldd	[s2_ptr+24],%o4
+	subxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+24]
+	subxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+28]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-8,n
+	add	s1_ptr,32,s1_ptr
+	add	s2_ptr,32,s2_ptr
+	add	res_ptr,32,res_ptr
+	bge	L(loop2)
+	subcc	%g0,%o4,%g0		C restore cy
+
+L(fin2):
+	addcc	n,8-2,n
+	blt	L(end2)
+	subcc	%g0,%o4,%g0		C restore cy
+L(loope2):
+	ldd	[s1_ptr+0],%g2
+	ldd	[s2_ptr+0],%o4
+	subxcc	%g2,%o4,%g2
+	st	%g2,[res_ptr+0]
+	subxcc	%g3,%o5,%g3
+	st	%g3,[res_ptr+4]
+	addx	%g0,%g0,%o4		C save cy in register
+	addcc	n,-2,n
+	add	s1_ptr,8,s1_ptr
+	add	s2_ptr,8,s2_ptr
+	add	res_ptr,8,res_ptr
+	bge	L(loope2)
+	subcc	%g0,%o4,%g0		C restore cy
+L(end2):
+	andcc	n,1,%g0
+	be	L(ret2)
+	subcc	%g0,%o4,%g0		C restore cy
+C Subtract last limb
+L(jone):
+	ld	[s1_ptr],%g4
+	ld	[s2_ptr],%g2
+	subxcc	%g4,%g2,%o4
+	st	%o4,[res_ptr]
+
+L(ret2):
+	retl
+	addx	%g0,%g0,%o0		C return carry-out from most sign. limb
+EPILOGUE(mpn_sub_n)
diff --git a/third_party/gmp/mpn/sparc32/submul_1.asm b/third_party/gmp/mpn/sparc32/submul_1.asm
new file mode 100644
index 0000000..73f9377
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/submul_1.asm
@@ -0,0 +1,155 @@
+dnl  SPARC mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+dnl  the result from a second limb vector.
+
+dnl  Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	o0
+C s1_ptr	o1
+C size		o2
+C s2_limb	o3
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	C Make S1_PTR and RES_PTR point at the end of their blocks
+	C and put (- 4 x SIZE) in index/loop counter.
+	sll	%o2,2,%o2
+	add	%o0,%o2,%o4	C RES_PTR in o4 since o0 is retval
+	add	%o1,%o2,%o1
+	sub	%g0,%o2,%o2
+
+	cmp	%o3,0xfff
+	bgu	L(large)
+	nop
+
+	ld	[%o1+%o2],%o5
+	mov	0,%o0
+	b	L(0)
+	 add	%o4,-4,%o4
+L(loop0):
+	subcc	%o5,%g1,%g1
+	ld	[%o1+%o2],%o5
+	addx	%o0,%g0,%o0
+	st	%g1,[%o4+%o2]
+L(0):	wr	%g0,%o3,%y
+	sra	%o5,31,%g2
+	and	%o3,%g2,%g2
+	andcc	%g1,0,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,%o5,%g1
+	mulscc	%g1,0,%g1
+	sra	%g1,20,%g4
+	sll	%g1,12,%g1
+	rd	%y,%g3
+	srl	%g3,20,%g3
+	or	%g1,%g3,%g1
+
+	addcc	%g1,%o0,%g1
+	addx	%g2,%g4,%o0	C add sign-compensation and cy to hi limb
+	addcc	%o2,4,%o2	C loop counter
+	bne	L(loop0)
+	 ld	[%o4+%o2],%o5
+
+	subcc	%o5,%g1,%g1
+	addx	%o0,%g0,%o0
+	retl
+	st	%g1,[%o4+%o2]
+
+L(large):
+	ld	[%o1+%o2],%o5
+	mov	0,%o0
+	sra	%o3,31,%g4	C g4 = mask of ones iff S2_LIMB < 0
+	b	L(1)
+	 add	%o4,-4,%o4
+L(loop):
+	subcc	%o5,%g3,%g3
+	ld	[%o1+%o2],%o5
+	addx	%o0,%g0,%o0
+	st	%g3,[%o4+%o2]
+L(1):	wr	%g0,%o5,%y
+	and	%o5,%g4,%g2
+	andcc	%g0,%g0,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%o3,%g1
+	mulscc	%g1,%g0,%g1
+	rd	%y,%g3
+	addcc	%g3,%o0,%g3
+	addx	%g2,%g1,%o0
+	addcc	%o2,4,%o2
+	bne	L(loop)
+	 ld	[%o4+%o2],%o5
+
+	subcc	%o5,%g3,%g3
+	addx	%o0,%g0,%o0
+	retl
+	st	%g3,[%o4+%o2]
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/sparc32/udiv.asm b/third_party/gmp/mpn/sparc32/udiv.asm
new file mode 100644
index 0000000..23ab3de
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/udiv.asm
@@ -0,0 +1,167 @@
+dnl  SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl  This is for v7 CPUs with a floating-point unit.
+
+dnl  Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	i0
+C n1		i1
+C n0		i2
+C d		i3
+
+ASM_START()
+
+ifdef(`PIC',
+`	TEXT
+L(getpc):
+	retl
+	nop')
+
+	TEXT
+	ALIGN(8)
+L(C0):	.double	0r4294967296
+L(C1):	.double	0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+	save	%sp,-104,%sp
+	st	%i1,[%fp-8]
+	ld	[%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc):	call	L(getpc)		C put address of this insn in %o7
+	ldd	[%o7+L(C0)-L(pc)],%f8',
+`	sethi	%hi(L(C0)),%o7
+	ldd	[%o7+%lo(L(C0))],%f8')
+
+	fitod	%f10,%f4
+	cmp	%i1,0
+	bge	L(248)
+	mov	%i0,%i5
+	faddd	%f4,%f8,%f4
+L(248):
+	st	%i2,[%fp-8]
+	ld	[%fp-8],%f10
+	fmuld	%f4,%f8,%f6
+	cmp	%i2,0
+	bge	L(249)
+	fitod	%f10,%f2
+	faddd	%f2,%f8,%f2
+L(249):
+	st	%i3,[%fp-8]
+	faddd	%f6,%f2,%f2
+	ld	[%fp-8],%f10
+	cmp	%i3,0
+	bge	L(250)
+	fitod	%f10,%f4
+	faddd	%f4,%f8,%f4
+L(250):
+	fdivd	%f2,%f4,%f2
+
+ifdef(`PIC',
+`	ldd	[%o7+L(C1)-L(pc)],%f4',
+`	sethi	%hi(L(C1)),%o7
+	ldd	[%o7+%lo(L(C1))],%f4')
+
+	fcmped	%f2,%f4
+	nop
+	fbge,a	L(251)
+	fsubd	%f2,%f4,%f2
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	b	L(252)
+	ld	[%fp-8],%i4
+L(251):
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	ld	[%fp-8],%i4
+	sethi	%hi(-2147483648),%g2
+	xor	%i4,%g2,%i4
+L(252):
+	wr	%g0,%i4,%y
+	sra	%i3,31,%g2
+	and	%i4,%g2,%g2
+	andcc	%g0,0,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,%i3,%g1
+	mulscc	%g1,0,%g1
+	add	%g1,%g2,%i0
+	rd	%y,%g3
+	subcc	%i2,%g3,%o7
+	subxcc	%i1,%i0,%g0
+	be	L(253)
+	cmp	%o7,%i3
+
+	add	%i4,-1,%i0
+	add	%o7,%i3,%o7
+	st	%o7,[%i5]
+	ret
+	restore
+L(253):
+	blu	L(246)
+	mov	%i4,%i0
+	add	%i4,1,%i0
+	sub	%o7,%i3,%o7
+L(246):
+	st	%o7,[%i5]
+	ret
+	restore
+EPILOGUE(mpn_udiv_qrnnd)
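
The strategy above: bias each 32-bit value that would read as negative by
2^32 (the L(C0) constant), form (n1*2^32 + n0) / d in double precision,
truncate via fdtoi (using the 2^31 constant L(C1) to stay in signed-int
range), then compute the remainder with integer arithmetic and nudge the
quotient by one if needed.  A hedged C sketch, assuming d is normalized
(top bit set) so the estimate is off by at most one:

  #include <stdint.h>

  /* Sketch of FP-assisted udiv_qrnnd: requires n1 < d for the quotient
     to fit in 32 bits, and d >= 2^31 for the error bound. */
  static uint32_t ref_udiv_qrnnd(uint32_t *rem, uint32_t n1,
                                 uint32_t n0, uint32_t d)
  {
      double nd = (double) n1 * 4294967296.0 + (double) n0;
      uint64_t q = (uint64_t) (nd / (double) d);    /* estimate */
      if (q > 0xffffffff) q = 0xffffffff;           /* clamp */
      uint64_t n = ((uint64_t) n1 << 32) | n0;
      uint64_t r = n - q * d;                       /* may be off by 1 */
      if ((int64_t) r < 0) { q--; r += d; }         /* estimate high */
      else if (r >= d)     { q++; r -= d; }         /* estimate low */
      *rem = (uint32_t) r;
      return (uint32_t) q;
  }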
diff --git a/third_party/gmp/mpn/sparc32/udiv_nfp.asm b/third_party/gmp/mpn/sparc32/udiv_nfp.asm
new file mode 100644
index 0000000..ebbb820
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/udiv_nfp.asm
@@ -0,0 +1,202 @@
+dnl  SPARC v7 __udiv_qrnnd division support, used from longlong.h.
+dnl  This is for v7 CPUs without a floating-point unit.
+
+dnl  Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	o0
+C n1		o1
+C n0		o2
+C d		o3
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	tst	%o3
+	bneg	L(largedivisor)
+	mov	8,%g1
+
+	b	L(p1)
+	addxcc	%o2,%o2,%o2
+
+L(plop):
+	bcc	L(n1)
+	addxcc	%o2,%o2,%o2
+L(p1):	addx	%o1,%o1,%o1
+	subcc	%o1,%o3,%o4
+	bcc	L(n2)
+	addxcc	%o2,%o2,%o2
+L(p2):	addx	%o1,%o1,%o1
+	subcc	%o1,%o3,%o4
+	bcc	L(n3)
+	addxcc	%o2,%o2,%o2
+L(p3):	addx	%o1,%o1,%o1
+	subcc	%o1,%o3,%o4
+	bcc	L(n4)
+	addxcc	%o2,%o2,%o2
+L(p4):	addx	%o1,%o1,%o1
+	addcc	%g1,-1,%g1
+	bne	L(plop)
+	subcc	%o1,%o3,%o4
+	bcc	L(n5)
+	addxcc	%o2,%o2,%o2
+L(p5):	st	%o1,[%o0]
+	retl
+	xnor	%g0,%o2,%o0
+
+L(nlop):
+	bcc	L(p1)
+	addxcc	%o2,%o2,%o2
+L(n1):	addx	%o4,%o4,%o4
+	subcc	%o4,%o3,%o1
+	bcc	L(p2)
+	addxcc	%o2,%o2,%o2
+L(n2):	addx	%o4,%o4,%o4
+	subcc	%o4,%o3,%o1
+	bcc	L(p3)
+	addxcc	%o2,%o2,%o2
+L(n3):	addx	%o4,%o4,%o4
+	subcc	%o4,%o3,%o1
+	bcc	L(p4)
+	addxcc	%o2,%o2,%o2
+L(n4):	addx	%o4,%o4,%o4
+	addcc	%g1,-1,%g1
+	bne	L(nlop)
+	subcc	%o4,%o3,%o1
+	bcc	L(p5)
+	addxcc	%o2,%o2,%o2
+L(n5):	st	%o4,[%o0]
+	retl
+	xnor	%g0,%o2,%o0
+
+L(largedivisor):
+	and	%o2,1,%o5	C %o5 = n0 & 1
+
+	srl	%o2,1,%o2
+	sll	%o1,31,%g2
+	or	%g2,%o2,%o2	C %o2 = lo(n1n0 >> 1)
+	srl	%o1,1,%o1	C %o1 = hi(n1n0 >> 1)
+
+	and	%o3,1,%g2
+	srl	%o3,1,%g3	C %g3 = floor(d / 2)
+	add	%g3,%g2,%g3	C %g3 = ceil(d / 2)
+
+	b	L(Lp1)
+	addxcc	%o2,%o2,%o2
+
+L(Lplop):
+	bcc	L(Ln1)
+	addxcc	%o2,%o2,%o2
+L(Lp1):	addx	%o1,%o1,%o1
+	subcc	%o1,%g3,%o4
+	bcc	L(Ln2)
+	addxcc	%o2,%o2,%o2
+L(Lp2):	addx	%o1,%o1,%o1
+	subcc	%o1,%g3,%o4
+	bcc	L(Ln3)
+	addxcc	%o2,%o2,%o2
+L(Lp3):	addx	%o1,%o1,%o1
+	subcc	%o1,%g3,%o4
+	bcc	L(Ln4)
+	addxcc	%o2,%o2,%o2
+L(Lp4):	addx	%o1,%o1,%o1
+	addcc	%g1,-1,%g1
+	bne	L(Lplop)
+	subcc	%o1,%g3,%o4
+	bcc	L(Ln5)
+	addxcc	%o2,%o2,%o2
+L(Lp5):	add	%o1,%o1,%o1	C << 1
+	tst	%g2
+	bne	L(oddp)
+	add	%o5,%o1,%o1
+	st	%o1,[%o0]
+	retl
+	xnor	%g0,%o2,%o0
+
+L(Lnlop):
+	bcc	L(Lp1)
+	addxcc	%o2,%o2,%o2
+L(Ln1):	addx	%o4,%o4,%o4
+	subcc	%o4,%g3,%o1
+	bcc	L(Lp2)
+	addxcc	%o2,%o2,%o2
+L(Ln2):	addx	%o4,%o4,%o4
+	subcc	%o4,%g3,%o1
+	bcc	L(Lp3)
+	addxcc	%o2,%o2,%o2
+L(Ln3):	addx	%o4,%o4,%o4
+	subcc	%o4,%g3,%o1
+	bcc	L(Lp4)
+	addxcc	%o2,%o2,%o2
+L(Ln4):	addx	%o4,%o4,%o4
+	addcc	%g1,-1,%g1
+	bne	L(Lnlop)
+	subcc	%o4,%g3,%o1
+	bcc	L(Lp5)
+	addxcc	%o2,%o2,%o2
+L(Ln5):	add	%o4,%o4,%o4	C << 1
+	tst	%g2
+	bne	L(oddn)
+	add	%o5,%o4,%o4
+	st	%o4,[%o0]
+	retl
+	xnor	%g0,%o2,%o0
+
+L(oddp):
+	xnor	%g0,%o2,%o2
+	C q' in %o2. r' in %o1
+	addcc	%o1,%o2,%o1
+	bcc	L(Lp6)
+	addx	%o2,0,%o2
+	sub	%o1,%o3,%o1
+L(Lp6):	subcc	%o1,%o3,%g0
+	bcs	L(Lp7)
+	subx	%o2,-1,%o2
+	sub	%o1,%o3,%o1
+L(Lp7):	st	%o1,[%o0]
+	retl
+	mov	%o2,%o0
+
+L(oddn):
+	xnor	%g0,%o2,%o2
+	C q' in %o2. r' in %o4
+	addcc	%o4,%o2,%o4
+	bcc	L(Ln6)
+	addx	%o2,0,%o2
+	sub	%o4,%o3,%o4
+L(Ln6):	subcc	%o4,%o3,%g0
+	bcs	L(Ln7)
+	subx	%o2,-1,%o2
+	sub	%o4,%o3,%o4
+L(Ln7):	st	%o4,[%o0]
+	retl
+	mov	%o2,%o0
+EPILOGUE(mpn_udiv_qrnnd)
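
Without an FPU the routine falls back to one quotient bit per step, four
steps unrolled per loop trip, with the L(p*)/L(n*) label pairs encoding
whether the running remainder is currently held directly or as a
subtracted value (the quotient is accumulated complemented, hence the
final xnor).  The underlying algorithm is plain restoring division; a C
sketch, assuming n1 < d on entry:

  #include <stdint.h>

  /* One quotient bit per iteration; the 33rd remainder bit is tracked
     in `out' so the compare against d is a true 33-bit one. */
  static uint32_t ref_udiv_qrnnd_nofp(uint32_t *rem, uint32_t n1,
                                      uint32_t n0, uint32_t d)
  {
      uint32_t q = 0;
      for (int i = 0; i < 32; i++) {
          uint32_t out = n1 >> 31;          /* bit shifted past 32 */
          n1 = (n1 << 1) | (n0 >> 31);
          n0 <<= 1;
          q <<= 1;
          if (out || n1 >= d) {
              n1 -= d;
              q |= 1;
          }
      }
      *rem = n1;
      return q;
  }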
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/add_n.asm b/third_party/gmp/mpn/sparc32/ultrasparct1/add_n.asm
new file mode 100644
index 0000000..c781596
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/add_n.asm
@@ -0,0 +1,70 @@
+dnl  SPARC T1 32-bit mpn_add_n.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',  %o0)
+define(`ap',  %o1)
+define(`bp',  %o2)
+define(`n',   %o3)
+define(`cy',  %o4)
+
+define(`i',   %o3)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)
+
+ASM_START()
+PROLOGUE(mpn_add_nc)
+	b	L(ent)
+	srl	cy, 0, cy	C strip any bogus high bits
+EPILOGUE()
+
+PROLOGUE(mpn_add_n)
+	mov	0, cy
+L(ent):	srl	n, 0, n		C strip any bogus high bits
+	sll	n, 2, n
+	add	ap, n, ap
+	add	bp, n, bp
+	add	rp, n, rp
+	neg	n, i
+
+L(top):	lduw	[ap+i], %g1
+	lduw	[bp+i], %g2
+	add	%g1, %g2, %g3
+	add	%g3, cy, %g3
+	stw	%g3, [rp+i]
+	add	i, 4, i
+	brnz	i, L(top)
+	srlx	%g3, 32, cy
+
+	retl
+	mov	cy, %o0		C return value
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/addmul_1.asm b/third_party/gmp/mpn/sparc32/ultrasparct1/addmul_1.asm
new file mode 100644
index 0000000..89da186
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/addmul_1.asm
@@ -0,0 +1,90 @@
+dnl  SPARC T1 32-bit mpn_addmul_1.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:       24
+C UltraSPARC T2:       19
+C UltraSPARC T3:       19
+C UltraSPARC T4:       5
+
+C INPUT PARAMETERS
+define(`rp',	`%i0')
+define(`up',	`%i1')
+define(`n',	`%i2')
+define(`v0',	`%i3')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	save	%sp, -96, %sp
+	srl	n, 0, %o4
+	srl	v0, 0, %g1
+	subcc	%o4, 1, %o4
+	be	L(final_one)
+	 clr	%o5
+
+L(top):	lduw	[up+0], %l0
+	lduw	[rp+0], %l2
+	lduw	[up+4], %l1
+	lduw	[rp+4], %l3
+	mulx	%l0, %g1, %g3
+	add	up, 8, up
+	mulx	%l1, %g1, %o3
+	sub	%o4, 2, %o4
+	add	rp, 8, rp
+	add	%l2, %g3, %g3
+	add	%o5, %g3, %g3
+	stw	%g3, [rp-8]
+	srlx	%g3, 32, %o5
+	add	%l3, %o3, %o3
+	add	%o5, %o3, %o3
+	stw	%o3, [rp-4]
+	brgz	%o4, L(top)
+	 srlx	%o3, 32, %o5
+
+	brlz,pt	%o4, L(done)
+	 nop
+
+L(final_one):
+	lduw	[up+0], %l0
+	lduw	[rp+0], %l2
+	mulx	%l0, %g1, %g3
+	add	%l2, %g3, %g3
+	add	%o5, %g3, %g3
+	stw	%g3, [rp+0]
+	srlx	%g3, 32, %o5
+
+L(done):
+	ret
+	 restore %o5, 0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h b/third_party/gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h
new file mode 100644
index 0000000..6f9d5a4
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/gmp-mparam.h
@@ -0,0 +1,153 @@
+/* UltraSPARC T 32-bit gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            3
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        21
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD           35
+
+#define MUL_TOOM22_THRESHOLD                14
+#define MUL_TOOM33_THRESHOLD                98
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               226
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     139
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      98
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     120
+
+#define SQR_BASECASE_THRESHOLD               6
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                110
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                240
+#define SQR_TOOM8_THRESHOLD                333
+
+#define MULMID_TOOM42_THRESHOLD             22
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             280  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    280, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     13, 7}, {      7, 6}, \
+    {     17, 7}, {      9, 6}, {     20, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 9}, {      7, 8}, {     15, 7}, \
+    {     33, 8}, {     19, 7}, {     41, 8}, {     23, 7}, \
+    {     49, 8}, {     27, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     79, 9}, {    159, 8}, {    319,10}, \
+    {     95, 9}, {    191, 8}, {    383,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159, 9}, {    319,10}, {    175,11}, {     95,10}, \
+    {    191, 9}, {    383,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 66
+#define MUL_FFT_THRESHOLD                 3712
+
+#define SQR_FFT_MODF_THRESHOLD             240  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    240, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     20, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 9}, \
+    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
+    {     39, 8}, {     23, 7}, {     47, 8}, {     27, 9}, \
+    {     15, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     63, 8}, \
+    {    127, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    143,10}, {     79, 9}, {    159, 8}, \
+    {    319, 9}, {    175,10}, {     95, 9}, {    191, 8}, \
+    {    383, 9}, {    207,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287,10}, {    159, 9}, \
+    {    319,10}, {    175,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 70
+#define SQR_FFT_THRESHOLD                 2624
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  51
+#define MULLO_MUL_N_THRESHOLD             6633
+
+#define DC_DIV_QR_THRESHOLD                 51
+#define DC_DIVAPPR_Q_THRESHOLD             202
+#define DC_BDIV_QR_THRESHOLD                47
+#define DC_BDIV_Q_THRESHOLD                124
+
+#define INV_MULMOD_BNM1_THRESHOLD           26
+#define INV_NEWTON_THRESHOLD               266
+#define INV_APPR_THRESHOLD                 222
+
+#define BINV_NEWTON_THRESHOLD              296
+#define REDC_1_TO_REDC_N_THRESHOLD          59
+
+#define MU_DIV_QR_THRESHOLD               1334
+#define MU_DIVAPPR_Q_THRESHOLD            1499
+#define MUPI_DIV_QR_THRESHOLD              116
+#define MU_BDIV_QR_THRESHOLD              1057
+#define MU_BDIV_Q_THRESHOLD               1334
+
+#define POWM_SEC_TABLE  6,35,213,724,2618
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                      84
+#define HGCD_APPR_THRESHOLD                101
+#define HGCD_REDUCE_THRESHOLD             1437
+#define GCD_DC_THRESHOLD                   372
+#define GCDEXT_DC_THRESHOLD                253
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               399
+#define SET_STR_PRECOMPUTE_THRESHOLD       885
+
+#define FAC_DSC_THRESHOLD                  179
+#define FAC_ODD_THRESHOLD                   29
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/mul_1.asm b/third_party/gmp/mpn/sparc32/ultrasparct1/mul_1.asm
new file mode 100644
index 0000000..0239cd2
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/mul_1.asm
@@ -0,0 +1,83 @@
+dnl  SPARC T1 32-bit mpn_mul_1.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:       20
+C UltraSPARC T2:       18
+C UltraSPARC T3:       18
+C UltraSPARC T4:       4
+
+C INPUT PARAMETERS
+define(`rp',	`%o0')
+define(`up',	`%o1')
+define(`n',	`%o2')
+define(`v0',	`%o3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	srl	n, 0, n
+	srl	v0, 0, v0
+	subcc	n, 1, n
+	be	L(final_one)
+	 clr	%o5
+
+L(top):	lduw	[up+0], %g1
+	lduw	[up+4], %g2
+	mulx	%g1, v0, %g3
+	add	up, 8, up
+	mulx	%g2, v0, %o4
+	sub	n, 2, n
+	add	rp, 8, rp
+	add	%o5, %g3, %g3
+	stw	%g3, [rp-8]
+	srlx	%g3, 32, %o5
+	add	%o5, %o4, %o4
+	stw	%o4, [rp-4]
+	brgz	n, L(top)
+	 srlx	%o4, 32, %o5
+
+	brlz,pt	n, L(done)
+	 nop
+
+L(final_one):
+	lduw	[up+0], %g1
+	mulx	%g1, v0, %g3
+	add	%o5, %g3, %g3
+	stw	%g3, [rp+0]
+	srlx	%g3, 32, %o5
+
+L(done):
+	retl
+	 mov	%o5, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm b/third_party/gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm
new file mode 100644
index 0000000..3b906ef
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/sqr_diagonal.asm
@@ -0,0 +1,55 @@
+dnl  SPARC T1 32-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',	`%o0')
+define(`up',	`%o1')
+define(`n',	`%o2')
+
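+C Each limb is squared into a full 64-bit product stored as two limbs.
+C Roughly, in C (semantics sketch only):
+C
+C	for (i = 0; i < n; i++) {
+C	  uint64_t p = (uint64_t) up[i] * up[i];
+C	  rp[2*i]     = (uint32_t) p;
+C	  rp[2*i + 1] = (uint32_t) (p >> 32);
+C	}
+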
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+	deccc	n			C n--
+	nop
+
+L(top):	lduw	[up+0], %g1
+	add	up, 4, up		C up++
+	mulx	%g1, %g1, %g3
+	stw	%g3, [rp+0]
+	srlx	%g3, 32, %g4
+	stw	%g4, [rp+4]
+	add	rp, 8, rp		C rp += 2
+	bnz	%icc, L(top)
+	deccc	n			C n--
+
+	retl
+	nop
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/sub_n.asm b/third_party/gmp/mpn/sparc32/ultrasparct1/sub_n.asm
new file mode 100644
index 0000000..946bc3f
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/sub_n.asm
@@ -0,0 +1,70 @@
+dnl  SPARC T1 32-bit mpn_sub_n.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',  %o0)
+define(`ap',  %o1)
+define(`bp',  %o2)
+define(`n',   %o3)
+define(`cy',  %o4)
+
+define(`i',   %o3)
+
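+C The limbs are loaded zero-extended into 64-bit registers, so when the
+C 64-bit subtract underflows the result goes negative and bit 63 serves
+C as the borrow.  Roughly, in C (semantics sketch only):
+C
+C	uint64_t cy = 0;		/* borrow in/out */
+C	for (i = 0; i < n; i++) {
+C	  uint64_t d = (uint64_t) ap[i] - bp[i] - cy;
+C	  rp[i] = (uint32_t) d;
+C	  cy = d >> 63;			/* 1 iff the subtract wrapped */
+C	}
+C	return cy;
+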
+MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(mpn_sub_nc)
+	b	L(ent)
+	srl	cy, 0, cy	C strip any bogus high bits
+EPILOGUE()
+
+PROLOGUE(mpn_sub_n)
+	mov	0, cy
+L(ent):	srl	n, 0, n		C strip any bogus high bits
+	sll	n, 2, n
+	add	ap, n, ap
+	add	bp, n, bp
+	add	rp, n, rp
+	neg	n, i
+
+L(top):	lduw	[ap+i], %g1
+	lduw	[bp+i], %g2
+	sub	%g1, %g2, %g3
+	sub	%g3, cy, %g3
+	stw	%g3, [rp+i]
+	add	i, 4, i
+	brnz	i, L(top)
+	srlx	%g3, 63, cy
+
+	retl
+	mov	cy, %o0		C return value
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/ultrasparct1/submul_1.asm b/third_party/gmp/mpn/sparc32/ultrasparct1/submul_1.asm
new file mode 100644
index 0000000..8920070
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/ultrasparct1/submul_1.asm
@@ -0,0 +1,91 @@
+dnl  SPARC T1 32-bit mpn_submul_1.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:       24
+C UltraSPARC T2:       19
+C UltraSPARC T3:       19
+C UltraSPARC T4:       5
+
+C INPUT PARAMETERS
+define(`rp',	`%i0')
+define(`up',	`%i1')
+define(`n',	`%i2')
+define(`v0',	`%i3')
+
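+C Semantically this computes {rp,n} -= {up,n} * v0 and returns the final
+C borrow/high limb.  Roughly, in C (a sketch of the semantics, not the
+C actual code):
+C
+C	uint64_t cy = 0;
+C	for (i = 0; i < n; i++) {
+C	  uint64_t p = (uint64_t) up[i] * v0 + cy;
+C	  uint32_t lo = (uint32_t) p;
+C	  cy = (p >> 32) + (rp[i] < lo);	/* product high + borrow */
+C	  rp[i] -= lo;
+C	}
+C	return cy;
+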
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	save	%sp, -96, %sp
+	srl	n, 0, %o4
+	srl	v0, 0, %g1
+	subcc	%o4, 1, %o4
+	be	L(final_one)
+	 subcc	%g0, 0, %o5
+
+L(top):	lduw	[up+0], %l0
+	lduw	[rp+0], %l2
+	lduw	[up+4], %l1
+	lduw	[rp+4], %l3
+	mulx	%l0, %g1, %g3
+	add	up, 8, up
+	mulx	%l1, %g1, %o3
+	sub	%o4, 2, %o4
+	add	rp, 8, rp
+	addx	%o5, %g3, %g3
+	srlx	%g3, 32, %o5
+	subcc	%l2, %g3, %g3
+	stw	%g3, [rp-8]
+	addx	%o5, %o3, %o3
+	srlx	%o3, 32, %o5
+	subcc	%l3, %o3, %o3
+	brgz	%o4, L(top)
+	 stw	%o3, [rp-4]
+
+	brlz,pt	%o4, L(done)
+	 nop
+
+L(final_one):
+	lduw	[up+0], %l0
+	lduw	[rp+0], %l2
+	mulx	%l0, %g1, %g3
+	addx	%o5, %g3, %g3
+	srlx	%g3, 32, %o5
+	subcc	%l2, %g3, %g3
+	stw	%g3, [rp+0]
+
+L(done):
+	addx	%o5, 0, %o5
+	ret
+	 restore %o5, 0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc32/umul.asm b/third_party/gmp/mpn/sparc32/umul.asm
new file mode 100644
index 0000000..3a20b95
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/umul.asm
@@ -0,0 +1,77 @@
+dnl  SPARC mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
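+C mpn_umul_ppmm computes the full 64-bit product of two 32-bit limbs for
+C longlong.h: the low half is stored through the pointer argument and the
+C high half is returned.  Roughly, in C (semantics sketch only):
+C
+C	uint32_t mpn_umul_ppmm (uint32_t *lp, uint32_t u, uint32_t v)
+C	{
+C	  uint64_t p = (uint64_t) u * v;
+C	  *lp = (uint32_t) p;		/* low half, stored at [%o0]  */
+C	  return (uint32_t) (p >> 32);	/* high half, returned in %o0 */
+C	}
+C
+C SPARC v7 has no multiply instruction, only the mulscc multiply step.
+C The 32-step mulscc sequence treats its register operand as signed, so
+C with the top bit of v set it computes u*(v - 2^32); the sra/and pair
+C below forms the u that must be added back into the high half.
+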
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	wr	%g0,%o1,%y
+	sra	%o2,31,%g2	C Don't move this insn
+	and	%o1,%g2,%g2	C Don't move this insn
+	andcc	%g0,0,%g1	C Don't move this insn
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,%o2,%g1
+	mulscc	%g1,0,%g1
+	rd	%y,%g3
+	st	%g3,[%o0]
+	retl
+	add	%g1,%g2,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/sparc32/v8/addmul_1.asm b/third_party/gmp/mpn/sparc32/v8/addmul_1.asm
new file mode 100644
index 0000000..0052092
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/addmul_1.asm
@@ -0,0 +1,131 @@
+dnl  SPARC v8 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl  add the result to a second limb vector.
+
+dnl  Copyright 1992-1995, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	o0
+C s1_ptr	o1
+C size		o2
+C s2_limb	o3
+
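+C The size is reduced mod 4 and scaled by 16 (each dispatch entry is four
+C instructions), indexing a jump table that enters the 4-way unrolled
+C loop at the right point for any n, Duff's-device style.  Roughly, in C
+C (dispatch sketch only):
+C
+C	switch (n % 4) {	/* computed as (n << 4) & 0x30 below */
+C	case 0: goto loop00;
+C	case 1: goto loop01;
+C	case 2: goto loop10;
+C	case 3: goto loop11;
+C	}
+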
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	orcc	%g0,%g0,%g2
+	ld	[%o1+0],%o4	C 1
+
+	sll	%o2,4,%g1
+	and	%g1,(4-1)<<4,%g1
+ifdef(`PIC',
+`	mov	%o7,%g4		C Save return address register
+0:	call	1f
+	add	%o7,L(1)-0b,%g3
+1:	mov	%g4,%o7		C Restore return address register
+',
+`	sethi	%hi(L(1)),%g3
+	or	%g3,%lo(L(1)),%g3
+')
+	jmp	%g3+%g1
+	nop
+L(1):
+L(L00):	add	%o0,-4,%o0
+	b	L(loop00)	C 4, 8, 12, ...
+	add	%o1,-4,%o1
+	nop
+L(L01):	b	L(loop01)	C 1, 5, 9, ...
+	nop
+	nop
+	nop
+L(L10):	add	%o0,-12,%o0	C 2, 6, 10, ...
+	b	L(loop10)
+	add	%o1,4,%o1
+	nop
+L(L11):	add	%o0,-8,%o0	C 3, 7, 11, ...
+	b	L(loop11)
+	add	%o1,-8,%o1
+	nop
+
+L(loop):
+	addcc	%g3,%g2,%g3	C 1
+	ld	[%o1+4],%o4	C 2
+	rd	%y,%g2		C 1
+	addx	%g0,%g2,%g2
+	ld	[%o0+0],%g1	C 2
+	addcc	%g1,%g3,%g3
+	st	%g3,[%o0+0]	C 1
+L(loop00):
+	umul	%o4,%o3,%g3	C 2
+	ld	[%o0+4],%g1	C 2
+	addxcc	%g3,%g2,%g3	C 2
+	ld	[%o1+8],%o4	C 3
+	rd	%y,%g2		C 2
+	addx	%g0,%g2,%g2
+	nop
+	addcc	%g1,%g3,%g3
+	st	%g3,[%o0+4]	C 2
+L(loop11):
+	umul	%o4,%o3,%g3	C 3
+	addxcc	%g3,%g2,%g3	C 3
+	ld	[%o1+12],%o4	C 4
+	rd	%y,%g2		C 3
+	add	%o1,16,%o1
+	addx	%g0,%g2,%g2
+	ld	[%o0+8],%g1	C 2
+	addcc	%g1,%g3,%g3
+	st	%g3,[%o0+8]	C 3
+L(loop10):
+	umul	%o4,%o3,%g3	C 4
+	addxcc	%g3,%g2,%g3	C 4
+	ld	[%o1+0],%o4	C 1
+	rd	%y,%g2		C 4
+	addx	%g0,%g2,%g2
+	ld	[%o0+12],%g1	C 2
+	addcc	%g1,%g3,%g3
+	st	%g3,[%o0+12]	C 4
+	add	%o0,16,%o0
+	addx	%g0,%g2,%g2
+L(loop01):
+	addcc	%o2,-4,%o2
+	bg	L(loop)
+	umul	%o4,%o3,%g3	C 1
+
+	addcc	%g3,%g2,%g3	C 4
+	rd	%y,%g2		C 4
+	addx	%g0,%g2,%g2
+	ld	[%o0+0],%g1	C 2
+	addcc	%g1,%g3,%g3
+	st	%g3,[%o0+0]	C 4
+	addx	%g0,%g2,%o0
+
+	retl
+	 nop
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/sparc32/v8/gmp-mparam.h b/third_party/gmp/mpn/sparc32/v8/gmp-mparam.h
new file mode 100644
index 0000000..e57897b
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/gmp-mparam.h
@@ -0,0 +1,73 @@
+/* SPARC v8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2004-02-07, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD             10
+#define MUL_TOOM33_THRESHOLD             65
+
+#define SQR_BASECASE_THRESHOLD            4
+#define SQR_TOOM2_THRESHOLD              18
+#define SQR_TOOM3_THRESHOLD              65
+
+#define DIV_SB_PREINV_THRESHOLD           5
+#define DIV_DC_THRESHOLD                 24
+#define POWM_THRESHOLD                   38
+
+#define HGCD_THRESHOLD                   69
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                498
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           6
+#define DIVREM_1_UNNORM_THRESHOLD        11
+#define MOD_1_NORM_THRESHOLD              5
+#define MOD_1_UNNORM_THRESHOLD            9
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          4
+
+#define GET_STR_DC_THRESHOLD             14
+#define GET_STR_PRECOMPUTE_THRESHOLD     23
+#define SET_STR_THRESHOLD              1679
+
+#define MUL_FFT_TABLE  { 272, 672, 1152, 2560, 10240, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD          264
+#define MUL_FFT_THRESHOLD              1792
+
+#define SQR_FFT_TABLE  { 304, 672, 1152, 3584, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD          264
+#define SQR_FFT_THRESHOLD              1728
diff --git a/third_party/gmp/mpn/sparc32/v8/mul_1.asm b/third_party/gmp/mpn/sparc32/v8/mul_1.asm
new file mode 100644
index 0000000..e26c853
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/mul_1.asm
@@ -0,0 +1,112 @@
+dnl  SPARC v8 mpn_mul_1 -- Multiply a limb vector with a single limb and
+dnl  store the product in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1995, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	o0
+C s1_ptr	o1
+C size		o2
+C s2_limb	o3
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	sll	%o2,4,%g1
+	and	%g1,(4-1)<<4,%g1
+ifdef(`PIC',
+`	mov	%o7,%g4		C Save return address register
+0:	call	1f
+	add	%o7,L(1)-0b,%g3
+1:	mov	%g4,%o7		C Restore return address register
+',
+`	sethi	%hi(L(1)),%g3
+	or	%g3,%lo(L(1)),%g3
+')
+	jmp	%g3+%g1
+	ld	[%o1+0],%o4	C 1
+L(1):
+L(L00):	add	%o0,-4,%o0
+	add	%o1,-4,%o1
+	b	L(loop00)	C 4, 8, 12, ...
+	orcc	%g0,%g0,%g2
+L(L01):	b	L(loop01)	C 1, 5, 9, ...
+	orcc	%g0,%g0,%g2
+	nop
+	nop
+L(L10):	add	%o0,-12,%o0	C 2, 6, 10, ...
+	add	%o1,4,%o1
+	b	L(loop10)
+	orcc	%g0,%g0,%g2
+	nop
+L(L11):	add	%o0,-8,%o0	C 3, 7, 11, ...
+	add	%o1,-8,%o1
+	b	L(loop11)
+	orcc	%g0,%g0,%g2
+
+L(loop):
+	addcc	%g3,%g2,%g3	C 1
+	ld	[%o1+4],%o4	C 2
+	st	%g3,[%o0+0]	C 1
+	rd	%y,%g2		C 1
+L(loop00):
+	umul	%o4,%o3,%g3	C 2
+	addxcc	%g3,%g2,%g3	C 2
+	ld	[%o1+8],%o4	C 3
+	st	%g3,[%o0+4]	C 2
+	rd	%y,%g2		C 2
+L(loop11):
+	umul	%o4,%o3,%g3	C 3
+	addxcc	%g3,%g2,%g3	C 3
+	ld	[%o1+12],%o4	C 4
+	add	%o1,16,%o1
+	st	%g3,[%o0+8]	C 3
+	rd	%y,%g2		C 3
+L(loop10):
+	umul	%o4,%o3,%g3	C 4
+	addxcc	%g3,%g2,%g3	C 4
+	ld	[%o1+0],%o4	C 1
+	st	%g3,[%o0+12]	C 4
+	add	%o0,16,%o0
+	rd	%y,%g2		C 4
+	addx	%g0,%g2,%g2
+L(loop01):
+	addcc	%o2,-4,%o2
+	bg	L(loop)
+	umul	%o4,%o3,%g3	C 1
+
+	addcc	%g3,%g2,%g3	C 4
+	st	%g3,[%o0+0]	C 4
+	rd	%y,%g2		C 4
+
+	retl
+	addx	%g0,%g2,%o0
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/sparc32/v8/submul_1.asm b/third_party/gmp/mpn/sparc32/v8/submul_1.asm
new file mode 100644
index 0000000..187314e
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/submul_1.asm
@@ -0,0 +1,67 @@
+dnl  SPARC v8 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1992-1994, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr	o0
+C s1_ptr	o1
+C size		o2
+C s2_limb	o3
+
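+C The count is negated and scaled to a byte offset, and both pointers are
+C biased past the ends of their vectors, so one register serves as both
+C loop index and termination test while counting up to zero.  Roughly (an
+C indexing sketch only):
+C
+C	up += n;  rp += n;			/* bias the pointers    */
+C	for (i = -(long) n; i != 0; i++)	/* i climbs toward zero */
+C	  { /* combine up[i] and rp[i] */ }
+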
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sub	%g0,%o2,%o2		C negate ...
+	sll	%o2,2,%o2		C ... and scale size
+	sub	%o1,%o2,%o1		C o1 is offset s1_ptr
+	sub	%o0,%o2,%g1		C g1 is offset res_ptr
+
+	mov	0,%o0			C clear cy_limb
+
+L(loop):
+	ld	[%o1+%o2],%o4
+	ld	[%g1+%o2],%g2
+	umul	%o4,%o3,%o5
+	rd	%y,%g3
+	addcc	%o5,%o0,%o5
+	addx	%g3,0,%o0
+	subcc	%g2,%o5,%g2
+	addx	%o0,0,%o0
+	st	%g2,[%g1+%o2]
+
+	addcc	%o2,4,%o2
+	bne	L(loop)
+	 nop
+
+	retl
+	 nop
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h b/third_party/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h
new file mode 100644
index 0000000..1ac9239
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/supersparc/gmp-mparam.h
@@ -0,0 +1,73 @@
+/* SuperSPARC gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2004-02-10, gcc 3.3 */
+
+#define MUL_TOOM22_THRESHOLD             14
+#define MUL_TOOM33_THRESHOLD             81
+
+#define SQR_BASECASE_THRESHOLD            5
+#define SQR_TOOM2_THRESHOLD              28
+#define SQR_TOOM3_THRESHOLD              86
+
+#define DIV_SB_PREINV_THRESHOLD           0  /* always */
+#define DIV_DC_THRESHOLD                 26
+#define POWM_THRESHOLD                   79
+
+#define HGCD_THRESHOLD                   97
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                470
+#define JACOBI_BASE_METHOD                2
+
+#define DIVREM_1_NORM_THRESHOLD           0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD         3
+#define MOD_1_NORM_THRESHOLD              0  /* always */
+#define MOD_1_UNNORM_THRESHOLD            3
+#define USE_PREINV_DIVREM_1               1
+#define USE_PREINV_MOD_1                  1
+#define DIVREM_2_THRESHOLD                0  /* always */
+#define DIVEXACT_1_THRESHOLD              0  /* always */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always */
+
+#define GET_STR_DC_THRESHOLD             19
+#define GET_STR_PRECOMPUTE_THRESHOLD     34
+#define SET_STR_THRESHOLD              3524
+
+#define MUL_FFT_TABLE  { 304, 800, 1408, 3584, 10240, 24576, 0 }
+#define MUL_FFT_MODF_THRESHOLD          264
+#define MUL_FFT_THRESHOLD              2304
+
+#define SQR_FFT_TABLE  { 336, 800, 1408, 3584, 10240, 24576, 0 }
+#define SQR_FFT_MODF_THRESHOLD          280
+#define SQR_FFT_THRESHOLD              2304
diff --git a/third_party/gmp/mpn/sparc32/v8/supersparc/udiv.asm b/third_party/gmp/mpn/sparc32/v8/supersparc/udiv.asm
new file mode 100644
index 0000000..12f66ce
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/supersparc/udiv.asm
@@ -0,0 +1,131 @@
+dnl  SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h.
+dnl  This is for SuperSPARC only, to compensate for its semi-functional
+dnl  udiv instruction.
+
+dnl  Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	i0
+C n1		i1
+C n0		i2
+C d		i3
+
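+C The 64/32-bit division is done in double-precision floating point,
+C which this chip runs much faster than its udiv instruction, and the
+C quotient estimate is then verified with an integer multiply and fixed
+C up if off by one.  Roughly, in C (a sketch of the idea only):
+C
+C	uint64_t n = (uint64_t) n1 << 32 | n0;
+C	double nd = (double) n1 * 4294967296.0 + (double) n0;
+C	uint32_t q = (uint32_t) (nd / (double) d);	/* may be off by 1 */
+C	uint64_t r = n - (uint64_t) q * d;
+C	if ((int64_t) r < 0) { q--; r += d; }	/* estimate one too high */
+C	else if (r >= d)     { q++; r -= d; }	/* estimate one too low  */
+C	*rem_ptr = (uint32_t) r;
+C	return q;
+C
+C The 0r4294967296 and 0r2147483648 constants below compensate for the
+C signed fitod/fdtoi conversions applied to unsigned values.
+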
+ASM_START()
+
+ifdef(`PIC',
+`	TEXT
+L(getpc):
+	retl
+	nop')
+
+	TEXT
+	ALIGN(8)
+L(C0):	.double	0r4294967296
+L(C1):	.double	0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+	save	%sp,-104,%sp
+	st	%i1,[%fp-8]
+	ld	[%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc):	call	L(getpc)		C put address of this insn in %o7
+	ldd	[%o7+L(C0)-L(pc)],%f8',
+`	sethi	%hi(L(C0)),%o7
+	ldd	[%o7+%lo(L(C0))],%f8')
+
+	fitod	%f10,%f4
+	cmp	%i1,0
+	bge	L(248)
+	mov	%i0,%i5
+	faddd	%f4,%f8,%f4
+L(248):
+	st	%i2,[%fp-8]
+	ld	[%fp-8],%f10
+	fmuld	%f4,%f8,%f6
+	cmp	%i2,0
+	bge	L(249)
+	fitod	%f10,%f2
+	faddd	%f2,%f8,%f2
+L(249):
+	st	%i3,[%fp-8]
+	faddd	%f6,%f2,%f2
+	ld	[%fp-8],%f10
+	cmp	%i3,0
+	bge	L(250)
+	fitod	%f10,%f4
+	faddd	%f4,%f8,%f4
+L(250):
+	fdivd	%f2,%f4,%f2
+
+ifdef(`PIC',
+`	ldd	[%o7+L(C1)-L(pc)],%f4',
+`	sethi	%hi(L(C1)),%o7
+	ldd	[%o7+%lo(L(C1))],%f4')
+
+	fcmped	%f2,%f4
+	nop
+	fbge,a	L(251)
+	fsubd	%f2,%f4,%f2
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	b	L(252)
+	ld	[%fp-8],%i4
+L(251):
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	ld	[%fp-8],%i4
+	sethi	%hi(-2147483648),%g2
+	xor	%i4,%g2,%i4
+L(252):
+	umul	%i3,%i4,%g3
+	rd	%y,%i0
+	subcc	%i2,%g3,%o7
+	subxcc	%i1,%i0,%g0
+	be	L(253)
+	cmp	%o7,%i3
+
+	add	%i4,-1,%i0
+	add	%o7,%i3,%o7
+	st	%o7,[%i5]
+	ret
+	restore
+L(253):
+	blu	L(246)
+	mov	%i4,%i0
+	add	%i4,1,%i0
+	sub	%o7,%i3,%o7
+L(246):
+	st	%o7,[%i5]
+	ret
+	restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/third_party/gmp/mpn/sparc32/v8/udiv.asm b/third_party/gmp/mpn/sparc32/v8/udiv.asm
new file mode 100644
index 0000000..12f66ce
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/udiv.asm
@@ -0,0 +1,131 @@
+dnl  SuperSPARC mpn_udiv_qrnnd division support, used from longlong.h.
+dnl  This is for SuperSPARC only, to compensate for its semi-functional
+dnl  udiv instruction.
+
+dnl  Copyright 1993, 1994, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	i0
+C n1		i1
+C n0		i2
+C d		i3
+
+ASM_START()
+
+ifdef(`PIC',
+`	TEXT
+L(getpc):
+	retl
+	nop')
+
+	TEXT
+	ALIGN(8)
+L(C0):	.double	0r4294967296
+L(C1):	.double	0r2147483648
+
+PROLOGUE(mpn_udiv_qrnnd)
+	save	%sp,-104,%sp
+	st	%i1,[%fp-8]
+	ld	[%fp-8],%f10
+
+ifdef(`PIC',
+`L(pc):	call	L(getpc)		C put address of this insn in %o7
+	ldd	[%o7+L(C0)-L(pc)],%f8',
+`	sethi	%hi(L(C0)),%o7
+	ldd	[%o7+%lo(L(C0))],%f8')
+
+	fitod	%f10,%f4
+	cmp	%i1,0
+	bge	L(248)
+	mov	%i0,%i5
+	faddd	%f4,%f8,%f4
+L(248):
+	st	%i2,[%fp-8]
+	ld	[%fp-8],%f10
+	fmuld	%f4,%f8,%f6
+	cmp	%i2,0
+	bge	L(249)
+	fitod	%f10,%f2
+	faddd	%f2,%f8,%f2
+L(249):
+	st	%i3,[%fp-8]
+	faddd	%f6,%f2,%f2
+	ld	[%fp-8],%f10
+	cmp	%i3,0
+	bge	L(250)
+	fitod	%f10,%f4
+	faddd	%f4,%f8,%f4
+L(250):
+	fdivd	%f2,%f4,%f2
+
+ifdef(`PIC',
+`	ldd	[%o7+L(C1)-L(pc)],%f4',
+`	sethi	%hi(L(C1)),%o7
+	ldd	[%o7+%lo(L(C1))],%f4')
+
+	fcmped	%f2,%f4
+	nop
+	fbge,a	L(251)
+	fsubd	%f2,%f4,%f2
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	b	L(252)
+	ld	[%fp-8],%i4
+L(251):
+	fdtoi	%f2,%f2
+	st	%f2,[%fp-8]
+	ld	[%fp-8],%i4
+	sethi	%hi(-2147483648),%g2
+	xor	%i4,%g2,%i4
+L(252):
+	umul	%i3,%i4,%g3
+	rd	%y,%i0
+	subcc	%i2,%g3,%o7
+	subxcc	%i1,%i0,%g0
+	be	L(253)
+	cmp	%o7,%i3
+
+	add	%i4,-1,%i0
+	add	%o7,%i3,%o7
+	st	%o7,[%i5]
+	ret
+	restore
+L(253):
+	blu	L(246)
+	mov	%i4,%i0
+	add	%i4,1,%i0
+	sub	%o7,%i3,%o7
+L(246):
+	st	%o7,[%i5]
+	ret
+	restore
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/third_party/gmp/mpn/sparc32/v8/umul.asm b/third_party/gmp/mpn/sparc32/v8/umul.asm
new file mode 100644
index 0000000..1a2e84b
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v8/umul.asm
@@ -0,0 +1,40 @@
+dnl  SPARC v8 mpn_umul_ppmm -- support for longlong.h for non-gcc.
+
+dnl  Copyright 1995, 1996, 2000 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
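+C The v8 umul instruction forms the full 64-bit product directly, leaving
+C the high 32 bits in the %y register, so the routine is just a multiply,
+C a store of the low half, and a read of %y for the return value.
+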
+ASM_START()
+PROLOGUE(mpn_umul_ppmm)
+	umul	%o1,%o2,%g2
+	st	%g2,[%o0]
+	retl
+	rd	%y,%o0
+EPILOGUE(mpn_umul_ppmm)
diff --git a/third_party/gmp/mpn/sparc32/v9/README b/third_party/gmp/mpn/sparc32/v9/README
new file mode 100644
index 0000000..9b39713
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/README
@@ -0,0 +1,4 @@
+Code for SPARC processors implementing version 9 of the SPARC architecture.
+This code is for systems that don't preserve the full 64-bit contents of
+integer registers at context switch.  For other systems (such as Solaris 7 or
+later) use the code in ../../sparc64.
diff --git a/third_party/gmp/mpn/sparc32/v9/add_n.asm b/third_party/gmp/mpn/sparc32/v9/add_n.asm
new file mode 100644
index 0000000..7bd5974
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/add_n.asm
@@ -0,0 +1,129 @@
+dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl  sum in a third limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)
+define(s1p,%o1)
+define(s2p,%o2)
+define(n,%o3)
+define(cy,%g1)
+
+C This code uses 64-bit operations on `o' and `g' registers.  It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed.  That is indeed what
+C happens at least on Slowaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
+
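+C The limbs are loaded zero-extended, so a single 64-bit add yields the
+C 32-bit sum and its carry in one register.  Roughly, in C (semantics
+C sketch only):
+C
+C	uint64_t cy = 0;
+C	for (i = 0; i < n; i++) {
+C	  uint64_t s = (uint64_t) s1p[i] + s2p[i] + cy;	/* < 2^33 */
+C	  rp[i] = (uint32_t) s;
+C	  cy = s >> 32;
+C	}
+C	return cy;
+C
+C The fitod instructions in the loop are FP-pipe scheduling fillers
+C (compare the fanop in v9/addmul_1.asm), not part of the arithmetic.
+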
+ASM_START()
+PROLOGUE(mpn_add_n)
+	lduw	[s1p+0],%o4
+	lduw	[s2p+0],%o5
+	addcc	n,-2,n
+	bl,pn	%icc,L(end1)
+	lduw	[s1p+4],%g2
+	lduw	[s2p+4],%g3
+	be,pn	%icc,L(end2)
+	mov	0,cy
+
+	.align	16
+L(loop):
+	add	%o4,%o5,%g4
+	add	rp,8,rp
+	lduw	[s1p+8],%o4
+	fitod	%f0,%f2
+C ---
+	add	cy,%g4,%g4
+	addcc	n,-1,n
+	lduw	[s2p+8],%o5
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,32,cy
+	add	s2p,8,s2p
+	stw	%g4,[rp-8]
+	be,pn	%icc,L(exito)+4
+C ---
+	add	%g2,%g3,%g4
+	addcc	n,-1,n
+	lduw	[s1p+12],%g2
+	fitod	%f0,%f2
+C ---
+	add	cy,%g4,%g4
+	add	s1p,8,s1p
+	lduw	[s2p+4],%g3
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,32,cy
+	bne,pt	%icc,L(loop)
+	stw	%g4,[rp-4]
+C ---
+L(exite):
+	add	%o4,%o5,%g4
+	add	cy,%g4,%g4
+	srlx	%g4,32,cy
+	stw	%g4,[rp+0]
+	add	%g2,%g3,%g4
+	add	cy,%g4,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,32,%o0
+
+L(exito):
+	add	%g2,%g3,%g4
+	add	cy,%g4,%g4
+	srlx	%g4,32,cy
+	stw	%g4,[rp-4]
+	add	%o4,%o5,%g4
+	add	cy,%g4,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,32,%o0
+
+L(end1):
+	add	%o4,%o5,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,32,%o0
+
+L(end2):
+	add	%o4,%o5,%g4
+	srlx	%g4,32,cy
+	stw	%g4,[rp+0]
+	add	%g2,%g3,%g4
+	add	cy,%g4,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,32,%o0
+EPILOGUE(mpn_add_n)
diff --git a/third_party/gmp/mpn/sparc32/v9/addmul_1.asm b/third_party/gmp/mpn/sparc32/v9/addmul_1.asm
new file mode 100644
index 0000000..2adf7a8
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/addmul_1.asm
@@ -0,0 +1,306 @@
+dnl  SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl  the result to a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
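+
+C With v = v_hi*2^16 + v_lo, each 48-bit partial product fits exactly in
+C a double (48 < 53 mantissa bits), so no rounding occurs.  Per limb,
+C roughly, in C (a sketch of the arithmetic only):
+C
+C	uint64_t p16 = (uint64_t) up[i] * v_hi;	/* via fmuld, exact */
+C	uint64_t p0  = (uint64_t) up[i] * v_lo;	/* via fmuld, exact */
+C	uint64_t p   = (p16 << 16) + p0 + rp[i] + cy;
+C	rp[i] = (uint32_t) p;
+C	cy    = p >> 32;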
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midways.
+C      Unrolling would allow deeper scheduling which could improve speed for L2
+C      cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
+C      aren't sufficiently apart-scheduled with just two temp areas.
+C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
+C      could save many operations.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6
+	fxtod	%f8, %f8
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2
+
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L1
+	add	%o0, -16, %o0
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	lduw	[%o0], %g5		C read rp[i]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L3
+	add	%o0, -8, %o0
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L4
+	add	%o0, -4, %o0
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	nop
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	nop
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	nop
+	add	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	add	%o1, 4, %o1		C up++
+	stw	%g4, [%o0-4]
+	fanop
+C -- 6
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	add	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+4], %g5		C read rp[i]
+
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	add	%g5, %g4, %g4		C p += rp[i]
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+8], %g5		C read rp[i]
+
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	add	%g5, %g4, %g4		C p += rp[i]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+12], %g5		C read rp[i]
+
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	add	%g5, %g4, %g4		C p += rp[i]
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+16], %g5		C read rp[i]
+
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	add	%g5, %g4, %g4		C p += rp[i]
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	mov	%g3, %o0
+	retl
+	sub	%sp, -FSIZE, %sp
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/sparc32/v9/gmp-mparam.h b/third_party/gmp/mpn/sparc32/v9/gmp-mparam.h
new file mode 100644
index 0000000..f909e2c
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/gmp-mparam.h
@@ -0,0 +1,204 @@
+/* SPARC v9 32-bit gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009-2011, 2014 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-16, gcc 3.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         13
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     32
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                43
+#define MUL_TOOM44_THRESHOLD               126
+#define MUL_TOOM6H_THRESHOLD               161
+#define MUL_TOOM8H_THRESHOLD               208
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      80
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      55
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      72
+
+#define SQR_BASECASE_THRESHOLD               4
+#define SQR_TOOM2_THRESHOLD                 64
+#define SQR_TOOM3_THRESHOLD                 85
+#define SQR_TOOM4_THRESHOLD                152
+#define SQR_TOOM6_THRESHOLD                185
+#define SQR_TOOM8_THRESHOLD                324
+
+#define MULMID_TOOM42_THRESHOLD             64
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             288  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    288, 5}, {      9, 4}, {     19, 5}, {     11, 6}, \
+    {      6, 5}, {     14, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     20, 6}, {     13, 7}, {      7, 6}, \
+    {     16, 7}, {      9, 6}, {     19, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 8}, {     11, 7}, {     23, 9}, \
+    {      7, 8}, {     15, 7}, {     31, 8}, {     19, 7}, \
+    {     39, 8}, {     27, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47,10}, {     31, 9}, {     71, 8}, \
+    {    143, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135, 8}, {    271, 9}, \
+    {    143, 8}, {    287,10}, {     79, 9}, {    175,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    111,11}, \
+    {     63,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    175,11}, {     95,10}, {    191, 9}, {    415, 8}, \
+    {    831,12}, {     63,11}, {    127,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895, 8}, {   1791,12}, {    127,11}, \
+    {    287,10}, {    607, 9}, {   1215, 8}, {   2431,11}, \
+    {    319, 9}, {   1279,11}, {    351,12}, {    191,11}, \
+    {    415,10}, {    831,11}, {    447,10}, {    895, 9}, \
+    {   1791,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    703,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    895,10}, {   1791,11}, {    959,13}, {    255,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    703,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2175,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1407,11}, {   2943,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1151,12}, {   2431,13}, \
+    {   1407,14}, {    767,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 143
+#define MUL_FFT_THRESHOLD                 2240
+
+#define SQR_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    244, 5}, {      8, 4}, {     17, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     20, 7}, {     11, 6}, {     23, 7}, {     13, 8}, \
+    {      7, 7}, {     19, 8}, {     11, 7}, {     25, 9}, \
+    {      7, 8}, {     15, 7}, {     33, 8}, {     19, 7}, \
+    {     39, 8}, {     23, 9}, {     15, 8}, {     39, 9}, \
+    {     23,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     47,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     71, 8}, {    143, 7}, {    287, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    143, 8}, {    287,10}, {     79, 9}, \
+    {    159, 8}, {    319, 9}, {    175, 8}, {    351, 7}, \
+    {    703,10}, {     95, 9}, {    191, 8}, {    383, 9}, \
+    {    207, 8}, {    415, 9}, {    223,11}, {     63,10}, \
+    {    127, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159, 9}, {    319,10}, {    175, 9}, \
+    {    351, 8}, {    703,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415, 8}, {    831,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    351, 9}, {    703, 8}, \
+    {   1407,11}, {    191,10}, {    415, 9}, {    831,11}, \
+    {    223,10}, {    447, 9}, {    895,10}, {    479,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    415,10}, {    831,11}, \
+    {    447,10}, {    895, 9}, {   1791,13}, {    127,12}, \
+    {    255,11}, {    575,12}, {    319,11}, {    703,10}, \
+    {   1407,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    959,10}, {   1919, 9}, {   3839,13}, {    255,12}, \
+    {    575,11}, {   1151,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1599,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2815,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 153
+#define SQR_FFT_THRESHOLD                 2112
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                 144
+#define MULLO_MUL_N_THRESHOLD             4292
+
+#define DC_DIV_QR_THRESHOLD                 74
+#define DC_DIVAPPR_Q_THRESHOLD             406
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                363
+
+#define INV_MULMOD_BNM1_THRESHOLD          108
+#define INV_NEWTON_THRESHOLD               351
+#define INV_APPR_THRESHOLD                 303
+
+#define BINV_NEWTON_THRESHOLD              354
+#define REDC_1_TO_REDC_N_THRESHOLD          61
+
+#define MU_DIV_QR_THRESHOLD                998
+#define MU_DIVAPPR_Q_THRESHOLD            1099
+#define MUPI_DIV_QR_THRESHOLD              118
+#define MU_BDIV_QR_THRESHOLD               807
+#define MU_BDIV_Q_THRESHOLD                979
+
+#define POWM_SEC_TABLE  3,22,127,624,779,2351
+
+#define MATRIX22_STRASSEN_THRESHOLD          7
+#define HGCD_THRESHOLD                      90
+#define HGCD_APPR_THRESHOLD                123
+#define HGCD_REDUCE_THRESHOLD             1494
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                192
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               290
+#define SET_STR_PRECOMPUTE_THRESHOLD       634
+
+#define FAC_DSC_THRESHOLD                  156
+#define FAC_ODD_THRESHOLD                   25
diff --git a/third_party/gmp/mpn/sparc32/v9/mul_1.asm b/third_party/gmp/mpn/sparc32/v9/mul_1.asm
new file mode 100644
index 0000000..40aeffa
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/mul_1.asm
@@ -0,0 +1,287 @@
+dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midways.
+C      Unrolling would allow deeper scheduling which could improve speed for L2
+C      cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
+C      aren't sufficiently apart-scheduled with just two temp areas.
+C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
+C      could save many operations.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6
+	fxtod	%f8, %f8
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2
+
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L1
+	add	%o0, -16, %o0
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	b	.L3
+	add	%o0, -8, %o0
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	b	.L4
+	add	%o0, -4, %o0
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	nop
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	nop
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	srlx	%g4, 32, %g3		C new cy
+	add	%o1, 4, %o1		C up++
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	stw	%g4, [%o0-4]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	mov	%g3, %o0
+	retl
+	sub	%sp, -FSIZE, %sp
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/sparc32/v9/sqr_diagonal.asm b/third_party/gmp/mpn/sparc32/v9/sqr_diagonal.asm
new file mode 100644
index 0000000..e024279
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/sqr_diagonal.asm
@@ -0,0 +1,462 @@
+dnl  SPARC v9 32-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+
+C This code uses a very deep software pipeline, due to the need for moving data
+C back and forth between the integer and floating-point registers.
+C
+C A VIS variant of this code would make the pipeline less deep, since the
+C masking now done in the integer unit could take place in the floating-point
+C unit using the FAND instruction.  It would be possible to save several cycles
+C too.
+C
+C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
+C not much slower from the Ecache.  It would perhaps be possible to shave off
+C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
+C used instructions, since we have 10 memory operations per limb.  But a VIS
+C variant could run three cycles faster than the corresponding non-VIS code.
+
+C This is non-pipelined code showing the algorithm:
+C
+C .Loop:
+C	lduw	[up+0],%g4		C 00000000hhhhllll
+C	sllx	%g4,16,%g3		C 0000hhhhllll0000
+C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+C	stx	%g2,[%fp+80]
+C	ldd	[%fp+80],%f0
+C	fitod	%f0,%f4			C hi16
+C	fitod	%f1,%f6			C lo16
+C	ld	[up+0],%f9
+C	fxtod	%f8,%f2
+C	fmuld	%f2,%f4,%f4
+C	fmuld	%f2,%f6,%f6
+C	fdtox	%f4,%f4
+C	fdtox	%f6,%f6
+C	std	%f4,[%fp-24]
+C	std	%f6,[%fp-16]
+C	ldx	[%fp-24],%g2
+C	ldx	[%fp-16],%g1
+C	sllx	%g2,16,%g2
+C	add	%g2,%g1,%g1
+C	stw	%g1,[rp+0]
+C	srlx	%g1,32,%l0
+C	stw	%l0,[rp+4]
+C	add	up,4,up
+C	subcc	n,1,n
+C	bne,pt	%icc,.Loop
+C	add	rp,8,rp
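+
+C In plain C the whole function amounts to this sketch (illustrative names,
+C assuming 32-bit limbs and <stdint.h>):
+C
+C   void sqr_diagonal (uint32_t *rp, const uint32_t *up, long n)
+C   {
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t p = (uint64_t) up[i] * up[i]; /* 32x32->64 square */
+C         rp[2*i]     = (uint32_t) p;            /* low half */
+C         rp[2*i + 1] = (uint32_t) (p >> 32);    /* high half */
+C       }
+C   }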
+
+define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+.Lnoll:
+	.word	0
+
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp,-256,%sp
+
+ifdef(`PIC',
+`.Lpc:	rd	%pc,%o7
+	ld	[%o7+.Lnoll-.Lpc],%f8',
+`	sethi	%hi(.Lnoll),%g1
+	ld	[%g1+%lo(.Lnoll)],%f8')
+
+	sethi	%hi(0xffff0000),%g5
+	add	%i1,-8,%i1
+
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1		C s1_ptr++
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_1
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	add	%i1,4,%i1		C s1_ptr++
+	stx	%g2,[%fp+80]
+	ld	[%i1],%f9
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	std	%f6,[%fp-16]
+
+	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.L1
+	add	%fp, -40, %l6
+
+.L_grt_1:
+	stx	%g2,[%fp+80]
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1		C s1_ptr++
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_2
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	ld	[%i1],%f9
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+
+	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	b	.L2
+	add	%fp, -24, %l6
+
+.L_grt_2:
+	stx	%g2,[%fp+72]
+	lduw	[%i1+8],%g4
+	ld	[%i1],%f9
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	fxtod	%f8,%f2
+	bne,pt	%icc,.L_grt_3
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	add	%fp, 80, %l3
+	fmuld	%f2,%f6,%f6
+	add	%fp, -24, %l4
+	ldd	[%fp+80],%f0
+	add	%fp, 72, %l5
+	fdtox	%f4,%f4
+	b	.L3
+	add	%fp, -40, %l6
+
+.L_grt_3:
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	bne,pt	%icc,.L_grt_4
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	add	%fp, 72, %l3
+	fmuld	%f2,%f4,%f4
+	add	%fp, -40, %l4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	add	%fp, 80, %l5
+	fdtox	%f4,%f4
+	b	.L4
+	add	%fp, -24, %l6
+
+.L_grt_4:
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-32]
+	be,pn	%icc,.L5
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+
+	b,a	.Loop
+
+	.align	16
+C --- LOOP BEGIN
+.Loop:	nop
+	nop
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-24],%g2		C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-16],%g1		C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+72],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-16]
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+	be,pn	%icc,.Lend
+	fanop
+C ---  LOOP MIDDLE
+	nop
+	nop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-40],%g2		C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-32],%g1		C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%fp+80],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3		C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-32]
+	andn	%g2,%g5,%g2		C 0000hhhh0000llll
+	bne,pt	%icc,.Loop
+	fanop
+C --- LOOP END
+
+.L5:	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.Ltail
+	add	%fp, -40, %l6
+
+.Lend:	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	add	%fp, -24, %l6
+.Ltail:	stx	%g2,[%l3]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	add	%i1,4,%i1		C s1_ptr++
+	ldd	[%l5],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L4:	fdtox	%f6,%f6
+	std	%f4,[%l4]
+	fxtod	%f8,%f2
+	std	%f6,[%l4+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l6],%g2		C p16
+	ldx	[%l6+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	ldd	[%l3],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L3:	fdtox	%f6,%f6
+	std	%f4,[%l6]
+	fxtod	%f8,%f2
+	std	%f6,[%l6+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L2:	fdtox	%f6,%f6
+	std	%f4,[%l4]
+	std	%f6,[%l4+8]
+
+	ldx	[%l6],%g2		C p16
+	ldx	[%l6+8],%g1		C p0
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+.L1:	ldx	[%l4],%g2		C p16
+	ldx	[%l4+8],%g1		C p0
+	sllx	%g2,16,%g2		C align p16
+	add	%i0,8,%i0		C res_ptr++
+	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+	ret
+	restore	%g0,%g0,%o0
+
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/third_party/gmp/mpn/sparc32/v9/sub_n.asm b/third_party/gmp/mpn/sparc32/v9/sub_n.asm
new file mode 100644
index 0000000..636c73b
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/sub_n.asm
@@ -0,0 +1,129 @@
+dnl  SPARC mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(rp,%o0)
+define(s1p,%o1)
+define(s2p,%o2)
+define(n,%o3)
+define(cy,%g1)
+
+C This code uses 64-bit operations on `o' and `g' registers.  It doesn't
+C require that `o' registers' upper 32 bits are preserved by the operating
+C system, but if they are not, they must be zeroed.  That is indeed what
+C happens at least on Solaris 2.5 and 2.6.
+
+C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
+C about 10 cycles/limb from the Ecache.
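+
+C A C sketch of one step of the borrow recurrence (illustrative only; s1 and
+C s2 are 32-bit limbs zero-extended into 64-bit variables, as required above):
+C
+C   uint64_t d = (uint64_t) s1 - s2 - cy; /* may wrap negative */
+C   uint32_t r = (uint32_t) d;            /* difference limb */
+C   cy = d >> 63;                         /* sign bit is the borrow out */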
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	lduw	[s1p+0],%o4
+	lduw	[s2p+0],%o5
+	addcc	n,-2,n
+	bl,pn	%icc,L(end1)
+	lduw	[s1p+4],%g2
+	lduw	[s2p+4],%g3
+	be,pn	%icc,L(end2)
+	mov	0,cy
+
+	.align	16
+L(loop):
+	sub	%o4,%o5,%g4
+	add	rp,8,rp
+	lduw	[s1p+8],%o4
+	fitod	%f0,%f2
+C ---
+	sub	%g4,cy,%g4
+	addcc	n,-1,n
+	lduw	[s2p+8],%o5
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,63,cy
+	add	s2p,8,s2p
+	stw	%g4,[rp-8]
+	be,pn	%icc,L(exito)+4
+C ---
+	sub	%g2,%g3,%g4
+	addcc	n,-1,n
+	lduw	[s1p+12],%g2
+	fitod	%f0,%f2
+C ---
+	sub	%g4,cy,%g4
+	add	s1p,8,s1p
+	lduw	[s2p+4],%g3
+	fitod	%f0,%f2
+C ---
+	srlx	%g4,63,cy
+	bne,pt	%icc,L(loop)
+	stw	%g4,[rp-4]
+C ---
+L(exite):
+	sub	%o4,%o5,%g4
+	sub	%g4,cy,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp+0]
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,63,%o0
+
+L(exito):
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp-4]
+	sub	%o4,%o5,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,63,%o0
+
+L(end1):
+	sub	%o4,%o5,%g4
+	stw	%g4,[rp+0]
+	retl
+	srlx	%g4,63,%o0
+
+L(end2):
+	sub	%o4,%o5,%g4
+	srlx	%g4,63,cy
+	stw	%g4,[rp+0]
+	sub	%g2,%g3,%g4
+	sub	%g4,cy,%g4
+	stw	%g4,[rp+4]
+	retl
+	srlx	%g4,63,%o0
+EPILOGUE(mpn_sub_n)
diff --git a/third_party/gmp/mpn/sparc32/v9/submul_1.asm b/third_party/gmp/mpn/sparc32/v9/submul_1.asm
new file mode 100644
index 0000000..92d0ce7
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/submul_1.asm
@@ -0,0 +1,316 @@
+dnl  SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Algorithm: We use two floating-point multiplies per limb product, with the
+C invariant v operand split into two 16-bit pieces, and the u operand split
+C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
+C the integer unit.
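+
+C Per limb, the function computes the following (a C sketch with illustrative
+C names; cy is the running borrow that is finally returned):
+C
+C   uint64_t p = (uint64_t) up[i] * v + cy;       /* product plus carry-in */
+C   uint64_t d = (uint64_t) rp[i] - (uint32_t) p; /* subtract the low half */
+C   rp[i] = (uint32_t) d;
+C   cy = (p >> 32) + (d >> 63);                   /* high half plus borrow */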
+
+C		   cycles/limb
+C UltraSPARC 1&2:     6.5
+C UltraSPARC 3:	      ?
+
+C Possible optimizations:
+C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
+C      memory bandwidth limited, this could save 1.5 cycles/limb.
+C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
+C      it is very straightforward to unroll, using an exit branch midway.
+C      Unrolling would allow deeper scheduling, which could improve speed for
+C      the L2 cache case.
+C   3. For mpn_mul_1: Use more alternating temp areas.  The std's and ldx's
+C      aren't scheduled far enough apart with just two temp areas.
+C   4. Specialize for particular v values.  If the upper 16 bits of v are
+C      zero, we could save many operations.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+define(`FSIZE',224)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	add	%sp, -FSIZE, %sp
+	sethi	%hi(0xffff), %g1
+	srl	%o3, 16, %g2
+	or	%g1, %lo(0xffff), %g1
+	and	%o3, %g1, %g1
+	stx	%g1, [%sp+104]
+	stx	%g2, [%sp+112]
+	ldd	[%sp+104], %f6
+	ldd	[%sp+112], %f8
+	fxtod	%f6, %f6
+	fxtod	%f8, %f8
+	ld	[%sp+104], %f10		C zero f10
+
+	mov	0, %g3			C cy = 0
+
+define(`fanop', `fitod %f18, %f0')	C  A quasi nop running in the FA pipe
+
+	add	%sp, 160, %o5		C point in scratch area
+	and	%o5, -32, %o5		C align at 0 (mod 32) in scratch area
+
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_two_or_more
+	fxtod	%f10, %f2
+
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L1
+	add	%o0, -16, %o0
+
+	.align	16
+.L_two_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fmuld	%f2, %f8, %f16
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_three_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	std	%f12, [%o5+8]
+	lduw	[%o0], %g5		C read rp[i]
+	ldx	[%o5+16], %g2		C p16
+	ldx	[%o5+24], %g1		C p0
+	b	.L2
+	add	%o0, -12, %o0
+
+	.align	16
+.L_three_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_four_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	std	%f12, [%o5+24]
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L3
+	add	%o0, -8, %o0
+
+	.align	16
+.L_four_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	fdtox	%f4, %f12
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	bne,pt	%icc, .L_five_or_more
+	fxtod	%f10, %f2
+
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	b	.L4
+	add	%o0, -4, %o0
+
+	.align	16
+.L_five_or_more:
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+	ldx	[%o5+16], %g2		C p16
+	fdtox	%f4, %f12
+	ldx	[%o5+24], %g1		C p0
+	std	%f14, [%o5+16]
+	fmuld	%f2, %f8, %f16
+	std	%f12, [%o5+24]
+	fmuld	%f2, %f6, %f4
+	add	%o1, 4, %o1		C up++
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+	b,a	.L5
+
+C BEGIN MAIN LOOP
+	.align 16
+C -- 0
+.Loop:	sub	%g0, %g3, %g3
+	subcc	%o2, 1, %o2
+	ld	[%o1], %f11		C read up[i]
+	fdtox	%f16, %f14
+C -- 1
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	add	%o0, 4, %o0		C rp++
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+C -- 2
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	fanop
+C -- 3
+	nop
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+C -- 4
+	nop
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+C -- 5
+	xor	%o5, 16, %o5		C alternate scratch variables
+	add	%o1, 4, %o1		C up++
+	stw	%g4, [%o0-4]
+	fanop
+C -- 6
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0], %g5		C read rp[i]
+	bne,pt	%icc, .Loop
+	fxtod	%f10, %f2
+C END MAIN LOOP
+
+.L5:	sub	%g0, %g3, %g3
+	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g4, %g3, %g4		C p += cy
+	std	%f14, [%o5+0]
+	fmuld	%f2, %f8, %f16
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	fmuld	%f2, %f6, %f4
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+0]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+4], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L4:	fdtox	%f16, %f14
+	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	fdtox	%f4, %f12
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	std	%f14, [%o5+0]
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	std	%f12, [%o5+8]
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+4]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+8], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L3:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	xor	%o5, 16, %o5
+	stw	%g4, [%o0+8]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+12], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L2:	sllx	%g2, 16, %g4		C (p16 << 16)
+	ldx	[%o5+0], %g2		C p16
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	ldx	[%o5+8], %g1		C p0
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	stw	%g4, [%o0+12]
+	srlx	%g4, 32, %g3		C new cy
+	lduw	[%o0+16], %g5		C read rp[i]
+
+	sub	%g0, %g3, %g3
+.L1:	sllx	%g2, 16, %g4		C (p16 << 16)
+	srl	%g3, 0, %g3		C zero most significant 32 bits
+	add	%g1, %g4, %g4		C p = p0 + (p16 << 16)
+	add	%g3, %g4, %g4		C p += cy
+	sub	%g5, %g4, %g4		C p = rp[i] - p
+	stw	%g4, [%o0+16]
+	srlx	%g4, 32, %g3		C new cy
+
+	sub	%g0, %g3, %o0
+	retl
+	sub	%sp, -FSIZE, %sp
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/sparc32/v9/udiv.asm b/third_party/gmp/mpn/sparc32/v9/udiv.asm
new file mode 100644
index 0000000..61dde97
--- /dev/null
+++ b/third_party/gmp/mpn/sparc32/v9/udiv.asm
@@ -0,0 +1,52 @@
+dnl  SPARC v9 32-bit mpn_udiv_qrnnd - division support for longlong.h.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rem_ptr	o0
+C n1		o1
+C n0		o2
+C d		o3
+
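+C Functionally equivalent C (a sketch with illustrative names; assumes the
+C quotient fits in 32 bits, i.e. n1 < d, as longlong.h callers guarantee):
+C
+C   uint32_t udiv_qrnnd (uint32_t *rem, uint32_t n1, uint32_t n0, uint32_t d)
+C   {
+C     uint64_t n = ((uint64_t) n1 << 32) | n0; /* assemble dividend */
+C     *rem = (uint32_t) (n % d);               /* store remainder */
+C     return (uint32_t) (n / d);               /* return quotient */
+C   }
+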
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	sllx	%o1, 32, %g1		C shift upper dividend limb
+	srl	%o2, 0, %g2		C zero extend lower dividend limb
+	srl	%o3, 0, %g3		C zero extend divisor
+	or	%g2, %g1, %g1		C assemble 64-bit dividend
+	udivx	%g1, %g3, %g1
+	mulx	%g1, %g3, %g4
+	sub	%g2, %g4, %g2
+	st	%g2, [%o0]		C store remainder
+	retl
+	mov	%g1, %o0		C return quotient
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/third_party/gmp/mpn/sparc64/README b/third_party/gmp/mpn/sparc64/README
new file mode 100644
index 0000000..e2c051a
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/README
@@ -0,0 +1,125 @@
+Copyright 1997, 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+This directory contains mpn functions for 64-bit V9 SPARC
+
+RELEVANT OPTIMIZATION ISSUES
+
+Notation:
+  IANY = shift/add/sub/logical/sethi
+  IADDLOG = add/sub/logical/sethi
+  MEM = ld*/st*
+  FA = fadd*/fsub*/f*to*/fmov*
+  FM = fmul*
+
+UltraSPARC can issue four instructions per cycle, with these restrictions:
+* Two IANY instructions, but only one of these may be a shift.  If there is a
+  shift and an IANY instruction, the shift must precede the IANY instruction.
+* One FA.
+* One FM.
+* One branch.
+* One MEM.
+* IANY/IADDLOG/MEM must be insn 1, 2, or 3 in an issue bundle.  Taken branches
+  should not be in slot 4, since that makes the delay insn come from a
+  separate bundle.
+* If two IANY/IADDLOG instructions are to be executed in the same cycle and one
+  of these is setting the condition codes, that instruction must be the second
+  one.
+
+To summarize, ignoring branches, these are the bundles that can reach the peak
+execution speed:
+
+insn1	iany	iany	mem	iany	iany	mem	iany	iany	mem
+insn2	iaddlog	mem	iany	mem	iaddlog	iany	mem	iaddlog	iany
+insn3	mem	iaddlog	iaddlog	fa	fa	fa	fm	fm	fm
+insn4	fa/fm	fa/fm	fa/fm	fm	fm	fm	fa	fa	fa
+
+The 64-bit integer multiply instruction mulx takes from 5 cycles to 35 cycles,
+depending on the position of the most significant bit of the first source
+operand.  When used for 32x32->64 multiplication, it needs 20 cycles.
+Furthermore, it stalls the processor while executing.  We stay away from that
+instruction, and instead use floating-point operations.
+
+Floating-point add and multiply units are fully pipelined.  The latency for
+UltraSPARC-1/2 is 3 cycles and for UltraSPARC-3 it is 4 cycles.
+
+Integer conditional move instructions cannot dual-issue with other integer
+instructions.  No conditional move can issue 1-5 cycles after a load.  (This
+might have been fixed for UltraSPARC-3.)
+
+The UltraSPARC-3 pipeline is very similar to that of UltraSPARC-1/2, but is
+somewhat slower.  Branches execute slower, and there may be other new stalls.
+Integer multiply no longer stalls the entire CPU and has a much lower latency,
+but it's still not pipelined, and thus useless for our needs.
+
+STATUS
+
+* mpn_lshift, mpn_rshift: The current code runs at 2.0 cycles/limb on
+  UltraSPARC-1/2 and 2.65 on UltraSPARC-3.  For UltraSPARC-1/2, the IEU0
+  functional unit is saturated with shifts.
+
+* mpn_add_n, mpn_sub_n: The current code runs at 4 cycles/limb on
+  UltraSPARC-1/2 and 4.5 cycles/limb on UltraSPARC-3.  The 4 instruction
+  recurrency is the speed limiter.
+
+* mpn_addmul_1: The current code runs at 14 cycles/limb asymptotically on
+  UltraSPARC-1/2 and 17.5 cycles/limb on UltraSPARC-3.  On UltraSPARC-1/2, the
+  code sustains 4 instructions/cycle.  It might be possible to invent a better
+  way of summing the intermediate 49-bit operands, but it is unlikely to
+  eliminate enough instructions to save an entire cycle.
+
+  The loads of the u operand are not scheduled far enough ahead of their uses
+  for good L2 cache performance.  The UltraSPARC-1/2 L1 cache is direct
+  mapped, and since we use temporary stack slots that conflict with the u and
+  r operands, we miss to L2 very often.  The load-use distances of the
+  std/ldx pairs via the stack are perhaps over-scheduled.
+
+  It would be possible to save two instructions: (1) The mov could be avoided
+  if the std/ldx were less scheduled.  (2) The ldx of the r operand could be
+  split into two ld instructions, saving the shifts/masks.
+
+  It should be possible to reach 14 cycles/limb for UltraSPARC-3 if the fp
+  operations were rescheduled for this processor's 4-cycle latency.
+
+* mpn_mul_1: The current code is a straightforward edit of the mpn_addmul_1
+  code.  It would be possible to shave one or two cycles from it, with some
+  labour.
+
+* mpn_submul_1: Simpleminded code just calling mpn_mul_1 + mpn_sub_n.  This
+  means that it runs at 18 cycles/limb on UltraSPARC-1/2 and 23 cycles/limb on
+  UltraSPARC-3.  It would be possible to either match the mpn_addmul_1
+  performance, or in the worst case use one more instruction group.
+
+* US1/US2 cache conflict resolving.  The direct mapped L1 data cache of US1/US2
+  is a problem for mul_1, addmul_1 (and a prospective submul_1).  We should
+  allocate a larger stack area, and put the stack temp area in a place that
+  doesn't cause cache conflicts.
diff --git a/third_party/gmp/mpn/sparc64/copyd.asm b/third_party/gmp/mpn/sparc64/copyd.asm
new file mode 100644
index 0000000..ab105d3
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/copyd.asm
@@ -0,0 +1,89 @@
+dnl  SPARC v9 mpn_copyd -- Copy a limb vector, decrementing.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:	 2
+C UltraSPARC 3:		 2.5
+C UltraSPARC T1:	17
+C UltraSPARC T3:	 6
+C UltraSPARC T4/T5:	 2
+
+C INPUT PARAMETERS
+C rptr	%o0
+C sptr	%o1
+C n	%o2
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_copyd)
+	sllx	%o2,3,%g1
+	add	%g1,%o0,%o0
+	add	%g1,%o1,%o1
+	addcc	%o2,-8,%o2
+	bl,pt	%xcc,L(end01234567)
+	nop
+L(loop1):
+	ldx	[%o1-8],%g1
+	ldx	[%o1-16],%g2
+	ldx	[%o1-24],%g3
+	ldx	[%o1-32],%g4
+	ldx	[%o1-40],%g5
+	ldx	[%o1-48],%o3
+	ldx	[%o1-56],%o4
+	ldx	[%o1-64],%o5
+	add	%o1,-64,%o1
+	stx	%g1,[%o0-8]
+	stx	%g2,[%o0-16]
+	stx	%g3,[%o0-24]
+	stx	%g4,[%o0-32]
+	stx	%g5,[%o0-40]
+	stx	%o3,[%o0-48]
+	stx	%o4,[%o0-56]
+	stx	%o5,[%o0-64]
+	addcc	%o2,-8,%o2
+	bge,pt	%xcc,L(loop1)
+	add	%o0,-64,%o0
+L(end01234567):
+	addcc	%o2,8,%o2
+	bz,pn	%xcc,L(end)
+	nop
+L(loop2):
+	ldx	[%o1-8],%g1
+	add	%o1,-8,%o1
+	addcc	%o2,-1,%o2
+	stx	%g1,[%o0-8]
+	bg,pt	%xcc,L(loop2)
+	add	%o0,-8,%o0
+L(end):	retl
+	nop
+EPILOGUE(mpn_copyd)
diff --git a/third_party/gmp/mpn/sparc64/copyi.asm b/third_party/gmp/mpn/sparc64/copyi.asm
new file mode 100644
index 0000000..45663dc
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/copyi.asm
@@ -0,0 +1,86 @@
+dnl  SPARC v9 mpn_copyi -- Copy a limb vector, incrementing.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:	 2
+C UltraSPARC 3:		 2.5
+C UltraSPARC T1:	17
+C UltraSPARC T3:	 6
+C UltraSPARC T4/T5:	 2
+
+C INPUT PARAMETERS
+C rptr	%o0
+C sptr	%o1
+C n	%o2
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_copyi)
+	addcc	%o2,-8,%o2
+	bl,pt	%xcc,L(end01234567)
+	nop
+L(loop1):
+	ldx	[%o1+0],%g1
+	ldx	[%o1+8],%g2
+	ldx	[%o1+16],%g3
+	ldx	[%o1+24],%g4
+	ldx	[%o1+32],%g5
+	ldx	[%o1+40],%o3
+	ldx	[%o1+48],%o4
+	ldx	[%o1+56],%o5
+	add	%o1,64,%o1
+	stx	%g1,[%o0+0]
+	stx	%g2,[%o0+8]
+	stx	%g3,[%o0+16]
+	stx	%g4,[%o0+24]
+	stx	%g5,[%o0+32]
+	stx	%o3,[%o0+40]
+	stx	%o4,[%o0+48]
+	stx	%o5,[%o0+56]
+	addcc	%o2,-8,%o2
+	bge,pt	%xcc,L(loop1)
+	add	%o0,64,%o0
+L(end01234567):
+	addcc	%o2,8,%o2
+	bz,pn	%xcc,L(end)
+	nop
+L(loop2):
+	ldx	[%o1+0],%g1
+	add	%o1,8,%o1
+	addcc	%o2,-1,%o2
+	stx	%g1,[%o0+0]
+	bg,pt	%xcc,L(loop2)
+	add	%o0,8,%o0
+L(end):	retl
+	nop
+EPILOGUE(mpn_copyi)
diff --git a/third_party/gmp/mpn/sparc64/dive_1.c b/third_party/gmp/mpn/sparc64/dive_1.c
new file mode 100644
index 0000000..4264f29
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/dive_1.c
@@ -0,0 +1,161 @@
+/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000, 2001, 2003, 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                 64-bit divisor   32-bit divisor
+                    cycles/limb      cycles/limb
+                     (approx)         (approx)
+   Ultrasparc 2i:      110               70
+*/
+
+
+/* There are two key ideas here to reduce mulx's.  Firstly, when the divisor
+   is 32 bits, the high half of q*d can be calculated without the two
+   32x32->64 cross-products involving the high 32 bits of the divisor, that
+   being zero of course.  Secondly, umul_ppmm_lowequal and
+   umul_ppmm_half_lowequal save one mulx (each) by knowing the low of q*d is
+   equal to the input limb l.
+
+   For size==1, a simple udivx is used.  This is faster than calculating an
+   inverse.
+
+   For a 32-bit divisor and small sizes, an attempt was made at a simple
+   udivx loop (two per 64-bit limb), but it turned out to be slower than
+   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
+   compared to a udivx at 291.  Perhaps the latter would suit when size==2
+   but the high 32 bits of the second limb are zero (saving one udivx), but
+   it doesn't seem worth a special case just for that.  */
+
+void
+mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+  mp_limb_t  inverse, s, s_next, c, l, ls, q;
+  unsigned   rshift, lshift;
+  mp_limb_t  lshift_mask;
+  mp_limb_t  divisor_h;
+
+  ASSERT (size >= 1);
+  ASSERT (divisor != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (divisor);
+
+  s = *src++;                 /* src low limb */
+  size--;
+  if (size == 0)
+    {
+      *dst = s / divisor;
+      return;
+    }
+
+  if ((divisor & 1) == 0)
+    {
+      count_trailing_zeros (rshift, divisor);
+      divisor >>= rshift;
+      lshift = 64 - rshift;
+
+      lshift_mask = MP_LIMB_T_MAX;
+    }
+  else
+    {
+      rshift = 0;
+
+      /* rshift==0 means no shift, so must mask out other part in this case */
+      lshift = 0;
+      lshift_mask = 0;
+    }
+
+  binvert_limb (inverse, divisor);
+
+  c = 0;
+  divisor_h = HIGH32 (divisor);
+
+  if (divisor_h == 0)
+    {
+      /* 32-bit divisor */
+      do
+        {
+          s_next = *src++;
+          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
+          s = s_next;
+
+          SUBC_LIMB (c, l, ls, c);
+
+          q = l * inverse;
+          *dst++ = q;
+
+          umul_ppmm_half_lowequal (l, q, divisor, l);
+          c += l;
+
+          size--;
+        }
+      while (size != 0);
+
+      ls = s >> rshift;
+      l = ls - c;
+      q = l * inverse;
+      *dst = q;
+    }
+  else
+    {
+      /* 64-bit divisor */
+      mp_limb_t  divisor_l = LOW32 (divisor);
+      do
+        {
+          s_next = *src++;
+          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
+          s = s_next;
+
+          SUBC_LIMB (c, l, ls, c);
+
+          q = l * inverse;
+          *dst++ = q;
+
+          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
+          c += l;
+
+          size--;
+        }
+      while (size != 0);
+
+      ls = s >> rshift;
+      l = ls - c;
+      q = l * inverse;
+      *dst = q;
+    }
+}
diff --git a/third_party/gmp/mpn/sparc64/divrem_1.c b/third_party/gmp/mpn/sparc64/divrem_1.c
new file mode 100644
index 0000000..ac94565
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/divrem_1.c
@@ -0,0 +1,242 @@
+/* UltraSparc 64 mpn_divrem_1 -- mpn by limb division.
+
+Copyright 1991, 1993, 1994, 1996, 1998-2001, 2003 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                   64-bit divisor       32-bit divisor
+                       cycles/limb          cycles/limb
+                        (approx)             (approx)
+                   integer  fraction    integer  fraction
+   Ultrasparc 2i:    160      160          122      96
+*/
+
+
+/* 32-bit divisors are treated in special case code.  This requires 4 mulx
+   per limb instead of 8 in the general case.
+
+   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
+   addressing, to get the two halves of each limb read in the correct order.
+   This is kept in an adj variable.  Doing that measures about 4 c/l faster
+   than just writing HALF_ENDIAN_ADJ(i) in the integer loop.  The latter
+   shouldn't be 6 cycles worth of work, but perhaps it doesn't schedule well
+   (on gcc 3.2.1 at least).  The fraction loop doesn't seem affected, but we
+   still use a variable since that ought to work out best.  */
+
+mp_limb_t
+mpn_divrem_1 (mp_ptr qp_limbptr, mp_size_t xsize_limbs,
+              mp_srcptr ap_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
+{
+  mp_size_t  total_size_limbs;
+  mp_size_t  i;
+
+  ASSERT (xsize_limbs >= 0);
+  ASSERT (size_limbs >= 0);
+  ASSERT (d_limb != 0);
+  /* FIXME: What's the correct overlap rule when xsize!=0? */
+  ASSERT (MPN_SAME_OR_SEPARATE_P (qp_limbptr + xsize_limbs,
+                                  ap_limbptr, size_limbs));
+
+  total_size_limbs = size_limbs + xsize_limbs;
+  if (UNLIKELY (total_size_limbs == 0))
+    return 0;
+
+  /* udivx is good for total_size==1, and no need to bother checking
+     limb<divisor, since if that's likely the caller should check */
+  if (UNLIKELY (total_size_limbs == 1))
+    {
+      mp_limb_t  a, q;
+      a = (LIKELY (size_limbs != 0) ? ap_limbptr[0] : 0);
+      q = a / d_limb;
+      qp_limbptr[0] = q;
+      return a - q*d_limb;
+    }
+
+  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
+    {
+      mp_size_t  size, xsize, total_size, adj;
+      unsigned   *qp, n1, n0, q, r, nshift, norm_rmask;
+      mp_limb_t  dinv_limb;
+      const unsigned *ap;
+      int        norm, norm_rshift;
+
+      size = 2 * size_limbs;
+      xsize = 2 * xsize_limbs;
+      total_size = size + xsize;
+
+      ap = (unsigned *) ap_limbptr;
+      qp = (unsigned *) qp_limbptr;
+
+      qp += xsize;
+      r = 0;        /* initial remainder */
+
+      if (LIKELY (size != 0))
+        {
+          n1 = ap[size-1 + HALF_ENDIAN_ADJ(1)];
+
+          /* If the length of the source is uniformly distributed, then
+             there's a 50% chance of the high 32-bits being zero, which we
+             can skip.  */
+          if (n1 == 0)
+            {
+              n1 = ap[size-2 + HALF_ENDIAN_ADJ(0)];
+              total_size--;
+              size--;
+              ASSERT (size > 0);  /* because always even */
+              qp[size + HALF_ENDIAN_ADJ(1)] = 0;
+            }
+
+          /* Skip a division if high < divisor (high quotient 0).  Testing
+             here before normalizing will still skip as often as
+             possible.  */
+          if (n1 < d_limb)
+            {
+              r = n1;
+              size--;
+              qp[size + HALF_ENDIAN_ADJ(size)] = 0;
+              total_size--;
+              if (total_size == 0)
+                return r;
+            }
+        }
+
+      count_leading_zeros_32 (norm, d_limb);
+      norm -= 32;
+      d_limb <<= norm;
+      r <<= norm;
+
+      norm_rshift = 32 - norm;
+      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
+
+      invert_half_limb (dinv_limb, d_limb);
+
+      if (LIKELY (size != 0))
+        {
+          i = size - 1;
+          adj = HALF_ENDIAN_ADJ (i);
+          n1 = ap[i + adj];
+          adj = -adj;
+          r |= ((n1 >> norm_rshift) & norm_rmask);
+          for ( ; i > 0; i--)
+            {
+              n0 = ap[i-1 + adj];
+              adj = -adj;
+              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+              udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
+              qp[i + adj] = q;
+              n1 = n0;
+            }
+          nshift = n1 << norm;
+          udiv_qrnnd_half_preinv (q, r, r, nshift, d_limb, dinv_limb);
+          qp[0 + HALF_ENDIAN_ADJ(0)] = q;
+        }
+      qp -= xsize;
+      adj = HALF_ENDIAN_ADJ (0);
+      for (i = xsize-1; i >= 0; i--)
+        {
+          udiv_qrnnd_half_preinv (q, r, r, 0, d_limb, dinv_limb);
+          adj = -adj;
+          qp[i + adj] = q;
+        }
+
+      return r >> norm;
+    }
+  else
+    {
+      mp_srcptr  ap;
+      mp_ptr     qp;
+      mp_size_t  size, xsize, total_size;
+      mp_limb_t  d, n1, n0, q, r, dinv, nshift, norm_rmask;
+      int        norm, norm_rshift;
+
+      ap = ap_limbptr;
+      qp = qp_limbptr;
+      size = size_limbs;
+      xsize = xsize_limbs;
+      total_size = total_size_limbs;
+      d = d_limb;
+
+      qp += total_size;   /* above high limb */
+      r = 0;              /* initial remainder */
+
+      if (LIKELY (size != 0))
+        {
+          /* Skip a division if high < divisor (high quotient 0).  Testing
+             here before normalizing will still skip as often as
+             possible.  */
+          n1 = ap[size-1];
+          if (n1 < d)
+            {
+              r = n1;
+              *--qp = 0;
+              total_size--;
+              if (total_size == 0)
+                return r;
+              size--;
+            }
+        }
+
+      count_leading_zeros (norm, d);
+      d <<= norm;
+      r <<= norm;
+
+      norm_rshift = GMP_LIMB_BITS - norm;
+      norm_rmask = (norm == 0 ? 0 : ~CNST_LIMB(0));
+
+      invert_limb (dinv, d);
+
+      if (LIKELY (size != 0))
+        {
+          n1 = ap[size-1];
+          r |= ((n1 >> norm_rshift) & norm_rmask);
+          for (i = size-2; i >= 0; i--)
+            {
+              n0 = ap[i];
+              nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+              udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
+              *--qp = q;
+              n1 = n0;
+            }
+          nshift = n1 << norm;
+          udiv_qrnnd_preinv (q, r, r, nshift, d, dinv);
+          *--qp = q;
+        }
+      for (i = 0; i < xsize; i++)
+        {
+          udiv_qrnnd_preinv (q, r, r, CNST_LIMB(0), d, dinv);
+          *--qp = q;
+        }
+      return r >> norm;
+    }
+}
diff --git a/third_party/gmp/mpn/sparc64/gcd_11.asm b/third_party/gmp/mpn/sparc64/gcd_11.asm
new file mode 100644
index 0000000..5564751
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/gcd_11.asm
@@ -0,0 +1,88 @@
+dnl  SPARC64 mpn_gcd_11.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for SPARC by Torbjörn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		  cycles/bit (approx)
+C UltraSPARC 1&2:	 5.1
+C UltraSPARC 3:		 5.0
+C UltraSPARC T1:	11.4
+C UltraSPARC T3:	10
+C UltraSPARC T4:	 6
+C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+	RODATA
+	TYPE(ctz_table,object)
+ctz_table:
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+	SIZE(ctz_table,.-ctz_table)
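+
+C The table-driven loop below is equivalent to this C sketch (illustrative;
+C u and v odd on entry, as mpn_gcd_11 requires):
+C
+C   uint64_t gcd_11 (uint64_t u, uint64_t v)
+C   {
+C     while (u != v)
+C       {
+C         uint64_t d = (u > v) ? u - v : v - u; /* |u - v|, nonzero, even */
+C         if (v > u)
+C           v = u;                              /* v = min(u,v) */
+C         u = d;
+C         do                                    /* strip trailing zeros;  */
+C           u >>= ctz_table[u & MASK];          /* table gives MAXSHIFT   */
+C         while ((u & 1) == 0);                 /* when low bits all zero */
+C       }
+C     return u;
+C   }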
+
+define(`u0',    `%o0')
+define(`v0',    `%o1')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_gcd_11)
+	LEA64(ctz_table, o5, g4)
+	b	L(odd)
+	 mov	u0, %o4
+
+	ALIGN(16)
+L(top):	movcc	%xcc, %o4, v0		C v = min(u,v)
+	movcc	%xcc, %o2, %o0		C u = |v - u|
+L(mid):	ldub	[%o5+%g3], %g5		C
+	brz,a,pn %g3, L(shift_alot)	C
+	 srlx	%o0, MAXSHIFT, %o0
+	srlx	%o0, %g5, %o4		C new u, odd
+L(odd):	subcc	v0, %o4, %o2		C v - u, set flags for branch and movcc
+	sub	%o4, v0, %o0		C u - v
+	bnz,pt	%xcc, L(top)		C
+	 and	%o2, MASK, %g3		C extract low MAXSHIFT bits from (v-u)
+
+	retl
+	 mov	v0, %o0
+
+L(shift_alot):
+	b	L(mid)
+	 and	%o0, MASK, %g3		C
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/gmp-mparam.h b/third_party/gmp/mpn/sparc64/gmp-mparam.h
new file mode 100644
index 0000000..5ac2c46
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/gmp-mparam.h
@@ -0,0 +1,139 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 500 MHz ultrasparc2 running GNU/Linux */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            4
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         22
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        27
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1                  1
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                30
+#define MUL_TOOM33_THRESHOLD               187
+#define MUL_TOOM44_THRESHOLD               278
+#define MUL_TOOM6H_THRESHOLD               278
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     201
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     199
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     107
+
+#define SQR_BASECASE_THRESHOLD              13
+#define SQR_TOOM2_THRESHOLD                 69
+#define SQR_TOOM3_THRESHOLD                116
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                336
+#define SQR_TOOM8_THRESHOLD                454
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             248  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    248, 5}, {      9, 4}, {     19, 6}, {      5, 5}, \
+    {     15, 6}, {      8, 5}, {     17, 6}, {     21, 7}, \
+    {     19, 8}, {     11, 7}, {     25, 8}, {     15, 7}, \
+    {     31, 8}, {     27, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     79,10}, \
+    {     47,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287,10}, \
+    {     79,11}, {     47,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 50
+#define MUL_FFT_THRESHOLD                 1984
+
+#define SQR_FFT_MODF_THRESHOLD             236  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    236, 5}, {      8, 4}, {     17, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     19, 7}, {     10, 6}, \
+    {     21, 7}, {     21, 8}, {     21, 9}, {     11, 8}, \
+    {     23, 9}, {     19, 8}, {     43, 9}, {     23,10}, \
+    {     15, 9}, {     43,10}, {     23,11}, {     15,10}, \
+    {     31, 9}, {     63,10}, {     47, 8}, {    191,11}, \
+    {     31,10}, {     63, 8}, {    255, 7}, {    511, 9}, \
+    {    135, 8}, {    271,10}, {     71, 9}, {    143, 8}, \
+    {    287, 7}, {    575,11}, {     47, 9}, {    191, 8}, \
+    {    383,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 49
+#define SQR_FFT_THRESHOLD                 1120
+
+#define MULLO_BASECASE_THRESHOLD            16
+#define MULLO_DC_THRESHOLD                  41
+#define MULLO_MUL_N_THRESHOLD             3791
+
+#define DC_DIV_QR_THRESHOLD                 27
+#define DC_DIVAPPR_Q_THRESHOLD             100
+#define DC_BDIV_QR_THRESHOLD                47
+#define DC_BDIV_Q_THRESHOLD                174
+
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD                13
+#define INV_APPR_THRESHOLD                   9
+
+#define BINV_NEWTON_THRESHOLD              187
+#define REDC_1_TO_REDC_2_THRESHOLD          10
+#define REDC_2_TO_REDC_N_THRESHOLD         115
+
+#define MU_DIV_QR_THRESHOLD                680
+#define MU_DIVAPPR_Q_THRESHOLD             618
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD               748
+#define MU_BDIV_Q_THRESHOLD                889
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                      53
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                186
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               390
+#define SET_STR_PRECOMPUTE_THRESHOLD      1665
diff --git a/third_party/gmp/mpn/sparc64/lshift.asm b/third_party/gmp/mpn/sparc64/lshift.asm
new file mode 100644
index 0000000..90bbb45
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/lshift.asm
@@ -0,0 +1,140 @@
+dnl  SPARC v9 mpn_lshift
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C UltraSPARC 1&2:	 2
+C UltraSPARC 3:		 2.5
+C UltraSPARC T1:	17.5
+C UltraSPARC T3:	 8
+C UltraSPARC T4:	 3
+
+C INPUT PARAMETERS
+define(`rp',     `%i0')
+define(`up',     `%i1')
+define(`n',      `%i2')
+define(`cnt',    `%i3')
+
+define(`tcnt',   `%i4')
+define(`retval', `%i5')
+define(`u0',     `%l0')
+define(`u1',     `%l1')
+define(`r0',     `%l6')
+define(`r1',     `%l7')
+define(`u0_off', `%o0')
+define(`u1_off', `%o1')
+define(`r0_off', `%o2')
+define(`r1_off', `%o3')
+
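+C The limb count n, scaled to bytes, is used as a single running index
+C against the precomputed bases u0_off/u1_off/r0_off/r1_off, so one subcc
+C of n advances all streams and tests the loop exit condition at once.
+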
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_lshift)
+	save	%sp, -176, %sp
+
+	sllx	n, 3, n
+	sub	%g0, cnt, tcnt
+
+	sub	up, 8, u1_off
+	add	rp, (5 * 8), r1_off
+
+	ldx	[n + u1_off], u1	C WAS: up - 8
+	add	u1_off, (3 * 8), u1_off
+
+	sub	r1_off, 8, r0_off
+	sub	u1_off, 8, u0_off
+
+	subcc	n, (3 * 8), n
+	srlx	u1, tcnt, retval
+
+	bl,pn	%xcc, L(end12)
+	 sllx	u1, cnt, %l3
+
+	ldx	[n + u0_off], u0	C WAS: up - 16
+	subcc	n, (2 * 8), n
+
+	ldx	[n + u1_off], u1	C WAS: up - 24
+
+	bl,pn	%xcc, L(end34)
+	 srlx	u0, tcnt, %l4
+
+	b,a	L(top)
+	ALIGN(16)
+L(top):
+	sllx	u0, cnt, %l2
+	or	%l4, %l3, r0
+
+	ldx	[n + u0_off], u0	C WAS: up - 16
+	srlx	u1, tcnt, %l5
+
+	stx	r0, [n + r0_off]	C WAS: rp - 8
+	subcc	n, (2 * 8), n
+
+	sllx	u1, cnt, %l3
+	or	%l2, %l5, r1
+
+	ldx	[n + u1_off], u1	C WAS: up - 24
+	srlx	u0, tcnt, %l4
+
+	bge,pt	%xcc, L(top)
+	 stx	r1, [n + r1_off]	C WAS: rp - 16
+
+L(end34):
+	sllx	u0, cnt, %l2
+	or	%l4, %l3, r0
+
+	srlx	u1, tcnt, %l5
+	stx	r0, [n + r0_off]	C WAS: rp - 8
+
+	or	%l2, %l5, r1
+	sub	n, (2 * 8), %o5
+
+	sllx	u1, cnt, %l3
+	stx	r1, [%o5 + r1_off]	C WAS: rp - 16
+
+L(end12):
+	andcc	n, 8, %g0
+	bz,pn	%xcc, L(done)
+	 nop
+
+	ldx	[n + u0_off], u1
+	srlx	u1, tcnt, %l4
+	or	%l4, %l3, r0
+	stx	r0, [r0_off - 24]
+	sllx	u1, cnt, %l3
+L(done):
+	stx	%l3, [r0_off - 32]
+
+	ret
+	restore retval, 0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/lshiftc.asm b/third_party/gmp/mpn/sparc64/lshiftc.asm
new file mode 100644
index 0000000..4a0f0a3
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/lshiftc.asm
@@ -0,0 +1,147 @@
+dnl  SPARC v9 mpn_lshiftc
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C UltraSPARC 1&2:	 3
+C UltraSPARC 3:		 3
+C UltraSPARC T1:	17
+C UltraSPARC T3:	10
+C UltraSPARC T4:	 3.5
+
+C INPUT PARAMETERS
+define(`rp',     `%i0')
+define(`up',     `%i1')
+define(`n',      `%i2')
+define(`cnt',    `%i3')
+
+define(`tcnt',   `%i4')
+define(`retval', `%i5')
+define(`u0',     `%l0')
+define(`u1',     `%l1')
+define(`r0',     `%l6')
+define(`r1',     `%l7')
+define(`u0_off', `%o0')
+define(`u1_off', `%o1')
+define(`r0_off', `%o2')
+define(`r1_off', `%o3')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_lshiftc)
+	save	%sp, -176, %sp
+
+	sllx	n, 3, n
+	sub	%g0, cnt, tcnt
+
+	sub	up, 8, u1_off
+	add	rp, (5 * 8), r1_off
+
+	ldx	[n + u1_off], u1	C WAS: up - 8
+	add	u1_off, (3 * 8), u1_off
+
+	sub	r1_off, 8, r0_off
+	sub	u1_off, 8, u0_off
+
+	subcc	n, (3 * 8), n
+	srlx	u1, tcnt, retval
+
+	bl,pn	%xcc, L(end12)
+	 sllx	u1, cnt, %l3
+
+	ldx	[n + u0_off], u0	C WAS: up - 16
+	subcc	n, (2 * 8), n
+
+	ldx	[n + u1_off], u1	C WAS: up - 24
+
+	bl,pn	%xcc, L(end34)
+	 srlx	u0, tcnt, %l4
+
+	b,a	L(top)
+	ALIGN(16)
+L(top):
+	not	%l3, %l3
+	sllx	u0, cnt, %l2
+
+	andn	%l3, %l4, r0
+	ldx	[n + u0_off], u0	C WAS: up - 16
+
+	srlx	u1, tcnt, %l5
+	stx	r0, [n + r0_off]	C WAS: rp - 8
+
+	subcc	n, (2 * 8), n
+	not	%l2, %l2
+
+	sllx	u1, cnt, %l3
+	andn	%l2, %l5, r1
+
+	ldx	[n + u1_off], u1	C WAS: up - 24
+	srlx	u0, tcnt, %l4
+
+	bge,pt	%xcc, L(top)
+	 stx	r1, [n + r1_off]	C WAS: rp - 16
+
+L(end34):
+	not	%l3, %l3
+	sllx	u0, cnt, %l2
+
+	andn	%l3, %l4, r0
+	srlx	u1, tcnt, %l5
+
+	stx	r0, [n + r0_off]	C WAS: rp - 8
+	not	%l2, %l2
+
+	andn	%l2, %l5, r1
+	sub	n, (2 * 8), %o5
+
+	sllx	u1, cnt, %l3
+	stx	r1, [%o5 + r1_off]	C WAS: rp - 16
+
+L(end12):
+	andcc	n, 8, %g0
+	bz	%xcc, L(done)+4
+	 not	%l3, %l3
+
+	ldx	[n + u0_off], u1
+	srlx	u1, tcnt, %l4
+	andn	%l3, %l4, r0
+	stx	r0, [r0_off - 24]
+	sllx	u1, cnt, %l3
+L(done):
+	not	%l3, %l3
+	stx	%l3, [r0_off - 32]
+
+	ret
+	restore retval, 0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/mod_1.c b/third_party/gmp/mpn/sparc64/mod_1.c
new file mode 100644
index 0000000..ab53f9d
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/mod_1.c
@@ -0,0 +1,238 @@
+/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.
+
+Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                 64-bit divisor   32-bit divisor
+                    cycles/limb      cycles/limb
+                     (approx)         (approx)
+   Ultrasparc 2i:      160               120
+*/
+
+
+/* 32-bit divisors are treated in special case code.  This requires 4 mulx
+   per limb instead of 8 in the general case.
+
+   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
+   addressing, to get the two halves of each limb read in the correct order.
+   This is kept in an adj variable.  Doing that measures about 6 c/l faster
+   than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't
+   be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
+   3.2.1 at least).
+
+   A simple udivx/umulx loop for the 32-bit case was attempted for small
+   sizes, but at size==2 it was only about the same speed and at size==3 was
+   slower.  */
+
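+/* Sketch of the adj scheme: i steps down by one, so its parity alternates,
+   and since HALF_ENDIAN_ADJ(i) is +1 for even i and -1 for odd i (0 on
+   little endian), negating adj each iteration keeps adj == HALF_ENDIAN_ADJ(i)
+   without re-evaluating the macro.  */
+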
+static mp_limb_t
+mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
+{
+  int        norm, norm_rshift;
+  mp_limb_t  src_high_limb;
+  mp_size_t  i;
+
+  ASSERT (size_limbs >= 0);
+  ASSERT (d_limb != 0);
+
+  if (UNLIKELY (size_limbs == 0))
+    return 0;
+
+  src_high_limb = src_limbptr[size_limbs-1];
+
+  /* udivx is good for size==1, and there's no need to bother checking
+     limb<divisor; if that case is likely, the caller should check */
+  if (UNLIKELY (size_limbs == 1))
+    return src_high_limb % d_limb;
+
+  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
+    {
+      unsigned   *src, n1, n0, r, dummy_q, nshift, norm_rmask;
+      mp_size_t  size, adj;
+      mp_limb_t  dinv_limb;
+
+      size = 2 * size_limbs;    /* halfwords */
+      src = (unsigned *) src_limbptr;
+
+      /* prospective initial remainder, if < d */
+      r = src_high_limb >> 32;
+
+      /* If the length of the source is uniformly distributed, then there's
+         a 50% chance of the high 32 bits being zero, which we can skip.  */
+      if (r == 0)
+        {
+          r = (unsigned) src_high_limb;
+          size--;
+          ASSERT (size > 0);  /* because always even */
+        }
+
+      /* Skip a division if high < divisor.  Having the test here before
+         normalizing will still skip as often as possible.  */
+      if (r < d_limb)
+        {
+          size--;
+          ASSERT (size > 0);  /* because size==1 handled above */
+        }
+      else
+        r = 0;
+
+      count_leading_zeros_32 (norm, d_limb);
+      norm -= 32;
+      d_limb <<= norm;
+
+      norm_rshift = 32 - norm;
+      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
+      i = size-1;
+      adj = HALF_ENDIAN_ADJ (i);
+      n1 = src [i + adj];
+      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
+
+      invert_half_limb (dinv_limb, d_limb);
+      adj = -adj;
+
+      for (i--; i >= 0; i--)
+        {
+          n0 = src [i + adj];
+          adj = -adj;
+          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
+          n1 = n0;
+        }
+
+      /* same as loop, but without n0 */
+      nshift = n1 << norm;
+      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
+
+      ASSERT ((r & ((1 << norm) - 1)) == 0);
+      return r >> norm;
+    }
+  else
+    {
+      mp_srcptr  src;
+      mp_size_t  size;
+      mp_limb_t  n1, n0, r, dinv, dummy_q, nshift, norm_rmask;
+
+      src = src_limbptr;
+      size = size_limbs;
+      r = src_high_limb;  /* initial remainder */
+
+      /* Skip a division if high < divisor.  Having the test here before
+         normalizing will still skip as often as possible.  */
+      if (r < d_limb)
+        {
+          size--;
+          ASSERT (size > 0);  /* because size==1 handled above */
+        }
+      else
+        r = 0;
+
+      count_leading_zeros (norm, d_limb);
+      d_limb <<= norm;
+
+      norm_rshift = GMP_LIMB_BITS - norm;
+      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
+
+      src += size;
+      n1 = *--src;
+      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);
+
+      invert_limb (dinv, d_limb);
+
+      for (i = size-2; i >= 0; i--)
+        {
+          n0 = *--src;
+          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
+          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
+          n1 = n0;
+        }
+
+      /* same as loop, but without n0 */
+      nshift = n1 << norm;
+      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
+
+      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
+      return r >> norm;
+    }
+}
+
+mp_limb_t
+mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
+{
+  ASSERT (n >= 0);
+  ASSERT (b != 0);
+
+  /* Should this be handled at all?  Rely on callers?  Note n==0 is currently
+     required by mpz/fdiv_r_ui.c and possibly other places.  */
+  if (n == 0)
+    return 0;
+
+  if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
+    {
+      if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
+	{
+	  return mpn_mod_1_anynorm (ap, n, b);
+	}
+      else
+	{
+	  mp_limb_t pre[4];
+	  mpn_mod_1_1p_cps (pre, b);
+	  return mpn_mod_1_1p (ap, n, b, pre);
+	}
+    }
+  else
+    {
+      if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
+	{
+	  return mpn_mod_1_anynorm (ap, n, b);
+	}
+      else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
+	{
+	  mp_limb_t pre[4];
+	  mpn_mod_1_1p_cps (pre, b);
+	  return mpn_mod_1_1p (ap, n, b << pre[1], pre);
+	}
+      else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
+	{
+	  mp_limb_t pre[5];
+	  mpn_mod_1s_2p_cps (pre, b);
+	  return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
+	}
+      else
+	{
+	  mp_limb_t pre[7];
+	  mpn_mod_1s_4p_cps (pre, b);
+	  return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
+	}
+    }
+}
diff --git a/third_party/gmp/mpn/sparc64/mod_1_4.c b/third_party/gmp/mpn/sparc64/mod_1_4.c
new file mode 100644
index 0000000..735a402
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/mod_1_4.c
@@ -0,0 +1,235 @@
+/* mpn_mod_1s_4p (ap, n, b, cps)
+   Divide (ap,n) by b.  Return the single-limb remainder.
+   Requires that b < B / 4.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+   Based on a suggestion by Peter L. Montgomery.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
+   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
+   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
+
+Copyright 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+void
+mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b)
+{
+  mp_limb_t bi;
+  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
+  int cnt;
+
+  ASSERT (b <= (~(mp_limb_t) 0) / 4);
+
+  count_leading_zeros (cnt, b);
+
+  b <<= cnt;
+  invert_limb (bi, b);
+
+  cps[0] = bi;
+  cps[1] = cnt;
+
+  B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
+  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
+  cps[2] = B1modb >> cnt;
+
+  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
+  cps[3] = B2modb >> cnt;
+
+  udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi);
+  cps[4] = B3modb >> cnt;
+
+  udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi);
+  cps[5] = B4modb >> cnt;
+
+  udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi);
+  cps[6] = B5modb >> cnt;
+
+#if WANT_ASSERT
+  {
+    int i;
+    b = cps[2];
+    for (i = 3; i <= 6; i++)
+      {
+	b += cps[i];
+	ASSERT (b >= cps[i]);
+      }
+  }
+#endif
+}
+
+mp_limb_t
+mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7])
+{
+  mp_limb_t rh, rl, bi, ph, pl, ch, cl, r;
+  mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb;
+  mp_size_t i;
+  int cnt;
+
+  ASSERT (n >= 1);
+
+  B1modb = cps[2];
+  B2modb = cps[3];
+  B3modb = cps[4];
+  B4modb = cps[5];
+  B5modb = cps[6];
+
+  if ((b >> 32) == 0)
+    {
+      switch (n & 3)
+	{
+	case 0:
+	  umul_ppmm_s (ph, pl, ap[n - 3], B1modb);
+	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]);
+	  umul_ppmm_s (ch, cl, ap[n - 2], B2modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+	  umul_ppmm_s (rh, rl, ap[n - 1], B3modb);
+	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
+	  n -= 4;
+	  break;
+	case 1:
+	  rh = 0;
+	  rl = ap[n - 1];
+	  n -= 1;
+	  break;
+	case 2:
+	  rh = ap[n - 1];
+	  rl = ap[n - 2];
+	  n -= 2;
+	  break;
+	case 3:
+	  umul_ppmm_s (ph, pl, ap[n - 2], B1modb);
+	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]);
+	  umul_ppmm_s (rh, rl, ap[n - 1], B2modb);
+	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
+	  n -= 3;
+	  break;
+	}
+
+      for (i = n - 4; i >= 0; i -= 4)
+	{
+	  /* rr = ap[i]				< B
+		+ ap[i+1] * (B mod b)		<= (B-1)(b-1)
+		+ ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
+		+ ap[i+3] * (B^3 mod b)		<= (B-1)(b-1)
+		+ LO(rr)  * (B^4 mod b)		<= (B-1)(b-1)
+		+ HI(rr)  * (B^5 mod b)		<= (B-1)(b-1)
+	  */
+	  umul_ppmm_s (ph, pl, ap[i + 1], B1modb);
+	  add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]);
+
+	  umul_ppmm_s (ch, cl, ap[i + 2], B2modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+	  umul_ppmm_s (ch, cl, ap[i + 3], B3modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+	  umul_ppmm_s (ch, cl, rl, B4modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+	  umul_ppmm_s (rh, rl, rh, B5modb);
+	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
+	}
+
+      umul_ppmm_s (rh, cl, rh, B1modb);
+      add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl);
+    }
+  else
+    {
+      switch (n & 3)
+	{
+	case 0:
+	  umul_ppmm (ph, pl, ap[n - 3], B1modb);
+	  add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 4]);
+	  umul_ppmm (ch, cl, ap[n - 2], B2modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+	  umul_ppmm (rh, rl, ap[n - 1], B3modb);
+	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
+	  n -= 4;
+	  break;
+	case 1:
+	  rh = 0;
+	  rl = ap[n - 1];
+	  n -= 1;
+	  break;
+	case 2:
+	  rh = ap[n - 1];
+	  rl = ap[n - 2];
+	  n -= 2;
+	  break;
+	case 3:
+	  umul_ppmm (ph, pl, ap[n - 2], B1modb);
+	  add_ssaaaa (ph, pl, ph, pl, 0, ap[n - 3]);
+	  umul_ppmm (rh, rl, ap[n - 1], B2modb);
+	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
+	  n -= 3;
+	  break;
+	}
+
+      for (i = n - 4; i >= 0; i -= 4)
+	{
+	  /* rr = ap[i]				< B
+		+ ap[i+1] * (B mod b)		<= (B-1)(b-1)
+		+ ap[i+2] * (B^2 mod b)		<= (B-1)(b-1)
+		+ ap[i+3] * (B^3 mod b)		<= (B-1)(b-1)
+		+ LO(rr)  * (B^4 mod b)		<= (B-1)(b-1)
+		+ HI(rr)  * (B^5 mod b)		<= (B-1)(b-1)
+	  */
+	  umul_ppmm (ph, pl, ap[i + 1], B1modb);
+	  add_ssaaaa (ph, pl, ph, pl, 0, ap[i + 0]);
+
+	  umul_ppmm (ch, cl, ap[i + 2], B2modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+	  umul_ppmm (ch, cl, ap[i + 3], B3modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+	  umul_ppmm (ch, cl, rl, B4modb);
+	  add_ssaaaa (ph, pl, ph, pl, ch, cl);
+
+	  umul_ppmm (rh, rl, rh, B5modb);
+	  add_ssaaaa (rh, rl, rh, rl, ph, pl);
+	}
+
+      umul_ppmm (rh, cl, rh, B1modb);
+      add_ssaaaa (rh, rl, rh, rl, 0, cl);
+    }
+
+  bi = cps[0];
+  cnt = cps[1];
+
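+  /* Fold the two-limb residue rh:rl into a single remainder of the
+     normalized divisor (callers pass b already shifted left by cnt), then
+     shift the result back down.  */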
+  r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));
+  udiv_rnnd_preinv (r, r, rl << cnt, b, bi);
+
+  return r >> cnt;
+}
diff --git a/third_party/gmp/mpn/sparc64/mode1o.c b/third_party/gmp/mpn/sparc64/mode1o.c
new file mode 100644
index 0000000..771c999
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/mode1o.c
@@ -0,0 +1,196 @@
+/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                 64-bit divisor   32-bit divisor
+                    cycles/limb      cycles/limb
+                     (approx)         (approx)
+   Ultrasparc 2i:       ?                ?
+*/
+
+
+/* This implementation reduces the number of multiplies done, knowing that
+   on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.
+
+   The key idea is to use the fact that the low limb of q*d equals l, this
+   being the whole purpose of calculating q.  It means there's no need to
+   compute the lowest 32x32->64 part of q*d; instead it can be inferred
+   from l and the other three 32x32->64 parts.  See sparc64.h for details.
+
+   When d is 32 bits, the same applies, but in this case there's only one
+   other 32x32->64 part (i.e. HIGH(q)*d).
+
+   The net effect is that for 64-bit divisor each limb is 4 mulx, or for
+   32-bit divisor each is 2 mulx.
+
+   Enhancements:
+
+   No doubt this could be done in assembler, if that helped the scheduling,
+   or perhaps guaranteed good code irrespective of the compiler.
+
+   Alternatives:
+
+   It might be possible to use floating point.  The loop is dominated by
+   multiply latency, though, so it's not clear floats would improve that.  One
+   possibility would be to take two limbs at a time, with a 128 bit inverse,
+   if there are enough registers, which could effectively use float throughput
+   to reduce total latency across two limbs.  */
+
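+/* In outline (see umul_ppmm_lowequal in sparc64.h): with q = qh:ql and
+   d = dh:dl in 32-bit halves, only qh*dh, ql*dh and qh*dl are formed; the
+   low product ql*dl is never computed, since LOW(q*d) == l already, and its
+   carry into the high limb is recovered by a compare against l:
+
+       pm_l = LOW32 (ql*dh) + LOW32 (qh*dl);
+       h = qh*dh + HIGH32 (ql*dh) + HIGH32 (qh*dl)
+           + HIGH32 (pm_l) + ((pm_l << 32) > l);
+*/
+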
+#define ASSERT_RETVAL(r)                \
+  ASSERT (orig_c < d ? r < d : r <= d)
+
+mp_limb_t
+mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
+{
+  mp_limb_t  c = orig_c;
+  mp_limb_t  s, l, q, h, inverse;
+
+  ASSERT (size >= 1);
+  ASSERT (d & 1);
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (d);
+  ASSERT_LIMB (c);
+
+  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
+  if (size == 1)
+    {
+      s = src[0];
+      if (s > c)
+	{
+	  l = s-c;
+	  h = l % d;
+	  if (h != 0)
+	    h = d - h;
+	}
+      else
+	{
+	  l = c-s;
+	  h = l % d;
+	}
+      return h;
+    }
+
+  binvert_limb (inverse, d);
+
+  if (d <= 0xFFFFFFFF)
+    {
+      s = *src++;
+      size--;
+      do
+        {
+          SUBC_LIMB (c, l, s, c);
+          s = *src++;
+          q = l * inverse;
+          umul_ppmm_half_lowequal (h, q, d, l);
+          c += h;
+          size--;
+        }
+      while (size != 0);
+
+      if (s <= d)
+        {
+          /* With high s <= d the final step can be a subtract and addback.
+             If c==0 then the addback will restore to l>=0.  If c==d then
+             we will get l==d when s==0, but that's ok per the function
+             definition.  */
+
+          l = c - s;
+          l += (l > c ? d : 0);
+
+          ASSERT_RETVAL (l);
+          return l;
+        }
+      else
+        {
+          /* Can't skip a divide, just do the loop code once more. */
+          SUBC_LIMB (c, l, s, c);
+          q = l * inverse;
+          umul_ppmm_half_lowequal (h, q, d, l);
+          c += h;
+
+          ASSERT_RETVAL (c);
+          return c;
+        }
+    }
+  else
+    {
+      mp_limb_t  dl = LOW32 (d);
+      mp_limb_t  dh = HIGH32 (d);
+      long i;
+
+      s = *src++;
+      size--;
+      do
+        {
+          SUBC_LIMB (c, l, s, c);
+          s = *src++;
+          q = l * inverse;
+          umul_ppmm_lowequal (h, q, d, dh, dl, l);
+          c += h;
+          size--;
+        }
+      while (size != 0);
+
+      if (s <= d)
+        {
+          /* With high s <= d the final step can be a subtract and addback.
+             If c==0 then the addback will restore to l>=0.  If c==d then
+             we will get l==d when s==0, but that's ok per the function
+             definition.  */
+
+          l = c - s;
+          l += (l > c ? d : 0);
+
+          ASSERT_RETVAL (l);
+          return l;
+        }
+      else
+        {
+          /* Can't skip a divide, just do the loop code once more. */
+          SUBC_LIMB (c, l, s, c);
+          q = l * inverse;
+          umul_ppmm_lowequal (h, q, d, dh, dl, l);
+          c += h;
+
+          ASSERT_RETVAL (c);
+          return c;
+        }
+    }
+}
diff --git a/third_party/gmp/mpn/sparc64/rshift.asm b/third_party/gmp/mpn/sparc64/rshift.asm
new file mode 100644
index 0000000..3f8e11f
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/rshift.asm
@@ -0,0 +1,142 @@
+dnl  SPARC v9 mpn_rshift
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C UltraSPARC 1&2:	 2
+C UltraSPARC 3:		 2.5
+C UltraSPARC T1:	17.5
+C UltraSPARC T3:	 8
+C UltraSPARC T4:	 3
+
+C INPUT PARAMETERS
+define(`rp',     `%i0')
+define(`up',     `%i1')
+define(`n',      `%i2')
+define(`cnt',    `%i3')
+
+define(`tcnt',   `%i4')
+define(`retval', `%i5')
+define(`u0',     `%l0')
+define(`u1',     `%l1')
+define(`r0',     `%l6')
+define(`r1',     `%l7')
+define(`u0_off', `%o0')
+define(`u1_off', `%o1')
+define(`r0_off', `%o2')
+define(`r1_off', `%o3')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_rshift)
+	save	%sp, -176, %sp
+
+	sllx	n, 3, n
+	sub	%g0, cnt, tcnt
+
+	add	up, n, up
+	add	rp, n, rp
+
+	neg	n, n
+	sub	up, (2 * 8), u0_off
+	sub	rp, (5 * 8), r0_off
+
+	ldx	[n + up], u1		C WAS: up + 0
+	sub	u0_off, (1 * 8), u1_off
+	sub	r0_off, (1 * 8), r1_off
+
+	subcc	n, -(3 * 8), n
+	sllx	u1, tcnt, retval
+
+	bg,pn	%xcc, L(end12)
+	 srlx	u1, cnt, %l3
+
+	ldx	[n + u0_off], u0	C WAS: up + 0
+	subcc	n, -(2 * 8), n
+
+	ldx	[n + u1_off], u1	C WAS: up + 8
+
+	bg,pn	%xcc, L(end34)
+	 sllx	u0, tcnt, %l4
+
+	b,a	L(top)
+	ALIGN(16)
+L(top):
+	srlx	u0, cnt, %l2
+	or	%l3, %l4, r0
+
+	ldx	[n + u0_off], u0	C WAS: up + 0
+	sllx	u1, tcnt, %l5
+
+	stx	r0, [n + r0_off]	C WAS: rp + 0
+	subcc	n, -(2 * 8), n
+
+	srlx	u1, cnt, %l3
+	or	%l2, %l5, r1
+
+	ldx	[n + u1_off], u1	C WAS: up + 8
+	sllx	u0, tcnt, %l4
+
+	ble,pt	%xcc, L(top)
+	 stx	r1, [n + r1_off]	C WAS: rp + 8
+
+L(end34):
+	srlx	u0, cnt, %l2
+	or	%l3, %l4, r0
+
+	sllx	u1, tcnt, %l5
+	stx	r0, [n + r0_off]	C WAS: rp + 0
+
+	or	%l2, %l5, r1
+	sub	n, -(2 * 8), %o5
+
+	srlx	u1, cnt, %l3
+	stx	r1, [%o5 + r1_off]	C WAS: rp + 8
+
+L(end12):
+	andcc	n, 8, %g0
+	bz,pn	%xcc, L(done)
+	 nop
+
+	ldx	[n + u0_off], u1
+	sllx	u1, tcnt, %l4
+	or	%l3, %l4, r0
+	stx	r0, [r0_off + 24]
+	srlx	u1, cnt, %l3
+L(done):
+	stx	%l3, [r0_off + 32]
+
+	ret
+	restore retval, 0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/sec_tabselect.asm b/third_party/gmp/mpn/sparc64/sec_tabselect.asm
new file mode 100644
index 0000000..22e0dc5
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/sec_tabselect.asm
@@ -0,0 +1,162 @@
+dnl  SPARC v9 mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund and David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:	 2 hopefully
+C UltraSPARC 3:		 3
+C UltraSPARC T1:	17
+C UltraSPARC T3:	 ?
+C UltraSPARC T4/T5:	 2.25 hopefully
+
+C INPUT PARAMETERS
+define(`rp',     `%i0')
+define(`tp',     `%i1')
+define(`n',      `%i2')
+define(`nents',  `%i3')
+define(`which',  `%i4')
+
+define(`i',      `%g1')
+define(`j',      `%g3')
+define(`stride', `%g4')
+define(`tporig', `%g5')
+define(`mask',   `%o0')
+
+define(`data0',  `%l0')
+define(`data1',  `%l1')
+define(`data2',  `%l2')
+define(`data3',  `%l3')
+define(`t0',     `%l4')
+define(`t1',     `%l5')
+define(`t2',     `%l6')
+define(`t3',     `%l7')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sec_tabselect)
+	save	%sp, -176, %sp
+
+	sllx	n, 3, stride
+	sub	n, 4, j
+	brlz	j, L(outer_end)
+	 mov	tp, tporig
+
+L(outer_loop):
+	clr	data0
+	clr	data1
+	clr	data2
+	clr	data3
+	mov	tporig, tp
+	mov	nents, i
+	mov	which, %o1
+
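+C All nents table entries are read on every pass, so the memory access
+C pattern is independent of `which'; subcc sets carry exactly when the
+C running counter %o1 is zero, and subc then turns that carry into an
+C all-ones mask for that single entry.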
+L(top):	subcc	%o1, 1, %o1		C set carry iff o1 = 0
+	ldx	[tp + 0], t0
+	subc	%g0, %g0, mask
+	ldx	[tp + 8], t1
+	sub	i, 1, i
+	ldx	[tp + 16], t2
+	ldx	[tp + 24], t3
+	add	tp, stride, tp
+	and	t0, mask, t0
+	and	t1, mask, t1
+	or	t0, data0, data0
+	and	t2, mask, t2
+	or	t1, data1, data1
+	and	t3, mask, t3
+	or	t2, data2, data2
+	brnz	i, L(top)
+	 or	t3, data3, data3
+
+	stx	data0, [rp + 0]
+	subcc	j, 4, j
+	stx	data1, [rp + 8]
+	stx	data2, [rp + 16]
+	stx	data3, [rp + 24]
+	add	tporig, (4 * 8), tporig
+
+	brgez	j, L(outer_loop)
+	 add	rp, (4 * 8), rp
+L(outer_end):
+
+
+	andcc	n, 2, %g0
+	be	L(b0x)
+	 nop
+L(b1x):	clr	data0
+	clr	data1
+	mov	tporig, tp
+	mov	nents, i
+	mov	which, %o1
+
+L(tp2):	subcc	%o1, 1, %o1
+	ldx	[tp + 0], t0
+	subc	%g0, %g0, mask
+	ldx	[tp + 8], t1
+	sub	i, 1, i
+	add	tp, stride, tp
+	and	t0, mask, t0
+	and	t1, mask, t1
+	or	t0, data0, data0
+	brnz	i, L(tp2)
+	 or	t1, data1, data1
+
+	stx	data0, [rp + 0]
+	stx	data1, [rp + 8]
+	add	tporig, (2 * 8), tporig
+	add	rp, (2 * 8), rp
+
+
+L(b0x):	andcc	n, 1, %g0
+	be	L(b00)
+	 nop
+L(b01):	clr	data0
+	mov	tporig, tp
+	mov	nents, i
+	mov	which, %o1
+
+L(tp1):	subcc	%o1, 1, %o1
+	ldx	[tp + 0], t0
+	subc	%g0, %g0, mask
+	sub	i, 1, i
+	add	tp, stride, tp
+	and	t0, mask, t0
+	brnz	i, L(tp1)
+	 or	t0, data0, data0
+
+	stx	data0, [rp + 0]
+
+L(b00):	 ret
+	  restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/sparc64.h b/third_party/gmp/mpn/sparc64/sparc64.h
new file mode 100644
index 0000000..8698a82
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/sparc64.h
@@ -0,0 +1,217 @@
+/* UltraSPARC 64 support macros.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define LOW32(x)   ((x) & 0xFFFFFFFF)
+#define HIGH32(x)  ((x) >> 32)
+
+
+/* Halfword number i in src is accessed as src[i+HALF_ENDIAN_ADJ(i)].
+   Plain src[i] would be incorrect in big endian; HALF_ENDIAN_ADJ has the
+   effect of swapping the two halves in this case.  */
+#if HAVE_LIMB_BIG_ENDIAN
+#define HALF_ENDIAN_ADJ(i)  (1 - (((i) & 1) << 1))   /* +1 even, -1 odd */
+#endif
+#if HAVE_LIMB_LITTLE_ENDIAN
+#define HALF_ENDIAN_ADJ(i)  0                        /* no adjust */
+#endif
+#ifndef HALF_ENDIAN_ADJ
+Error, error, unknown limb endianness;
+#endif
+
+
+/* umul_ppmm_lowequal sets h to the high limb of q*d, assuming the low limb
+   of that product is equal to l.  dh and dl are the 32-bit halves of d.
+
+   |-----high----||----low-----|
+   +------+------+
+   |             |                 ph = qh * dh
+   +------+------+
+          +------+------+
+          |             |          pm1 = ql * dh
+          +------+------+
+          +------+------+
+          |             |          pm2 = qh * dl
+          +------+------+
+                 +------+------+
+                 |             |   pl = ql * dl (not calculated)
+                 +------+------+
+
+   Knowing that the low 64 bits are equal to l means that LOW(pm1) + LOW(pm2)
+   + HIGH(pl) == HIGH(l).  The only thing we need from those product parts
+   is whether they produce a carry into the high.
+
+   pm_l = LOW(pm1)+LOW(pm2) is done to contribute its carry, then the only
+   time there's a further carry from LOW(pm_l)+HIGH(pl) is if LOW(pm_l) >
+   HIGH(l).  pl is never actually calculated.  */
+
+#define umul_ppmm_lowequal(h, q, d, dh, dl, l)  \
+  do {                                          \
+    mp_limb_t  ql, qh, ph, pm1, pm2, pm_l;      \
+    ASSERT (dh == HIGH32(d));                   \
+    ASSERT (dl == LOW32(d));                    \
+    ASSERT (q*d == l);                          \
+                                                \
+    ql = LOW32 (q);                             \
+    qh = HIGH32 (q);                            \
+                                                \
+    pm1 = ql * dh;                              \
+    pm2 = qh * dl;                              \
+    ph  = qh * dh;                              \
+                                                \
+    pm_l = LOW32 (pm1) + LOW32 (pm2);           \
+                                                \
+    (h) = ph + HIGH32 (pm1) + HIGH32 (pm2)      \
+      + HIGH32 (pm_l) + ((pm_l << 32) > l);     \
+                                                \
+    ASSERT_HIGH_PRODUCT (h, q, d);              \
+  } while (0)
+
+
+/* Set h to the high of q*d, assuming the low limb of that product is equal
+   to l, and that d fits in 32-bits.
+
+   |-----high----||----low-----|
+          +------+------+
+          |             |          pm = qh * dl
+          +------+------+
+                 +------+------+
+                 |             |   pl = ql * dl (not calculated)
+                 +------+------+
+
+   Knowing that LOW(pm) + HIGH(pl) == HIGH(l) (mod 2^32) means that the only
+   time there's a carry from that sum is when LOW(pm) > HIGH(l).  There's no
+   need to calculate pl to determine this.  */
+
+#define umul_ppmm_half_lowequal(h, q, d, l)     \
+  do {                                          \
+    mp_limb_t pm;                               \
+    ASSERT (q*d == l);                          \
+    ASSERT (HIGH32(d) == 0);                    \
+                                                \
+    pm = HIGH32(q) * d;                         \
+    (h) = HIGH32(pm) + ((pm << 32) > l);        \
+    ASSERT_HIGH_PRODUCT (h, q, d);              \
+  } while (0)
+
+
+/* check that h is the high limb of x*y */
+#if WANT_ASSERT
+#define ASSERT_HIGH_PRODUCT(h, x, y)    \
+  do {                                  \
+    mp_limb_t  want_h, dummy;           \
+    umul_ppmm (want_h, dummy, x, y);    \
+    ASSERT (h == want_h);               \
+  } while (0)
+#else
+#define ASSERT_HIGH_PRODUCT(h, q, d)    \
+  do { } while (0)
+#endif
+
+
+/* Multiply u and v, where v < 2^32.  */
+#define umul_ppmm_s(w1, w0, u, v)					\
+  do {									\
+    UWtype __x0, __x2;							\
+    UWtype __ul, __vl, __uh;						\
+    UWtype __u = (u), __v = (v);					\
+									\
+    __ul = __ll_lowpart (__u);						\
+    __uh = __ll_highpart (__u);						\
+    __vl = __ll_lowpart (__v);						\
+									\
+    __x0 = (UWtype) __ul * __vl;					\
+    __x2 = (UWtype) __uh * __vl;					\
+									\
+    (w1) = (__x2 + (__x0 >> W_TYPE_SIZE/2)) >> W_TYPE_SIZE/2;		\
+    (w0) = (__x2 << W_TYPE_SIZE/2) + __x0;				\
+  } while (0)
+
+/* Count the leading zeros on a limb, but assuming it fits in 32 bits.
+   The count returned will be in the range 32 to 63.
+   This is the 32-bit generic C count_leading_zeros from longlong.h. */
+#define count_leading_zeros_32(count, x)                                      \
+  do {                                                                        \
+    mp_limb_t  __xr = (x);                                                    \
+    unsigned   __a;                                                           \
+    ASSERT ((x) != 0);                                                        \
+    ASSERT ((x) <= CNST_LIMB(0xFFFFFFFF));                                    \
+    __a = __xr < ((UWtype) 1 << 16) ? (__xr < ((UWtype) 1 << 8) ? 1 : 8 + 1)  \
+      : (__xr < ((UWtype) 1 << 24)  ? 16 + 1 : 24 + 1);                       \
+                                                                              \
+    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];                 \
+  } while (0)
+
+
+/* Set inv to a 32-bit inverse floor((b*(b-d)-1) / d), knowing that d fits
+   32 bits and is normalized (high bit set).  */
+#define invert_half_limb(inv, d)                \
+  do {                                          \
+    mp_limb_t  _n;                              \
+    ASSERT ((d) <= 0xFFFFFFFF);                 \
+    ASSERT ((d) & 0x80000000);                  \
+    _n = (((mp_limb_t) -(d)) << 32) - 1;        \
+    (inv) = (mp_limb_t) (unsigned) (_n / (d));  \
+  } while (0)
+
+
+/* Divide nh:nl by d, setting q to the quotient and r to the remainder.
+   q, r, nh and nl are 32-bits each, d_limb is 32-bits but in an mp_limb_t,
+   dinv_limb is similarly a 32-bit inverse but in an mp_limb_t.  */
+
+#define udiv_qrnnd_half_preinv(q, r, nh, nl, d_limb, dinv_limb)         \
+  do {                                                                  \
+    unsigned   _n2, _n10, _n1, _nadj, _q11n, _xh, _r, _q;               \
+    mp_limb_t  _n, _x;                                                  \
+    ASSERT (d_limb <= 0xFFFFFFFF);                                      \
+    ASSERT (dinv_limb <= 0xFFFFFFFF);                                   \
+    ASSERT (d_limb & 0x80000000);                                       \
+    ASSERT (nh < d_limb);                                               \
+    _n10 = (nl);                                                        \
+    _n2 = (nh);                                                         \
+    _n1 = (int) _n10 >> 31;                                             \
+    _nadj = _n10 + (_n1 & d_limb);                                      \
+    _x = dinv_limb * (_n2 - _n1) + _nadj;                               \
+    _q11n = ~(_n2 + HIGH32 (_x));             /* -q1-1 */               \
+    _n = ((mp_limb_t) _n2 << 32) + _n10;                                \
+    _x = _n + d_limb * _q11n;                 /* n-q1*d-d */            \
+    _xh = HIGH32 (_x) - d_limb;               /* high(n-q1*d-d) */      \
+    ASSERT (_xh == 0 || _xh == ~0);                                     \
+    _r = _x + (d_limb & _xh);                 /* addback */             \
+    _q = _xh - _q11n;                         /* q1+1-addback */        \
+    ASSERT (_r < d_limb);                                               \
+    ASSERT (d_limb * _q + _r == _n);                                    \
+    (r) = _r;                                                           \
+    (q) = _q;                                                           \
+  } while (0)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/add_n.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/add_n.asm
new file mode 100644
index 0000000..92374d2
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/add_n.asm
@@ -0,0 +1,241 @@
+dnl  SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl  store sum in a third limb vector.
+
+dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     4
+C UltraSPARC 3:	      4.5
+
+C Compute carry-out from the most significant bits of u,v, and r, where
+C r=u+v+carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4 insn
+C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...
+
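+C In C terms, the carry recovery below (the andn/and/or/srlx sequence) is
+C
+C	r  = u + v + cy;
+C	cy = ((u & v) | ((u | v) & ~r)) >> 63;
+C
+C i.e. a carry out occurs iff both top bits are set, or at least one is set
+C and the top bit of the sum is clear.
+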
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n',  `%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+define(`v0', `%l1')
+define(`v1', `%l3')
+define(`v2', `%l5')
+define(`v3', `%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0
+	nop
+	b,a	L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_add_n)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0
+	mov	0,cy
+L(com):
+	ldx	[up+0],u0
+	ldx	[vp+0],v0
+	add	up,32,up
+	ldx	[up-24],u1
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	ldx	[up-16],u2
+	ldx	[vp-16],v2
+	ldx	[up-8],u3
+	ldx	[vp-8],v3
+	subcc	n,8,n
+	add	u0,v0,%g1	C main add
+	add	%g1,cy,%g5	C carry add
+	or	u0,v0,%g2
+	bl,pn	%xcc,.Lend4567
+	fanop
+	b,a	.Loop
+
+	.align	16
+C START MAIN LOOP
+.Loop:	andn	%g2,%g5,%g2
+	and	u0,v0,%g3
+	ldx	[up+0],u0
+	fanop
+C --
+	or	%g3,%g2,%g2
+	ldx	[vp+0],v0
+	add	up,32,up
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u1,v1,%g1
+	stx	%g5,[rp+0]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u1,v1,%g2
+	fmnop
+	fanop
+C --
+	andn	%g2,%g5,%g2
+	and	u1,v1,%g3
+	ldx	[up-24],u1
+	fanop
+C --
+	or	%g3,%g2,%g2
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u2,v2,%g1
+	stx	%g5,[rp+8]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u2,v2,%g2
+	fmnop
+	fanop
+C --
+	andn	%g2,%g5,%g2
+	and	u2,v2,%g3
+	ldx	[up-16],u2
+	fanop
+C --
+	or	%g3,%g2,%g2
+	ldx	[vp-16],v2
+	add	rp,32,rp
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u3,v3,%g1
+	stx	%g5,[rp-16]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u3,v3,%g2
+	fmnop
+	fanop
+C --
+	andn	%g2,%g5,%g2
+	and	u3,v3,%g3
+	ldx	[up-8],u3
+	fanop
+C --
+	or	%g3,%g2,%g2
+	subcc	n,4,n
+	ldx	[vp-8],v3
+	fanop
+C --
+	srlx	%g2,63,cy
+	add	u0,v0,%g1
+	stx	%g5,[rp-8]
+	fanop
+C --
+	add	%g1,cy,%g5
+	or	u0,v0,%g2
+	bge,pt	%xcc,.Loop
+	fanop
+C END MAIN LOOP
+.Lend4567:
+	andn	%g2,%g5,%g2
+	and	u0,v0,%g3
+	or	%g3,%g2,%g2
+	srlx	%g2,63,cy
+	add	u1,v1,%g1
+	stx	%g5,[rp+0]
+	add	%g1,cy,%g5
+	or	u1,v1,%g2
+	andn	%g2,%g5,%g2
+	and	u1,v1,%g3
+	or	%g3,%g2,%g2
+	srlx	%g2,63,cy
+	add	u2,v2,%g1
+	stx	%g5,[rp+8]
+	add	%g1,cy,%g5
+	or	u2,v2,%g2
+	andn	%g2,%g5,%g2
+	and	u2,v2,%g3
+	or	%g3,%g2,%g2
+	add	rp,32,rp
+	srlx	%g2,63,cy
+	add	u3,v3,%g1
+	stx	%g5,[rp-16]
+	add	%g1,cy,%g5
+	or	u3,v3,%g2
+	andn	%g2,%g5,%g2
+	and	u3,v3,%g3
+	or	%g3,%g2,%g2
+	srlx	%g2,63,cy
+	stx	%g5,[rp-8]
+
+	addcc	n,4,n
+	bz,pn	%xcc,.Lret
+	fanop
+
+.Loop0:	ldx	[up],u0
+	add	up,8,up
+	ldx	[vp],v0
+	add	vp,8,vp
+	add	rp,8,rp
+	subcc	n,1,n
+	add	u0,v0,%g1
+	or	u0,v0,%g2
+	add	%g1,cy,%g5
+	and	u0,v0,%g3
+	andn	%g2,%g5,%g2
+	stx	%g5,[rp-8]
+	or	%g3,%g2,%g2
+	bnz,pt	%xcc,.Loop0
+	srlx	%g2,63,cy
+
+.Lret:	mov	cy,%i0
+	ret
+	restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/addmul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/addmul_1.asm
new file mode 100644
index 0000000..48a9414
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/addmul_1.asm
@@ -0,0 +1,606 @@
+dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
+dnl  the result to a second limb vector.
+
+dnl  Copyright 1998, 2000-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     14
+C UltraSPARC 3:	      17.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the up operand split
+C into 32-bit pieces.  We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
+
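+C As a sketch of the splitting: with v = v48*2^48 + v32*2^32 + v16*2^16 + v00
+C and up[i] = u32*2^32 + u00, each of the eight products u{00,32}*v{00,16,32,48}
+C fits in 48 bits, so it is exact in a double.  The faddd pairs share a weight:
+C	a32 = u00*v32 + u32*v00		(weight 2^32)
+C	a48 = u00*v48 + u32*v16		(weight 2^48)
+C while u32*v32 (weight 2^64) and u32*v48 (weight 2^80) are deferred one limb,
+C entering the next limb's a00 and a16 sums with weight 2^0 and 2^16.
+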
+C Possible optimizations:
+C   0. Rewrite to use algorithm of mpn_addmul_2.
+C   1. Align the stack area where we transfer the four 49-bit product-sums
+C      to a 32-byte boundary.  That would minimize cache collisions.
+C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
+C      be to align the area to map to the area immediately before up?)
+C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C      in-development mpn_addmul_2.  This would save many integer instructions.
+C   3. Unrolling.  Questionable if it is worth the code expansion, given that
+C      it could only save 1 cycle/limb.
+C   4. Specialize for particular v values.  If the upper 32 bits of v are
+C      zero, we could save many operations in the FPU (fmuld), but more so in
+C      the IEU since we'll be summing 48-bit quantities, which might be simpler.
+C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
+C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
+C      not be greater than needed for L2 cache latency, and also not so great
+C      that i16 needs to be copied.
+C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
+C      ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C    8 FM
+C   10 FA
+C   12 MEM
+C   10 ISHIFT + 14 IADDLOG
+C    1 BRANCH
+C   55 insns in total (plus one mov insn that should be optimized out)
+
+C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
+C sustain the peak execution rate of 4 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_addmul_1)
+
+C Initialization.  (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
+C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+	save	%sp, -256, %sp
+	mov	-1, %g4
+	srlx	%g4, 48, xffff		C store mask in register `xffff'
+	and	%i3, xffff, %g2
+	stx	%g2, [%sp+2223+0]
+	srlx	%i3, 16, %g3
+	and	%g3, xffff, %g3
+	stx	%g3, [%sp+2223+8]
+	srlx	%i3, 32, %g2
+	and	%g2, xffff, %g2
+	stx	%g2, [%sp+2223+16]
+	srlx	%i3, 48, %g3
+	stx	%g3, [%sp+2223+24]
+	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
+
+	sllx	%i2, 3, %i2
+	mov	0, cy			C clear cy
+	add	%i0, %i2, %i0
+	add	%i1, %i2, %i1
+	neg	%i2
+	add	%i1, 4, %i5
+	add	%i0, -32, %i4
+	add	%i0, -16, %i0
+
+	ldd	[%sp+2223+0], v00
+	ldd	[%sp+2223+8], v16
+	ldd	[%sp+2223+16], v32
+	ldd	[%sp+2223+24], v48
+	ld	[%sp+2223+0],%f2	C zero f2
+	ld	[%sp+2223+0],%f4	C zero f4
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fxtod	v00, v00
+	fxtod	v16, v16
+	fxtod	v32, v32
+	fxtod	v48, v48
+
+C Start real work.  (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fmuld	u00, v00, a00
+	fmuld	u00, v16, a16
+	fmuld	u00, v32, p32
+	fmuld	u32, v00, r32
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_two_or_more
+	fmuld	u32, v16, r48
+
+.L_one:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	add	%i2, 8, %i2
+
+	fdtox	r64, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	add	%i2, 8, %i2
+
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	add	i00, %g5, %g5		C i00+ now in g5
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_1
+	add	%i2, 8, %i2
+
+.L_two_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_three_or_more
+	fmuld	u32, v16, r48
+
+.L_two:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	add	%i2, 8, %i2
+
+	fdtox	r64, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	add	i00, %g5, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_2
+	add	%i2, 8, %i2
+
+.L_three_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_four_or_more
+	fmuld	u32, v16, r48
+
+.L_three:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_3
+	add	%i2, 8, %i2
+
+.L_four_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .Loop
+	fmuld	u32, v16, r48
+
+.L_four:
+	b,a	.L_out_4
+
+C BEGIN MAIN LOOP
+	.align	16
+.Loop:
+C 00
+	srlx	%o4, 16, %o5		C (x >> 16)
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+C 01
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+C 02
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+C 03
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+C 04
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+C 05
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+C 06
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+C 07
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+C 08
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+C 09
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+C 10
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+C 11
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+C 12
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+C 13
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .Loop
+	fmuld	u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	a00, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	faddd	p48, r48, a48
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_3:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	r64, a00
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	ldx	[%i0+%i2], rlimb	C read rp[i]
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_2:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	rlimb, 32, %g4		C HI(rlimb)
+	and	rlimb, xffffffff, %g5	C LO(rlimb)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	add	i00, %g5, %g5		C i00+ now in g5
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	add	i32, %g4, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_1:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	or	%i3, %o5, %o5
+	stx	%o5, [%i4+%i2]
+
+	sllx	i00, 0, %g2
+	add	%g2, cy, cy
+	sllx	i16, 16, %g3
+	add	%g3, cy, cy
+
+	return	%i7+8
+	mov	cy, %o0
+EPILOGUE(mpn_addmul_1)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/addmul_2.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/addmul_2.asm
new file mode 100644
index 0000000..37674d7
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/addmul_2.asm
@@ -0,0 +1,551 @@
+dnl  SPARC v9 64-bit mpn_addmul_2 -- Multiply an n-limb number with a 2-limb
+dnl  number and add the result to an n-limb vector.
+
+dnl  Copyright 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC 1&2:      9
+C UltraSPARC 3:       10
+
+C Algorithm: We use 16 floating-point multiplies per limb product, with the
+C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
+C split into 32-bit pieces.  We sum four 48-bit partial products using
+C floating-point add, then convert the resulting four 50-bit quantities and
+C transfer them to the integer unit.
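+C
+C A rough C model of why the fp arithmetic stays exact (an illustrative
+C sketch only; the helper name is ours, and the pairing of same-weight
+C pieces follows the faddd accumulation chains below):
+C
+C   #include <stdint.h>
+C
+C   /* Each 32-bit-piece-by-16-bit-piece term is < 2^48 and hence exact in
+C      a double; a running sum of four same-weight terms stays < 2^50, well
+C      within the 53-bit mantissa, so faddd/fdtox lose nothing.  */
+C   static int64_t sum4(const uint32_t u[4], const uint16_t v[4])
+C   {
+C       double s = 0.0;
+C       for (int i = 0; i < 4; i++)
+C           s += (double) u[i] * (double) v[i];
+C       return (int64_t) s;
+C   }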
+
+C Possible optimizations:
+C   1. Align the stack area where we transfer the four 50-bit product-sums
+C      to a 32-byte boundary.  That would minimize cache collisions.
+C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
+C      be to align the area to map to the area immediately before up?)
+C   2. Perform two of the fp->int conversions with integer instructions.  We
+C      can get almost ten free IEU slots if we clean up bookkeeping and the
+C      silly carry-limb code.
+C   3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb
+C      code.
+
+C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
+C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
+C FI	= 20
+C L	=  9 x un * vn
+C WDFI	= 10 x vn / 2
+C WD	= 4
+
+C Instruction classification (as per UltraSPARC functional units).
+C Assuming silly carry code is fixed.  Includes bookkeeping.
+C
+C               mpn_addmul_X     mpn_mul_X
+C                1       2       1       2
+C               ==========      ==========
+C      FM        8      16       8      16
+C      FA       10      18      10      18
+C     MEM       12      12      10      10
+C  ISHIFT        6       6       6       6
+C IADDLOG       11      11      10      10
+C  BRANCH        1       1       1       1
+C
+C TOTAL IEU     17      17      16      16
+C TOTAL         48      64      45      61
+C
+C IEU cycles     8.5     8.5     8       8
+C MEM cycles    12      12      10      10
+C ISSUE cycles  12      16      11.25   15.25
+C FPU cycles    10      18      10      18
+C cycles/loop   12      18      12      18
+C cycles/limb   12       9      12       9
+
+
+C INPUT PARAMETERS
+C rp[n + 1]	i0
+C up[n]		i1
+C n		i2
+C vp[2]		i3
+
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+
+C Combine registers:
+C u00_hi= u32_hi
+C u00_lo= u32_lo
+C a000  = out000
+C a016  = out016
+C Free: f52 f54
+
+
+define(`p000', `%f8')  define(`p016',`%f10')
+define(`p032',`%f12')  define(`p048',`%f14')
+define(`p064',`%f16')  define(`p080',`%f18')
+define(`p096a',`%f20') define(`p112a',`%f22')
+define(`p096b',`%f56') define(`p112b',`%f58')
+
+define(`out000',`%f0') define(`out016',`%f6')
+
+define(`v000',`%f24')  define(`v016',`%f26')
+define(`v032',`%f28')  define(`v048',`%f30')
+define(`v064',`%f44')  define(`v080',`%f46')
+define(`v096',`%f48')  define(`v112',`%f50')
+
+define(`u00',`%f32')   define(`u32', `%f34')
+
+define(`a000',`%f36')  define(`a016',`%f38')
+define(`a032',`%f40')  define(`a048',`%f42')
+define(`a064',`%f60')  define(`a080',`%f62')
+
+define(`u00_hi',`%f2') define(`u32_hi',`%f4')
+define(`u00_lo',`%f3') define(`u32_lo',`%f5')
+
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0')    define(`i16',`%l1')
+define(`r00',`%l2')    define(`r32',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+
+PROLOGUE(mpn_addmul_2)
+
+C Initialization.  (1) Split v operand into eight 16-bit chunks and store them
+C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
+C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+C This code could be better scheduled.
+
+	save	%sp, -256, %sp
+
+ifdef(`HAVE_VIS',
+`	mov	-1, %g4
+	wr	%g0, 0xD2, %asi
+	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
+	ldda	[%i3+6] %asi, v000
+	ldda	[%i3+4] %asi, v016
+	ldda	[%i3+2] %asi, v032
+	ldda	[%i3+0] %asi, v048
+	fxtod	v000, v000
+	ldda	[%i3+14] %asi, v064
+	fxtod	v016, v016
+	ldda	[%i3+12] %asi, v080
+	fxtod	v032, v032
+	ldda	[%i3+10] %asi, v096
+	fxtod	v048, v048
+	ldda	[%i3+8] %asi, v112
+	fxtod	v064, v064
+	fxtod	v080, v080
+	fxtod	v096, v096
+	fxtod	v112, v112
+	fzero	u00_hi
+	fzero	u32_hi
+',
+`	mov	-1, %g4
+	ldx	[%i3+0], %l0		C vp[0]
+	srlx	%g4, 48, xffff		C store mask in register `xffff'
+	ldx	[%i3+8], %l1		C vp[1]
+
+	and	%l0, xffff, %g2
+	stx	%g2, [%sp+2223+0]
+	srlx	%l0, 16, %g3
+	and	%g3, xffff, %g3
+	stx	%g3, [%sp+2223+8]
+	srlx	%l0, 32, %g2
+	and	%g2, xffff, %g2
+	stx	%g2, [%sp+2223+16]
+	srlx	%l0, 48, %g3
+	stx	%g3, [%sp+2223+24]
+	and	%l1, xffff, %g2
+	stx	%g2, [%sp+2223+32]
+	srlx	%l1, 16, %g3
+	and	%g3, xffff, %g3
+	stx	%g3, [%sp+2223+40]
+	srlx	%l1, 32, %g2
+	and	%g2, xffff, %g2
+	stx	%g2, [%sp+2223+48]
+	srlx	%l1, 48, %g3
+	stx	%g3, [%sp+2223+56]
+
+	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
+
+	ldd	[%sp+2223+0], v000
+	ldd	[%sp+2223+8], v016
+	ldd	[%sp+2223+16], v032
+	ldd	[%sp+2223+24], v048
+	fxtod	v000, v000
+	ldd	[%sp+2223+32], v064
+	fxtod	v016, v016
+	ldd	[%sp+2223+40], v080
+	fxtod	v032, v032
+	ldd	[%sp+2223+48], v096
+	fxtod	v048, v048
+	ldd	[%sp+2223+56], v112
+	fxtod	v064, v064
+	ld	[%sp+2223+0], u00_hi	C zero u00_hi
+	fxtod	v080, v080
+	ld	[%sp+2223+0], u32_hi	C zero u32_hi
+	fxtod	v096, v096
+	fxtod	v112, v112
+')
+C Initialization done.
+	mov	0, %g2
+	mov	0, rlimb
+	mov	0, %g4
+	add	%i0, -8, %i0		C BOOKKEEPING
+
+C Start software pipeline.
+
+	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
+	fxtod	u00_hi, u00
+C mid
+	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
+	fmuld	u00, v000, a000
+	fmuld	u00, v016, a016
+	fmuld	u00, v032, a032
+	fmuld	u00, v048, a048
+	add	%i2, -1, %i2		C BOOKKEEPING
+	fmuld	u00, v064, p064
+	add	%i1, 8, %i1		C BOOKKEEPING
+	fxtod	u32_hi, u32
+	fmuld	u00, v080, p080
+	fmuld	u00, v096, p096a
+	brnz,pt	%i2, .L_2_or_more
+	 fmuld	u00, v112, p112a
+
+.L1:	fdtox	a000, out000
+	fmuld	u32, v000, p000
+	fdtox	a016, out016
+	fmuld	u32, v016, p016
+	fmovd	p064, a064
+	fmuld	u32, v032, p032
+	fmovd	p080, a080
+	fmuld	u32, v048, p048
+	std	out000, [%sp+2223+16]
+	faddd	p000, a032, a000
+	fmuld	u32, v064, p064
+	std	out016, [%sp+2223+24]
+	fxtod	u00_hi, u00
+	faddd	p016, a048, a016
+	fmuld	u32, v080, p080
+	faddd	p032, a064, a032
+	fmuld	u32, v096, p096b
+	faddd	p048, a080, a048
+	fmuld	u32, v112, p112b
+C mid
+	fdtox	a000, out000
+	fdtox	a016, out016
+	faddd	p064, p096a, a064
+	faddd	p080, p112a, a080
+	std	out000, [%sp+2223+0]
+	b	.L_wd2
+	 std	out016, [%sp+2223+8]
+
+.L_2_or_more:
+	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
+	fdtox	a000, out000
+	fmuld	u32, v000, p000
+	fdtox	a016, out016
+	fmuld	u32, v016, p016
+	fmovd	p064, a064
+	fmuld	u32, v032, p032
+	fmovd	p080, a080
+	fmuld	u32, v048, p048
+	std	out000, [%sp+2223+16]
+	faddd	p000, a032, a000
+	fmuld	u32, v064, p064
+	std	out016, [%sp+2223+24]
+	fxtod	u00_hi, u00
+	faddd	p016, a048, a016
+	fmuld	u32, v080, p080
+	faddd	p032, a064, a032
+	fmuld	u32, v096, p096b
+	faddd	p048, a080, a048
+	fmuld	u32, v112, p112b
+C mid
+	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
+	fdtox	a000, out000
+	fmuld	u00, v000, p000
+	fdtox	a016, out016
+	fmuld	u00, v016, p016
+	faddd	p064, p096a, a064
+	fmuld	u00, v032, p032
+	faddd	p080, p112a, a080
+	fmuld	u00, v048, p048
+	add	%i2, -1, %i2		C BOOKKEEPING
+	std	out000, [%sp+2223+0]
+	faddd	p000, a032, a000
+	fmuld	u00, v064, p064
+	add	%i1, 8, %i1		C BOOKKEEPING
+	std	out016, [%sp+2223+8]
+	fxtod	u32_hi, u32
+	faddd	p016, a048, a016
+	fmuld	u00, v080, p080
+	faddd	p032, a064, a032
+	fmuld	u00, v096, p096a
+	faddd	p048, a080, a048
+	brnz,pt	%i2, .L_3_or_more
+	 fmuld	u00, v112, p112a
+
+	b	.Lend
+	 nop
+
+C  64      32       0
+C   .       .       .
+C   .       |__rXXX_|	32
+C   .      |___cy___|	34
+C   .  |_______i00__|	50
+C  |_______i16__|   .	50
+
+
+C BEGIN MAIN LOOP
+	.align	16
+.L_3_or_more:
+.Loop:	ld	[%i1+4], u00_lo		C read low 32 bits of up[i]
+	and	%g2, xffffffff, %g2
+	fdtox	a000, out000
+	fmuld	u32, v000, p000
+C
+	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
+	add	%g2, rlimb, %l5
+	fdtox	a016, out016
+	fmuld	u32, v016, p016
+C
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+16], i00
+	faddd	p064, p096b, a064
+	fmuld	u32, v032, p032
+C
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+24], i16
+	faddd	p080, p112b, a080
+	fmuld	u32, v048, p048
+C
+	nop
+	std	out000, [%sp+2223+16]
+	faddd	p000, a032, a000
+	fmuld	u32, v064, p064
+C
+	add	i00, r00, rlimb
+	add	%i0, 8, %i0		C BOOKKEEPING
+	std	out016, [%sp+2223+24]
+	fxtod	u00_hi, u00
+C
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	faddd	p016, a048, a016
+	fmuld	u32, v080, p080
+C
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	faddd	p032, a064, a032
+	fmuld	u32, v096, p096b
+C
+	stw	%l5, [%i0+4]
+	nop
+	faddd	p048, a080, a048
+	fmuld	u32, v112, p112b
+C midloop
+	ld	[%i1+0], u32_lo		C read high 32 bits of up[i]
+	and	%g2, xffffffff, %g2
+	fdtox	a000, out000
+	fmuld	u00, v000, p000
+C
+	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
+	add	%g2, rlimb, %l5
+	fdtox	a016, out016
+	fmuld	u00, v016, p016
+C
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+0], i00
+	faddd	p064, p096a, a064
+	fmuld	u00, v032, p032
+C
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+8], i16
+	faddd	p080, p112a, a080
+	fmuld	u00, v048, p048
+C
+	add	%i2, -1, %i2		C BOOKKEEPING
+	std	out000, [%sp+2223+0]
+	faddd	p000, a032, a000
+	fmuld	u00, v064, p064
+C
+	add	i00, r32, rlimb
+	add	%i1, 8, %i1		C BOOKKEEPING
+	std	out016, [%sp+2223+8]
+	fxtod	u32_hi, u32
+C
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	faddd	p016, a048, a016
+	fmuld	u00, v080, p080
+C
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	faddd	p032, a064, a032
+	fmuld	u00, v096, p096a
+C
+	stw	%l5, [%i0+0]
+	faddd	p048, a080, a048
+	brnz,pt	%i2, .Loop
+	 fmuld	u00, v112, p112a
+C END MAIN LOOP
+
+C WIND-DOWN PHASE 1
+.Lend:	and	%g2, xffffffff, %g2
+	fdtox	a000, out000
+	fmuld	u32, v000, p000
+	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
+	add	%g2, rlimb, %l5
+	fdtox	a016, out016
+	fmuld	u32, v016, p016
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+16], i00
+	faddd	p064, p096b, a064
+	fmuld	u32, v032, p032
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+24], i16
+	faddd	p080, p112b, a080
+	fmuld	u32, v048, p048
+	std	out000, [%sp+2223+16]
+	faddd	p000, a032, a000
+	fmuld	u32, v064, p064
+	add	i00, r00, rlimb
+	add	%i0, 8, %i0		C BOOKKEEPING
+	std	out016, [%sp+2223+24]
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	faddd	p016, a048, a016
+	fmuld	u32, v080, p080
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	faddd	p032, a064, a032
+	fmuld	u32, v096, p096b
+	stw	%l5, [%i0+4]
+	faddd	p048, a080, a048
+	fmuld	u32, v112, p112b
+C mid
+	and	%g2, xffffffff, %g2
+	fdtox	a000, out000
+	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
+	add	%g2, rlimb, %l5
+	fdtox	a016, out016
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+0], i00
+	faddd	p064, p096a, a064
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+8], i16
+	faddd	p080, p112a, a080
+	std	out000, [%sp+2223+0]
+	add	i00, r32, rlimb
+	std	out016, [%sp+2223+8]
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	stw	%l5, [%i0+0]
+
+C WIND-DOWN PHASE 2
+.L_wd2:	and	%g2, xffffffff, %g2
+	fdtox	a032, out000
+	lduw	[%i0+4+8], r00		C read low 32 bits of rp[i]
+	add	%g2, rlimb, %l5
+	fdtox	a048, out016
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+16], i00
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+24], i16
+	std	out000, [%sp+2223+16]
+	add	i00, r00, rlimb
+	add	%i0, 8, %i0		C BOOKKEEPING
+	std	out016, [%sp+2223+24]
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	stw	%l5, [%i0+4]
+C mid
+	and	%g2, xffffffff, %g2
+	fdtox	a064, out000
+	lduw	[%i0+0], r32		C read high 32 bits of rp[i]
+	add	%g2, rlimb, %l5
+	fdtox	a080, out016
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+0], i00
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+8], i16
+	std	out000, [%sp+2223+0]
+	add	i00, r32, rlimb
+	std	out016, [%sp+2223+8]
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	stw	%l5, [%i0+0]
+
+C WIND-DOWN PHASE 3
+.L_wd3:	and	%g2, xffffffff, %g2
+	fdtox	p096b, out000
+	add	%g2, rlimb, %l5
+	fdtox	p112b, out016
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+16], rlimb
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+24], i16
+	std	out000, [%sp+2223+16]
+	add	%i0, 8, %i0		C BOOKKEEPING
+	std	out016, [%sp+2223+24]
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	stw	%l5, [%i0+4]
+C mid
+	and	%g2, xffffffff, %g2
+	add	%g2, rlimb, %l5
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+0], rlimb
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+8], i16
+	sllx	i16, 16, %g2
+	add	cy, rlimb, rlimb
+	srlx	i16, 16, %g4
+	add	%g2, rlimb, %l5
+	stw	%l5, [%i0+0]
+
+	and	%g2, xffffffff, %g2
+	add	%g2, rlimb, %l5
+	srlx	%l5, 32, cy
+	ldx	[%sp+2223+16], i00
+	add	%g4, cy, cy		C new cy
+	ldx	[%sp+2223+24], i16
+
+	sllx	i16, 16, %g2
+	add	i00, cy, cy
+	return	%i7+8
+	add	%g2, cy, %o0
+EPILOGUE(mpn_addmul_2)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm
new file mode 100644
index 0000000..47286d5
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/lshiftc.asm
@@ -0,0 +1,165 @@
+dnl  SPARC v9 mpn_lshiftc
+
+dnl  Copyright 1996, 2000-2003, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     3
+C UltraSPARC 3:	      2.67
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`cnt',`%i3')
+
+define(`u0', `%l0')
+define(`u1', `%l2')
+define(`u2', `%l4')
+define(`u3', `%l6')
+
+define(`tnc',`%i4')
+
+define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_lshiftc)
+	save	%sp,-160,%sp
+
+	sllx	n,3,%g1
+	sub	%g0,cnt,tnc		C negate shift count
+	add	up,%g1,up		C make up point at end of src
+	add	rp,%g1,rp		C make rp point at end of res
+	ldx	[up-8],u3		C load first limb
+	subcc	n,5,n
+	srlx	u3,tnc,%i5		C compute function result
+	bl,pn	%xcc,.Lend1234
+	sllx	u3,cnt,%g3
+
+	subcc	n,4,n
+	ldx	[up-16],u0
+	ldx	[up-24],u1
+	add	up,-32,up
+	ldx	[up-0],u2
+	ldx	[up-8],u3
+	srlx	u0,tnc,%g2
+	bl,pn	%xcc,.Lend5678
+	not	%g3, %g3
+
+	b,a	.Loop
+	ALIGN(16)
+.Loop:
+	sllx	u0,cnt,%g1
+	andn	%g3,%g2,%g3
+	ldx	[up-16],u0
+	fanop
+C --
+	srlx	u1,tnc,%g2
+	subcc	n,4,n
+	stx	%g3,[rp-8]
+	not	%g1, %g1
+C --
+	sllx	u1,cnt,%g3
+	andn	%g1,%g2,%g1
+	ldx	[up-24],u1
+	fanop
+C --
+	srlx	u2,tnc,%g2
+	stx	%g1,[rp-16]
+	add	up,-32,up
+	not	%g3, %g3
+C --
+	sllx	u2,cnt,%g1
+	andn	%g3,%g2,%g3
+	ldx	[up-0],u2
+	fanop
+C --
+	srlx	u3,tnc,%g2
+	stx	%g3,[rp-24]
+	add	rp,-32,rp
+	not	%g1, %g1
+C --
+	sllx	u3,cnt,%g3
+	andn	%g1,%g2,%g1
+	ldx	[up-8],u3
+	fanop
+C --
+	srlx	u0,tnc,%g2
+	stx	%g1,[rp-0]
+	bge,pt	%xcc,.Loop
+	not	%g3, %g3
+C --
+.Lend5678:
+	sllx	u0,cnt,%g1
+	andn	%g3,%g2,%g3
+	srlx	u1,tnc,%g2
+	stx	%g3,[rp-8]
+	not	%g1, %g1
+	sllx	u1,cnt,%g3
+	andn	%g1,%g2,%g1
+	srlx	u2,tnc,%g2
+	stx	%g1,[rp-16]
+	not	%g3, %g3
+	sllx	u2,cnt,%g1
+	andn	%g3,%g2,%g3
+	srlx	u3,tnc,%g2
+	stx	%g3,[rp-24]
+	add	rp,-32,rp
+	not	%g1, %g1
+	sllx	u3,cnt,%g3		C carry...
+	andn	%g1,%g2,%g1
+	stx	%g1,[rp-0]
+
+.Lend1234:
+	addcc	n,4,n
+	bz,pn	%xcc,.Lret
+	fanop
+.Loop0:
+	add	rp,-8,rp
+	subcc	n,1,n
+	ldx	[up-16],u3
+	add	up,-8,up
+	srlx	u3,tnc,%g2
+	not	%g3, %g3
+	andn	%g3,%g2,%g3
+	stx	%g3,[rp]
+	sllx	u3,cnt,%g3
+	bnz,pt	%xcc,.Loop0
+	fanop
+.Lret:
+	not	%g3, %g3
+	stx	%g3,[rp-8]
+	mov	%i5,%i0
+	ret
+	restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/mul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/mul_1.asm
new file mode 100644
index 0000000..871d562
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/mul_1.asm
@@ -0,0 +1,580 @@
+dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
+dnl  the result in a second limb vector.
+
+dnl  Copyright 1998, 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     14
+C UltraSPARC 3:	      18.5
+
+C Algorithm: We use eight floating-point multiplies per limb product, with the
+C invariant v operand split into four 16-bit pieces, and the s1 operand split
+C into 32-bit pieces.  We sum pairs of 48-bit partial products using
+C floating-point add, then convert the four 49-bit product-sums and transfer
+C them to the integer unit.
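+C
+C A small C model of one product-sum (an illustrative sketch only; the
+C helper name is ours):
+C
+C   #include <stdint.h>
+C
+C   /* Two same-weight partial products, e.g. u00*v32 and u32*v00: each is
+C      < 2^48 and exact in a double, and their sum is < 2^49, so the fdtox
+C      conversion recovers it exactly.  */
+C   static int64_t product_sum(uint32_t u00, uint32_t u32,
+C                              uint16_t v00, uint16_t v32)
+C   {
+C       double p = (double) u00 * (double) v32;
+C       double r = (double) u32 * (double) v00;
+C       return (int64_t) (p + r);
+C   }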
+
+C Possible optimizations:
+C   1. Align the stack area where we transfer the four 49-bit product-sums
+C      to a 32-byte boundary.  That would minimize cache collisions.
+C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
+C      be to align the area to map to the area immediately before s1?)
+C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
+C      development mpn_addmul_2.  This would save many integer instructions.
+C   3. Unrolling.  Questionable if it is worth the code expansion, given that
+C      it could only save 1 cycle/limb.
+C   4. Specialize for particular v values.  If v's upper 32 bits are zero, we
+C      could save many operations in the FPU (fmuld), but even more in the
+C      IEU, since we'll be summing 48-bit quantities, which might be simpler.
+C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies
+C      further apart, and the i00,i16,i32,i48 RAW dependencies closer
+C      together.  The latter spacing should not exceed what L2 cache latency
+C      requires, and also not be so great that i16 needs to be copied.
+C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
+C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
+C      ops.)
+
+C Instruction classification (as per UltraSPARC-1/2 functional units):
+C    8 FM
+C   10 FA
+C   11 MEM
+C    9 ISHIFT + 10? IADDLOG
+C    1 BRANCH
+C   49 insns in total (plus three mov insns that should be optimized out)
+
+C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
+C sustain 3.79 instructions/cycle.
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+
+define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
+define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
+define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
+define(`u00',`%f32') define(`u32', `%f34')
+define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
+define(`cy',`%g1')
+define(`rlimb',`%g3')
+define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
+define(`xffffffff',`%l7')
+define(`xffff',`%o0')
+
+PROLOGUE(mpn_mul_1)
+
+C Initialization.  (1) Split v operand into four 16-bit chunks and store them
+C as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
+C f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
+
+	save	%sp, -256, %sp
+	mov	-1, %g4
+	srlx	%g4, 48, xffff		C store mask in register `xffff'
+	and	%i3, xffff, %g2
+	stx	%g2, [%sp+2223+0]
+	srlx	%i3, 16, %g3
+	and	%g3, xffff, %g3
+	stx	%g3, [%sp+2223+8]
+	srlx	%i3, 32, %g2
+	and	%g2, xffff, %g2
+	stx	%g2, [%sp+2223+16]
+	srlx	%i3, 48, %g3
+	stx	%g3, [%sp+2223+24]
+	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'
+
+	sllx	%i2, 3, %i2
+	mov	0, cy			C clear cy
+	add	%i0, %i2, %i0
+	add	%i1, %i2, %i1
+	neg	%i2
+	add	%i1, 4, %i5
+	add	%i0, -32, %i4
+	add	%i0, -16, %i0
+
+	ldd	[%sp+2223+0], v00
+	ldd	[%sp+2223+8], v16
+	ldd	[%sp+2223+16], v32
+	ldd	[%sp+2223+24], v48
+	ld	[%sp+2223+0],%f2	C zero f2
+	ld	[%sp+2223+0],%f4	C zero f4
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fxtod	v00, v00
+	fxtod	v16, v16
+	fxtod	v32, v32
+	fxtod	v48, v48
+
+C Start real work.  (We sneakily read f3 and f5 above...)
+C The software pipeline is very deep, requiring 4 feed-in stages.
+
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fmuld	u00, v00, a00
+	fmuld	u00, v16, a16
+	fmuld	u00, v32, p32
+	fmuld	u32, v00, r32
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_two_or_more
+	fmuld	u32, v16, r48
+
+.L_one:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	add	%i2, 8, %i2
+
+	fdtox	r64, a00
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	add	%i2, 8, %i2
+
+	mov	i00, %g5		C i00+ now in g5
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_1
+	add	%i2, 8, %i2
+
+.L_two_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	fdtox	a32, a32
+	fxtod	%f2, u00
+	fxtod	%f4, u32
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_three_or_more
+	fmuld	u32, v16, r48
+
+.L_two:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	ldx	[%sp+2223+16], i32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	std	a16, [%sp+2223+8]
+	std	a32, [%sp+2223+16]
+	std	a48, [%sp+2223+24]
+	add	%i2, 8, %i2
+
+	fdtox	r64, a00
+	mov	i00, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_2
+	add	%i2, 8, %i2
+
+.L_three_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .L_four_or_more
+	fmuld	u32, v16, r48
+
+.L_three:
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	mov	i00, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	b	.L_out_3
+	add	%i2, 8, %i2
+
+.L_four_or_more:
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	mov	i00, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .Loop
+	fmuld	u32, v16, r48
+
+.L_four:
+	b,a	.L_out_4
+
+C BEGIN MAIN LOOP
+	.align	16
+.Loop:
+C 00
+	srlx	%o4, 16, %o5		C (x >> 16)
+	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+C 01
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
+	fdtox	a00, a00
+C 02
+	faddd	p48, r48, a48
+C 03
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	mov	i00, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+C 04
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+C 05
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	fxtod	%f2, u00
+C 06
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	fxtod	%f4, u32
+C 07
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+C 08
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	fmuld	u00, v00, p00
+C 09
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	fmuld	u00, v16, p16
+C 10
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	fmuld	u00, v32, p32
+C 11
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	faddd	p00, r64, a00
+	fmuld	u32, v00, r32
+C 12
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	faddd	p16, r80, a16
+	fmuld	u00, v48, p48
+C 13
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	addcc	%i2, 8, %i2
+	bnz,pt	%xcc, .Loop
+	fmuld	u32, v16, r48
+C END MAIN LOOP
+
+.L_out_4:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	fmuld	u32, v32, r64	C FIXME not urgent
+	faddd	p32, r32, a32
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	a00, a00
+	faddd	p48, r48, a48
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	mov	i00, %g5		C i00+ now in g5
+	fmuld	u32, v48, r80	C FIXME not urgent
+	fdtox	a16, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	fdtox	a32, a32
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	fdtox	a48, a48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	std	a32, [%sp+2223+16]
+	add	%l6, %o2, %o2		C mi64- in %o2
+	std	a48, [%sp+2223+24]
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_3:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	fdtox	r64, a00
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	mov	i00, %g5		C i00+ now in g5
+	fdtox	r80, a16
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	ldx	[%sp+2223+16], i32
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	ldx	[%sp+2223+24], i48
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	std	a00, [%sp+2223+0]
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	std	a16, [%sp+2223+8]
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_2:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	mov	i00, %g5		C i00+ now in g5
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	ldx	[%sp+2223+0], i00
+	srlx	i16, 48, %l4		C (i16 >> 48)
+	mov	i16, %g2
+	ldx	[%sp+2223+8], i16
+	srlx	i48, 16, %l5		C (i48 >> 16)
+	mov	i32, %g4		C i32+ now in g4
+	sllx	i48, 32, %l6		C (i48 << 32)
+	or	%i3, %o5, %o5
+	srlx	%g4, 32, %o3		C (i32 >> 32)
+	add	%l5, %l4, %o1		C hi64- in %o1
+	sllx	%g4, 16, %o2		C (i32 << 16)
+	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
+	sllx	%o1, 48, %o3		C (hi64 << 48)
+	add	%g2, %o2, %o2		C mi64- in %o2
+	add	%l6, %o2, %o2		C mi64- in %o2
+	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
+	stx	%o5, [%i4+%i2]
+	add	cy, %g5, %o4		C x = prev(i00) + cy
+	add	%i2, 8, %i2
+.L_out_1:
+	srlx	%o4, 16, %o5		C (x >> 16)
+	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
+	and	%o4, xffff, %o5		C (x & 0xffff)
+	srlx	%o2, 48, %o7		C (mi64 >> 48)
+	sllx	%o2, 16, %i3		C (mi64 << 16)
+	add	%o7, %o1, cy		C new cy
+	or	%i3, %o5, %o5
+	stx	%o5, [%i4+%i2]
+
+	sllx	i00, 0, %g2
+	add	%g2, cy, cy
+	sllx	i16, 16, %g3
+	add	%g3, cy, cy
+
+	return	%i7+8
+	mov	cy, %o0
+EPILOGUE(mpn_mul_1)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
new file mode 100644
index 0000000..43c69d3
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/sqr_diagonal.asm
@@ -0,0 +1,342 @@
+dnl  SPARC v9 64-bit mpn_sqr_diagonal.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     22
+C UltraSPARC 3:	      36
+
+C This was generated by the Sun C compiler.  It runs at 22 cycles/limb on the
+C UltraSPARC-1/2, three cycles slower than theoretically possible for optimal
+C code using the same algorithm.  For 1-3 limbs, a special loop was generated,
+C which causes performance problems in particular for 2 and 3 limbs.
+C Ultimately, this should be replaced by hand-written code in the same software
+C pipeline style as, e.g., addmul_1.asm.
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp, -240, %sp
+
+	sethi	%hi(0x1ffc00), %o0
+	sethi	%hi(0x3ffc00), %o1
+	add	%o0, 1023, %o7
+	cmp	%i2, 4
+	add	%o1, 1023, %o4
+	or	%g0, %i1, %g1
+	or	%g0, %i0, %o0
+	bl,pn	%xcc, .Lsmall
+	or	%g0, 0, %g2
+
+	ldx	[%i1], %o1
+	add	%i1, 24, %g1
+	or	%g0, 3, %g2
+	srlx	%o1, 42, %g3
+	stx	%g3, [%sp+2279]
+	and	%o1, %o7, %o2
+	stx	%o2, [%sp+2263]
+	srlx	%o1, 21, %o1
+	ldd	[%sp+2279], %f0
+	and	%o1, %o7, %o1
+	stx	%o1, [%sp+2271]
+	ldx	[%i1+8], %o2
+	fxtod	%f0, %f12
+	srlx	%o2, 21, %o1
+	and	%o2, %o7, %g3
+	ldd	[%sp+2263], %f2
+	fmuld	%f12, %f12, %f10
+	srlx	%o2, 42, %o2
+	ldd	[%sp+2271], %f0
+	and	%o1, %o7, %o1
+	fxtod	%f2, %f8
+	stx	%o2, [%sp+2279]
+	stx	%o1, [%sp+2271]
+	fxtod	%f0, %f0
+	stx	%g3, [%sp+2263]
+	fdtox	%f10, %f14
+	fmuld	%f12, %f8, %f6
+	ldx	[%i1+16], %o2
+	std	%f14, [%sp+2255]
+	fmuld	%f0, %f0, %f2
+	fmuld	%f8, %f8, %f10
+	srlx	%o2, 42, %o1
+	faddd	%f6, %f6, %f6
+	fmuld	%f12, %f0, %f12
+	fmuld	%f0, %f8, %f8
+	ldd	[%sp+2279], %f0
+	ldd	[%sp+2263], %f4
+	fdtox	%f10, %f10
+	std	%f10, [%sp+2239]
+	faddd	%f2, %f6, %f6
+	ldd	[%sp+2271], %f2
+	fdtox	%f12, %f12
+	std	%f12, [%sp+2247]
+	fdtox	%f8, %f8
+	std	%f8, [%sp+2231]
+	fdtox	%f6, %f6
+	std	%f6, [%sp+2223]
+
+.Loop:	srlx	%o2, 21, %g3
+	stx	%o1, [%sp+2279]
+	add	%g2, 1, %g2
+	and	%g3, %o7, %o1
+	ldx	[%sp+2255], %g4
+	cmp	%g2, %i2
+	stx	%o1, [%sp+2271]
+	add	%g1, 8, %g1
+	add	%o0, 16, %o0
+	ldx	[%sp+2239], %o1
+	fxtod	%f0, %f10
+	fxtod	%f4, %f14
+	ldx	[%sp+2231], %i0
+	ldx	[%sp+2223], %g5
+	ldx	[%sp+2247], %g3
+	and	%o2, %o7, %o2
+	fxtod	%f2, %f8
+	fmuld	%f10, %f10, %f0
+	stx	%o2, [%sp+2263]
+	fmuld	%f10, %f14, %f6
+	ldx	[%g1-8], %o2
+	fmuld	%f10, %f8, %f12
+	fdtox	%f0, %f2
+	ldd	[%sp+2279], %f0
+	fmuld	%f8, %f8, %f4
+	faddd	%f6, %f6, %f6
+	fmuld	%f14, %f14, %f10
+	std	%f2, [%sp+2255]
+	sllx	%g4, 20, %g4
+	ldd	[%sp+2271], %f2
+	fmuld	%f8, %f14, %f8
+	sllx	%i0, 22, %i1
+	fdtox	%f12, %f12
+	std	%f12, [%sp+2247]
+	sllx	%g5, 42, %i0
+	add	%o1, %i1, %o1
+	faddd	%f4, %f6, %f6
+	ldd	[%sp+2263], %f4
+	add	%o1, %i0, %o1
+	add	%g3, %g4, %g3
+	fdtox	%f10, %f10
+	std	%f10, [%sp+2239]
+	srlx	%o1, 42, %g4
+	and	%g5, %o4, %i0
+	fdtox	%f8, %f8
+	std	%f8, [%sp+2231]
+	srlx	%g5, 22, %g5
+	sub	%g4, %i0, %g4
+	fdtox	%f6, %f6
+	std	%f6, [%sp+2223]
+	srlx	%g4, 63, %g4
+	add	%g3, %g5, %g3
+	add	%g3, %g4, %g3
+	stx	%o1, [%o0-16]
+	srlx	%o2, 42, %o1
+	bl,pt	%xcc, .Loop
+	stx	%g3, [%o0-8]
+
+	stx	%o1, [%sp+2279]
+	srlx	%o2, 21, %o1
+	fxtod	%f0, %f16
+	ldx	[%sp+2223], %g3
+	fxtod	%f4, %f6
+	and	%o2, %o7, %o3
+	stx	%o3, [%sp+2263]
+	fxtod	%f2, %f4
+	and	%o1, %o7, %o1
+	ldx	[%sp+2231], %o2
+	sllx	%g3, 42, %g4
+	fmuld	%f16, %f16, %f14
+	stx	%o1, [%sp+2271]
+	fmuld	%f16, %f6, %f8
+	add	%o0, 48, %o0
+	ldx	[%sp+2239], %o1
+	sllx	%o2, 22, %o2
+	fmuld	%f4, %f4, %f10
+	ldx	[%sp+2255], %o3
+	fdtox	%f14, %f14
+	fmuld	%f4, %f6, %f2
+	std	%f14, [%sp+2255]
+	faddd	%f8, %f8, %f12
+	add	%o1, %o2, %o2
+	fmuld	%f16, %f4, %f4
+	ldd	[%sp+2279], %f0
+	sllx	%o3, 20, %g5
+	add	%o2, %g4, %o2
+	fmuld	%f6, %f6, %f6
+	srlx	%o2, 42, %o3
+	and	%g3, %o4, %g4
+	srlx	%g3, 22, %g3
+	faddd	%f10, %f12, %f16
+	ldd	[%sp+2271], %f12
+	ldd	[%sp+2263], %f8
+	fxtod	%f0, %f0
+	sub	%o3, %g4, %o3
+	ldx	[%sp+2247], %o1
+	srlx	%o3, 63, %o3
+	fdtox	%f2, %f10
+	fxtod	%f8, %f8
+	std	%f10, [%sp+2231]
+	fdtox	%f6, %f6
+	std	%f6, [%sp+2239]
+	add	%o1, %g5, %o1
+	fmuld	%f0, %f0, %f2
+	fdtox	%f16, %f16
+	std	%f16, [%sp+2223]
+	add	%o1, %g3, %o1
+	fdtox	%f4, %f4
+	std	%f4, [%sp+2247]
+	fmuld	%f0, %f8, %f10
+	fxtod	%f12, %f12
+	add	%o1, %o3, %o1
+	stx	%o2, [%o0-48]
+	fmuld	%f8, %f8, %f6
+	stx	%o1, [%o0-40]
+	fdtox	%f2, %f2
+	ldx	[%sp+2231], %o2
+	faddd	%f10, %f10, %f10
+	ldx	[%sp+2223], %g3
+	fmuld	%f12, %f12, %f4
+	fdtox	%f6, %f6
+	ldx	[%sp+2239], %o1
+	sllx	%o2, 22, %o2
+	fmuld	%f12, %f8, %f8
+	sllx	%g3, 42, %g5
+	ldx	[%sp+2255], %o3
+	fmuld	%f0, %f12, %f0
+	add	%o1, %o2, %o2
+	faddd	%f4, %f10, %f4
+	ldx	[%sp+2247], %o1
+	add	%o2, %g5, %o2
+	and	%g3, %o4, %g4
+	fdtox	%f8, %f8
+	sllx	%o3, 20, %g5
+	std	%f8, [%sp+2231]
+	fdtox	%f0, %f0
+	srlx	%o2, 42, %o3
+	add	%o1, %g5, %o1
+	fdtox	%f4, %f4
+	srlx	%g3, 22, %g3
+	sub	%o3, %g4, %o3
+	std	%f6, [%sp+2239]
+	std	%f4, [%sp+2223]
+	srlx	%o3, 63, %o3
+	add	%o1, %g3, %o1
+	std	%f2, [%sp+2255]
+	add	%o1, %o3, %o1
+	std	%f0, [%sp+2247]
+	stx	%o2, [%o0-32]
+	stx	%o1, [%o0-24]
+	ldx	[%sp+2231], %o2
+	ldx	[%sp+2223], %o3
+	ldx	[%sp+2239], %o1
+	sllx	%o2, 22, %o2
+	sllx	%o3, 42, %g5
+	ldx	[%sp+2255], %g4
+	and	%o3, %o4, %g3
+	add	%o1, %o2, %o2
+	ldx	[%sp+2247], %o1
+	add	%o2, %g5, %o2
+	stx	%o2, [%o0-16]
+	sllx	%g4, 20, %g4
+	srlx	%o2, 42, %o2
+	add	%o1, %g4, %o1
+	srlx	%o3, 22, %o3
+	sub	%o2, %g3, %o2
+	srlx	%o2, 63, %o2
+	add	%o1, %o3, %o1
+	add	%o1, %o2, %o1
+	stx	%o1, [%o0-8]
+	ret
+	restore	%g0, %g0, %g0
+.Lsmall:
+	ldx	[%g1], %o2
+.Loop0:
+	and	%o2, %o7, %o1
+	stx	%o1, [%sp+2263]
+	add	%g2, 1, %g2
+	srlx	%o2, 21, %o1
+	add	%g1, 8, %g1
+	srlx	%o2, 42, %o2
+	stx	%o2, [%sp+2279]
+	and	%o1, %o7, %o1
+	ldd	[%sp+2263], %f0
+	cmp	%g2, %i2
+	stx	%o1, [%sp+2271]
+	fxtod	%f0, %f6
+	ldd	[%sp+2279], %f0
+	ldd	[%sp+2271], %f4
+	fxtod	%f0, %f2
+	fmuld	%f6, %f6, %f0
+	fxtod	%f4, %f10
+	fmuld	%f2, %f6, %f4
+	fdtox	%f0, %f0
+	std	%f0, [%sp+2239]
+	fmuld	%f10, %f6, %f8
+	fmuld	%f10, %f10, %f0
+	faddd	%f4, %f4, %f6
+	fmuld	%f2, %f2, %f4
+	fdtox	%f8, %f8
+	std	%f8, [%sp+2231]
+	fmuld	%f2, %f10, %f2
+	faddd	%f0, %f6, %f0
+	fdtox	%f4, %f4
+	std	%f4, [%sp+2255]
+	fdtox	%f2, %f2
+	std	%f2, [%sp+2247]
+	fdtox	%f0, %f0
+	std	%f0, [%sp+2223]
+	ldx	[%sp+2239], %o1
+	ldx	[%sp+2255], %g4
+	ldx	[%sp+2231], %o2
+	sllx	%g4, 20, %g4
+	ldx	[%sp+2223], %o3
+	sllx	%o2, 22, %o2
+	sllx	%o3, 42, %g5
+	add	%o1, %o2, %o2
+	ldx	[%sp+2247], %o1
+	add	%o2, %g5, %o2
+	stx	%o2, [%o0]
+	and	%o3, %o4, %g3
+	srlx	%o2, 42, %o2
+	add	%o1, %g4, %o1
+	srlx	%o3, 22, %o3
+	sub	%o2, %g3, %o2
+	srlx	%o2, 63, %o2
+	add	%o1, %o3, %o1
+	add	%o1, %o2, %o1
+	stx	%o1, [%o0+8]
+	add	%o0, 16, %o0
+	bl,a,pt	%xcc, .Loop0
+	ldx	[%g1], %o2
+	ret
+	restore	%g0, %g0, %g0
+EPILOGUE(mpn_sqr_diagonal)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/sub_n.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/sub_n.asm
new file mode 100644
index 0000000..9fb7f70
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/sub_n.asm
@@ -0,0 +1,241 @@
+dnl  SPARC v9 mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store the difference in a third limb vector.
+
+dnl  Copyright 2001-2003, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     4
+C UltraSPARC 3:	      4.5
+
+C Compute carry-out from the most significant bits of u, v, and r, where
+C r=u-v-carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2.  It has a 4-insn
+C recurrence, and on UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...
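+
+C In C terms, the borrow recurrence is (an illustrative sketch only; the
+C helper name is ours):
+C
+C   #include <stdint.h>
+C
+C   /* One limb of r = u - v - cy.  The borrow-out formula is the bitwise
+C      identity the orn/andn sequence in the loop evaluates (modulo De
+C      Morgan), keeping only the most significant bit.  */
+C   static uint64_t sub_step(uint64_t u, uint64_t v, uint64_t *cy)
+C   {
+C       uint64_t r = u - v - *cy;
+C       *cy = ((~u & v) | ((~u | v) & r)) >> 63;
+C       return r;
+C   }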
+
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2')		dnl  A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4')	dnl  A quasi nop running in the FM pipe
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0
+	nop
+	b,a	L(com)
+EPILOGUE()
+
+PROLOGUE(mpn_sub_n)
+	save	%sp,-160,%sp
+
+	fitod	%f0,%f0		C make sure f0 contains small, quiet number
+	subcc	n,4,%g0
+	bl,pn	%xcc,.Loop0
+	mov	0,cy
+L(com):
+	ldx	[up+0],u0
+	ldx	[vp+0],v0
+	add	up,32,up
+	ldx	[up-24],u1
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	ldx	[up-16],u2
+	ldx	[vp-16],v2
+	ldx	[up-8],u3
+	ldx	[vp-8],v3
+	subcc	n,8,n
+	sub	u0,v0,%g1	C main sub
+	sub	%g1,cy,%g5	C carry sub
+	orn	u0,v0,%g2
+	bl,pn	%xcc,.Lend4567
+	fanop
+	b,a	.Loop
+
+	.align	16
+C START MAIN LOOP
+.Loop:	orn	%g5,%g2,%g2
+	andn	u0,v0,%g3
+	ldx	[up+0],u0
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+0],v0
+	add	up,32,up
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u1,v1,%g1
+	stx	%g5,[rp+0]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u1,v1,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u1,v1,%g3
+	ldx	[up-24],u1
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp+8],v1
+	add	vp,32,vp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g5,[rp+8]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u2,v2,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u2,v2,%g3
+	ldx	[up-16],u2
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	ldx	[vp-16],v2
+	add	rp,32,rp
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g5,[rp-16]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u3,v3,%g2
+	fmnop
+	fanop
+C --
+	orn	%g5,%g2,%g2
+	andn	u3,v3,%g3
+	ldx	[up-8],u3
+	fanop
+C --
+	andn	%g2,%g3,%g2
+	subcc	n,4,n
+	ldx	[vp-8],v3
+	fanop
+C --
+	srlx	%g2,63,cy
+	sub	u0,v0,%g1
+	stx	%g5,[rp-8]
+	fanop
+C --
+	sub	%g1,cy,%g5
+	orn	u0,v0,%g2
+	bge,pt	%xcc,.Loop
+	fanop
+C END MAIN LOOP
+.Lend4567:
+	orn	%g5,%g2,%g2
+	andn	u0,v0,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	sub	u1,v1,%g1
+	stx	%g5,[rp+0]
+	sub	%g1,cy,%g5
+	orn	u1,v1,%g2
+	orn	%g5,%g2,%g2
+	andn	u1,v1,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	sub	u2,v2,%g1
+	stx	%g5,[rp+8]
+	sub	%g1,cy,%g5
+	orn	u2,v2,%g2
+	orn	%g5,%g2,%g2
+	andn	u2,v2,%g3
+	andn	%g2,%g3,%g2
+	add	rp,32,rp
+	srlx	%g2,63,cy
+	sub	u3,v3,%g1
+	stx	%g5,[rp-16]
+	sub	%g1,cy,%g5
+	orn	u3,v3,%g2
+	orn	%g5,%g2,%g2
+	andn	u3,v3,%g3
+	andn	%g2,%g3,%g2
+	srlx	%g2,63,cy
+	stx	%g5,[rp-8]
+
+	addcc	n,4,n
+	bz,pn	%xcc,.Lret
+	fanop
+
+.Loop0:	ldx	[up],u0
+	add	up,8,up
+	ldx	[vp],v0
+	add	vp,8,vp
+	add	rp,8,rp
+	subcc	n,1,n
+	sub	u0,v0,%g1
+	orn	u0,v0,%g2
+	sub	%g1,cy,%g5
+	andn	u0,v0,%g3
+	orn	%g5,%g2,%g2
+	stx	%g5,[rp-8]
+	andn	%g2,%g3,%g2
+	bnz,pt	%xcc,.Loop0
+	srlx	%g2,63,cy
+
+.Lret:	mov	cy,%i0
+	ret
+	restore
+EPILOGUE(mpn_sub_n)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc1234/submul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparc1234/submul_1.asm
new file mode 100644
index 0000000..0bdb566
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc1234/submul_1.asm
@@ -0,0 +1,68 @@
+dnl  SPARC v9 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 2001-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC 1&2:     18
+C UltraSPARC 3:	      23
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+C v	i3
+
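+C Implemented by composition: n limbs of 16-byte-aligned scratch are carved
+C off the stack (%sp + 2223 = the 2047-byte stack bias plus this frame's
+C 176-byte register save area), mpn_mul_1 fills scratch[] = up[] * v, and
+C mpn_sub_n then subtracts the scratch from rp[] in place.  The restore in
+C the delay slot adds mpn_mul_1's carry-out and mpn_sub_n's borrow to form
+C the return value.
+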
+ASM_START()
+	REGISTER(%g2,#scratch)
+
+PROLOGUE(mpn_submul_1)
+	save	%sp,-176,%sp
+
+	sllx	%i2, 3, %g2
+	or	%g0, %i1, %o1
+	add	%g2, 15, %o0
+	or	%g0, %i2, %o2
+	and	%o0, -16, %o0
+	sub	%sp, %o0, %sp
+	add	%sp, 2223, %o0
+	or	%g0, %o0, %l0
+	call	mpn_mul_1
+	or	%g0, %i3, %o3
+	or	%g0, %o0, %l1		C preserve carry value from mpn_mul_1
+	or	%g0, %i0, %o0
+	or	%g0, %i0, %o1
+	or	%g0, %l0, %o2
+	call	mpn_sub_n
+	or	%g0, %i2, %o3
+	ret
+	restore	%l1, %o0, %o0		C sum carry values
+EPILOGUE(mpn_submul_1)
diff --git a/third_party/gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h b/third_party/gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h
new file mode 100644
index 0000000..c88e680
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparc34/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* ultrasparc3/4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010, 2014, 2015 Free
+Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1593 MHz ultrasparc3 running Solaris 10 (swift.nada.kth.se) */
+/* FFT tuning limit = 100 M */
+/* Generated by tuneup.c, 2015-10-09, gcc 3.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        22
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     29
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD            1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               142
+#define MUL_TOOM6H_THRESHOLD               165
+#define MUL_TOOM8H_THRESHOLD               278
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      50
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      67
+
+#define SQR_BASECASE_THRESHOLD               7
+#define SQR_TOOM2_THRESHOLD                 70
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                184
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                339
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define MUL_FFT_MODF_THRESHOLD             212  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    212, 5}, {     13, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     23, 8}, {     47, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     63, 8}, {    127, 7}, {    255, 9}, \
+    {     67,10}, {     39, 9}, {     79, 8}, {    159, 7}, \
+    {    319, 9}, {     83,10}, {     47, 9}, {     95, 8}, \
+    {    191, 7}, {    383,10}, {     55,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 7}, {    511,10}, \
+    {     71, 9}, {    143, 8}, {    287,10}, {     79, 9}, \
+    {    159, 8}, {    319, 9}, {    175, 8}, {    351,11}, \
+    {     47,10}, {     95, 9}, {    191, 8}, {    383, 7}, \
+    {    767,10}, {    103,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703,11}, {     95,10}, {    207, 9}, {    415,11}, \
+    {    111,10}, {    223, 9}, {    479,12}, {     63,11}, \
+    {    127,10}, {    255,11}, {    143,10}, {    287, 9}, \
+    {    575,10}, {    319, 9}, {    639,11}, {    175,10}, \
+    {    351,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,11}, {    223,10}, {    447,13}, {     63,12}, \
+    {    127,11}, {    287,10}, {    575,11}, {    319,10}, \
+    {    703,12}, {    191,11}, {    383,12}, {    223,11}, \
+    {    447,13}, {    127,12}, {    287,11}, {    575,12}, \
+    {    351,13}, {    191,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    575,13}, {    319,12}, {    703,13}, \
+    {    383,12}, {    767,13}, {    447,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1023,13}, {    575,12}, \
+    {   1151,13}, {    703,14}, {    383,13}, {    831,12}, \
+    {   1663,13}, {    895,15}, {    255,14}, {    511,13}, \
+    {   1151,14}, {    639,13}, {   1407,12}, {   2815,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1791,15}, \
+    {    511,14}, {   1023,13}, {   2047,14}, {   1151,13}, \
+    {   2303,14}, {   1407,13}, {   2815,15}, {    767,14}, \
+    {   1791,16}, {    511,15}, {   1023,14}, {   2303,15}, \
+    {   1279,14}, {   2815,15}, {   1535,14}, {   3199,15}, \
+    {   1791,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2815,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 171
+#define MUL_FFT_THRESHOLD                 2240
+
+#define SQR_FFT_MODF_THRESHOLD             244  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    244, 5}, {      8, 4}, {     17, 5}, {     15, 6}, \
+    {      8, 5}, {     17, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     17, 8}, {      9, 7}, {     20, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     31, 9}, {     19, 8}, {     39, 9}, \
+    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     47,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     79, 8}, {    159,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     63, 9}, \
+    {    127, 8}, {    255,10}, {     71, 9}, {    143, 8}, \
+    {    287, 7}, {    575,10}, {     79, 9}, {    159,11}, \
+    {     47, 9}, {    191, 8}, {    383, 7}, {    767, 9}, \
+    {    207,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    511,10}, {    135, 9}, {    271,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175, 9}, {    351, 8}, \
+    {    703, 7}, {   1407,11}, {     95,10}, {    191, 9}, \
+    {    383, 8}, {    767,10}, {    207, 9}, {    415,10}, \
+    {    223, 9}, {    447,12}, {     63,11}, {    127,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575, 8}, \
+    {   1151,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351, 9}, {    703, 8}, {   1407, 7}, {   2815,11}, \
+    {    207,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,13}, {     63,11}, {    271,10}, \
+    {    543,11}, {    287,12}, {    159,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    415,10}, {    831,12}, \
+    {    223,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,12}, {    287,11}, {    607,12}, \
+    {    319,11}, {    639,12}, {    415,11}, {    895,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1087,12}, {    575,11}, {   1151,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    703,10}, {   2815,12}, \
+    {    831,11}, {   1663,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    895,15}, {    255,14}, {    511,13}, {   1215,14}, \
+    {    639,13}, {   1279,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,14}, {   1407,15}, \
+    {    767,14}, {   1791,16}, {    511,15}, {   1023,14}, \
+    {   2303,15}, {   1279,14}, {   2815,15}, {   1535,14}, \
+    {   3199,15}, {   1791,16}, {   1023,15}, {   2047,14}, \
+    {   4351,15}, {   2303,14}, {   4863,15}, {   2815,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 184
+#define SQR_FFT_THRESHOLD                 1728
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  29
+#define MULLO_MUL_N_THRESHOLD             4392
+#define SQRLO_BASECASE_THRESHOLD             2
+#define SQRLO_DC_THRESHOLD                  63
+#define SQRLO_SQR_THRESHOLD               3176
+
+#define DC_DIV_QR_THRESHOLD                 16
+#define DC_DIVAPPR_Q_THRESHOLD              64
+#define DC_BDIV_QR_THRESHOLD                30
+#define DC_BDIV_Q_THRESHOLD                 86
+
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD                17
+#define INV_APPR_THRESHOLD                  15
+
+#define BINV_NEWTON_THRESHOLD              109
+#define REDC_1_TO_REDC_2_THRESHOLD           0  /* always */
+#define REDC_2_TO_REDC_N_THRESHOLD         117
+
+#define MU_DIV_QR_THRESHOLD                618
+#define MU_DIVAPPR_Q_THRESHOLD             618
+#define MUPI_DIV_QR_THRESHOLD                0  /* always */
+#define MU_BDIV_QR_THRESHOLD               680
+#define MU_BDIV_Q_THRESHOLD                807
+
+#define POWM_SEC_TABLE  3,22,102,579,1555
+
+#define GET_STR_DC_THRESHOLD                20
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1042
+
+#define FAC_DSC_THRESHOLD                  462
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         12
+#define HGCD_THRESHOLD                      45
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             1094
+#define GCD_DC_THRESHOLD                   126
+#define GCDEXT_DC_THRESHOLD                132
+#define JACOBI_BASE_METHOD                   4
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/add_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/add_n.asm
new file mode 100644
index 0000000..954c7f6
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/add_n.asm
@@ -0,0 +1,68 @@
+dnl  SPARC v9 mpn_add_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:	 ?
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n',  `%o3')
+define(`cy', `%o4')
+
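+C The v9 add-with-carry forms (addc/addccc) use only the 32-bit carry
+C icc.c, so the loop keeps the limb carry there: addccc adds the full
+C limbs with carry-in, and a second addccc over the high 32-bit halves
+C (result discarded into %g0) regenerates the true 64-bit carry-out in
+C icc.c for the next iteration.  The final addc materializes it as the
+C return value.
+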
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+	b,a	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	mov	0, cy
+L(ent):	cmp	%g0, cy
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+	add	n, -1, n
+	srlx	%o4, 32, %g1
+	srlx	%o5, 32, %g2
+	addccc	%o4, %o5, %g3
+	addccc	%g1, %g2, %g0
+	brgz	n, L(top)
+	 stx	%g3, [rp-8]
+
+	retl
+	addc	%g0, %g0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm
new file mode 100644
index 0000000..3134797
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/addlsh1_n.asm
@@ -0,0 +1,41 @@
+dnl  SPARC v9 mpn_addlsh1_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,             1)
+define(RSH,             63)
+
+define(func, mpn_addlsh1_n)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n)
+
+include_mpn(`sparc64/ultrasparct1/addlshC_n.asm')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm
new file mode 100644
index 0000000..ee1afd0
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/addlsh2_n.asm
@@ -0,0 +1,41 @@
+dnl  SPARC v9 mpn_addlsh2_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,             2)
+define(RSH,             62)
+
+define(func, mpn_addlsh2_n)
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n)
+
+include_mpn(`sparc64/ultrasparct1/addlshC_n.asm')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm
new file mode 100644
index 0000000..5be9a0d
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/addlshC_n.asm
@@ -0,0 +1,69 @@
+dnl  SPARC v9 mpn_addlshC_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C		   cycles/limb
+C UltraSPARC T1:	21
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n',  `%o3')
+define(`cy', `%o4')
+
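+C Computes rp[] = up[] + (vp[] << LSH).  %g5 holds the RSH bits shifted out
+C of the previous v limb, so each shifted operand is (v << LSH) | carry_bits.
+C The 64-bit add carry lives in icc.c via the same high-half addccc trick
+C as in add_n; the final addc folds it into the last shift-out to form the
+C return value.
+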
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	mov	0, cy
+	mov	0, %g5
+	cmp	%g0, cy
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+
+	sllx	%o5, LSH, %g4
+	add	n, -1, n
+	or	%g5, %g4, %g4
+	srlx	%o5, RSH, %g5
+
+	srlx	%o4, 32, %g1
+	srlx	%g4, 32, %g2
+	addccc	%o4, %g4, %g3
+	addccc	%g1, %g2, %g0
+	brgz	n, L(top)
+	 stx	%g3, [rp-8]
+
+	retl
+	addc	%g5, %g0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/addmul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/addmul_1.asm
new file mode 100644
index 0000000..29dba96
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/addmul_1.asm
@@ -0,0 +1,86 @@
+dnl  SPARC v9 mpn_addmul_1 for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:	74
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`v0', `%i3')
+
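+C mulx yields only the low 64 bits of a 64x64 product, so v0 is split into
+C 32-bit halves (low in %i3, high in %o4) and each u limb likewise; four
+C 32x32->64 mulx products are combined into the full 128-bit u*v0.  Carries
+C out of the 64-bit adds are caught with movlu on %xcc, which selects a
+C pre-incremented alternative (%o2 holds 2^32 for bumping the high product
+C when the middle sum carries).
+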
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_addmul_1)
+	save	%sp, -176, %sp
+	mov	1, %o2
+	mov	%i0, %g2
+	srlx	%i3, 32, %o4
+	sllx	%o2, 32, %o2
+	srl	%i3, 0, %i3
+	mov	0, %g3
+	mov	0, %i0
+
+L(top):	ldx	[%i1+%g3], %g1
+	srl	%g1, 0, %g4
+	mulx	%g4, %i3, %o5
+	srlx	%g1, 32, %g1
+	mulx	%g1, %i3, %g5
+	mulx	%g4, %o4, %g4
+	mulx	%g1, %o4, %g1
+	srlx	%o5, 32, %o1
+	add	%g5, %o1, %o1
+	addcc	%o1, %g4, %g4
+	srl	%o5, 0, %o0
+	ldx	[%g2+%g3], %o5
+	sllx	%g4, 32, %o1
+	add	%g1, %o2, %l1
+	movlu	%xcc, %l1, %g1
+	add	%o1, %o0, %l0
+	addcc	%l0, %i0, %g5
+	srlx	%g4, 32, %i0
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0
+	addcc	%o5, %g5, %g5
+	stx	%g5, [%g2+%g3]
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0
+	add	%i2, -1, %i2
+	add	%i0, %g1, %i0
+	brnz,pt	%i2, L(top)
+	 add	%g3, 8, %g3
+	return	%i7+8
+	 nop
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h b/third_party/gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h
new file mode 100644
index 0000000..99db78a
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/gmp-mparam.h
@@ -0,0 +1,154 @@
+/* Sparc64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2006, 2008-2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 1000 MHz ultrasparc t1 running GNU/Linux */
+
+#define DIVREM_1_NORM_THRESHOLD              0  /* always */
+#define DIVREM_1_UNNORM_THRESHOLD            0  /* always */
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         13
+#define MOD_1U_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     34
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                 8
+#define MUL_TOOM33_THRESHOLD                50
+#define MUL_TOOM44_THRESHOLD                99
+#define MUL_TOOM6H_THRESHOLD               125
+#define MUL_TOOM8H_THRESHOLD               187
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      65
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      77
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      50
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      34
+
+#define SQR_BASECASE_THRESHOLD               0  /* always */
+#define SQR_TOOM2_THRESHOLD                 14
+#define SQR_TOOM3_THRESHOLD                 57
+#define SQR_TOOM4_THRESHOLD                133
+#define SQR_TOOM6_THRESHOLD                156
+#define SQR_TOOM8_THRESHOLD                260
+
+#define MULMID_TOOM42_THRESHOLD             12
+
+#define MULMOD_BNM1_THRESHOLD                7
+#define SQRMOD_BNM1_THRESHOLD                7
+
+#define MUL_FFT_MODF_THRESHOLD             176  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    176, 5}, {      7, 6}, {      4, 5}, {      9, 6}, \
+    {      5, 5}, {     11, 6}, {     11, 7}, {      6, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      9, 8}, \
+    {      5, 7}, {     13, 8}, {      7, 7}, {     15, 6}, \
+    {     32, 7}, {     24, 8}, {     21, 9}, {     11, 8}, \
+    {     23,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     23,10}, {     15, 9}, \
+    {     43,10}, {     23,11}, {     15,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     67,10}, {     39, 9}, \
+    {     79, 8}, {    159,10}, {     47, 9}, {     95,11}, \
+    {   2048,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 53
+#define MUL_FFT_THRESHOLD                 1728
+
+
+#define SQR_FFT_MODF_THRESHOLD             148  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    148, 5}, {      7, 6}, {      4, 5}, {      9, 6}, \
+    {      5, 5}, {     11, 6}, {     11, 7}, {      6, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {     13, 8}, \
+    {      7, 7}, {     16, 8}, {      9, 6}, {     38, 7}, \
+    {     20, 8}, {     11, 7}, {     24, 8}, {     13, 9}, \
+    {      7, 7}, {     30, 8}, {     19, 9}, {     11, 8}, \
+    {     25,10}, {      7, 9}, {     15, 8}, {     31, 9}, \
+    {     19, 8}, {     39, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47, 8}, {     95, 9}, \
+    {     51,11}, {     15,10}, {     31, 8}, {    127,10}, \
+    {     39, 9}, {     79, 8}, {    159,10}, {     47, 9}, \
+    {     95,11}, {   2048,12}, {   4096,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 58
+#define SQR_FFT_THRESHOLD                 1344
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  28
+#define MULLO_MUL_N_THRESHOLD             3176
+
+#define DC_DIV_QR_THRESHOLD                 27
+#define DC_DIVAPPR_Q_THRESHOLD             106
+#define DC_BDIV_QR_THRESHOLD                27
+#define DC_BDIV_Q_THRESHOLD                 62
+
+#define INV_MULMOD_BNM1_THRESHOLD           14
+#define INV_NEWTON_THRESHOLD               163
+#define INV_APPR_THRESHOLD                 117
+
+#define BINV_NEWTON_THRESHOLD              166
+#define REDC_1_TO_REDC_N_THRESHOLD          31
+
+#define MU_DIV_QR_THRESHOLD                734
+#define MU_DIVAPPR_Q_THRESHOLD             748
+#define MUPI_DIV_QR_THRESHOLD               67
+#define MU_BDIV_QR_THRESHOLD               562
+#define MU_BDIV_Q_THRESHOLD                734
+
+#define POWM_SEC_TABLE  4,29,188,643,2741
+
+#define MATRIX22_STRASSEN_THRESHOLD         11
+#define HGCD_THRESHOLD                      58
+#define HGCD_APPR_THRESHOLD                 55
+#define HGCD_REDUCE_THRESHOLD              637
+#define GCD_DC_THRESHOLD                   186
+#define GCDEXT_DC_THRESHOLD                140
+#define JACOBI_BASE_METHOD                   3
+
+#define GET_STR_DC_THRESHOLD                20
+#define GET_STR_PRECOMPUTE_THRESHOLD        33
+#define SET_STR_DC_THRESHOLD               268
+#define SET_STR_PRECOMPUTE_THRESHOLD       960
+
+#define FAC_DSC_THRESHOLD                  268
+#define FAC_ODD_THRESHOLD                    0  /* always */
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/mul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/mul_1.asm
new file mode 100644
index 0000000..1fea2a1
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/mul_1.asm
@@ -0,0 +1,82 @@
+dnl  SPARC v9 mpn_mul_1 for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:	68
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`v0', `%i3')
+
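+C Same 32x32 cross-product scheme as addmul_1 in this directory, minus the
+C read-modify-write of the rp limb.
+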
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_mul_1)
+	save	%sp, -176, %sp
+	mov	1, %o2
+	mov	%i0, %g2
+	srlx	%i3, 32, %o4
+	sllx	%o2, 32, %o2
+	srl	%i3, 0, %i3
+	mov	0, %g3
+	mov	0, %i0
+
+L(top):	ldx	[%i1+%g3], %g1
+	srl	%g1, 0, %g4
+	mulx	%g4, %i3, %o5
+	srlx	%g1, 32, %g1
+	mulx	%g1, %i3, %g5
+	mulx	%g4, %o4, %g4
+	mulx	%g1, %o4, %g1
+	srlx	%o5, 32, %o1
+	add	%g5, %o1, %o1
+	addcc	%o1, %g4, %g4
+	srl	%o5, 0, %o0
+	sllx	%g4, 32, %o1
+	add	%g1, %o2, %l1
+	movlu	%xcc, %l1, %g1
+	add	%o1, %o0, %l0
+	addcc	%l0, %i0, %g5
+	srlx	%g4, 32, %i0
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0
+	stx	%g5, [%g2+%g3]
+	add	%i2, -1, %i2
+	add	%i0, %g1, %i0
+	brnz,pt	%i2, L(top)
+	 add	%g3, 8, %g3
+	return	%i7+8
+	 nop
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm
new file mode 100644
index 0000000..51bd4ab
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/rsblsh1_n.asm
@@ -0,0 +1,41 @@
+dnl  SPARC v9 mpn_rsblsh1_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,             1)
+define(RSH,             63)
+
+define(func, mpn_rsblsh1_n)
+
+MULFUNC_PROLOGUE(mpn_rsblsh1_n)
+
+include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm
new file mode 100644
index 0000000..f0d208e
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/rsblsh2_n.asm
@@ -0,0 +1,41 @@
+dnl  SPARC v9 mpn_rsblsh2_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,             2)
+define(RSH,             62)
+
+define(func, mpn_rsblsh2_n)
+
+MULFUNC_PROLOGUE(mpn_rsblsh2_n)
+
+include_mpn(`sparc64/ultrasparct1/rsblshC_n.asm')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm
new file mode 100644
index 0000000..7c03e9f
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/rsblshC_n.asm
@@ -0,0 +1,69 @@
+dnl  SPARC v9 mpn_rsblshC_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C		   cycles/limb
+C UltraSPARC T1:	21
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n',  `%o3')
+define(`cy', `%o4')
+
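+C Like addlshC_n but reversed: rp[] = (vp[] << LSH) - up[] via subccc, with
+C the final subc subtracting the propagated borrow from the last shift-out.
+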
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	mov	0, cy
+	mov	0, %g5
+	cmp	%g0, cy
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+
+	sllx	%o5, LSH, %g4
+	add	n, -1, n
+	or	%g5, %g4, %g4
+	srlx	%o5, RSH, %g5
+
+	srlx	%o4, 32, %g1
+	srlx	%g4, 32, %g2
+	subccc	%g4, %o4, %g3
+	subccc	%g2, %g1, %g0
+	brgz	n, L(top)
+	 stx	%g3, [rp-8]
+
+	retl
+	subc	%g5, %g0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/sub_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/sub_n.asm
new file mode 100644
index 0000000..c2af89f
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/sub_n.asm
@@ -0,0 +1,68 @@
+dnl  SPARC v9 mpn_sub_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:	 ?
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n',  `%o3')
+define(`cy', `%o4')
+
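+C Same structure as add_n in this directory, with the borrow chained
+C through icc.c: subccc on the full limbs, then subccc on the high 32-bit
+C halves to regenerate the 64-bit borrow; the final addc returns it.
+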
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+	b,a	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	mov	0, cy
+L(ent):	cmp	%g0, cy
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+	add	n, -1, n
+	srlx	%o4, 32, %g1
+	srlx	%o5, 32, %g2
+	subccc	%o4, %o5, %g3
+	subccc	%g1, %g2, %g0
+	brgz	n, L(top)
+	 stx	%g3, [rp-8]
+
+	retl
+	addc	%g0, %g0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm
new file mode 100644
index 0000000..8c8fa80
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/sublsh1_n.asm
@@ -0,0 +1,41 @@
+dnl  SPARC v9 mpn_sublsh1_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,             1)
+define(RSH,             63)
+
+define(func, mpn_sublsh1_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n)
+
+include_mpn(`sparc64/ultrasparct1/sublshC_n.asm')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm
new file mode 100644
index 0000000..2fd5eee
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/sublsh2_n.asm
@@ -0,0 +1,41 @@
+dnl  SPARC v9 mpn_sublsh2_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(LSH,             2)
+define(RSH,             62)
+
+define(func, mpn_sublsh2_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n)
+
+include_mpn(`sparc64/ultrasparct1/sublshC_n.asm')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm
new file mode 100644
index 0000000..01eafef
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/sublshC_n.asm
@@ -0,0 +1,69 @@
+dnl  SPARC v9 mpn_sublshC_n for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C		   cycles/limb
+C UltraSPARC T1:	21
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%o0')
+define(`up', `%o1')
+define(`vp', `%o2')
+define(`n',  `%o3')
+define(`cy', `%o4')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	mov	0, cy
+	mov	0, %g5
+	cmp	%g0, cy
+L(top):	ldx	[up+0], %o4
+	add	up, 8, up
+	ldx	[vp+0], %o5
+	add	vp, 8, vp
+	add	rp, 8, rp
+
+	sllx	%o5, LSH, %g4
+	add	n, -1, n
+	or	%g5, %g4, %g4
+	srlx	%o5, RSH, %g5
+
+	srlx	%o4, 32, %g1
+	srlx	%g4, 32, %g2
+	subccc	%o4, %g4, %g3
+	subccc	%g1, %g2, %g0
+	brgz	n, L(top)
+	 stx	%g3, [rp-8]
+
+	retl
+	addc	%g5, %g0, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct1/submul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct1/submul_1.asm
new file mode 100644
index 0000000..4f553a8
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct1/submul_1.asm
@@ -0,0 +1,86 @@
+dnl  SPARC v9 mpn_submul_1 for T1/T2.
+
+dnl  Copyright 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T1:	74
+C UltraSPARC T2:	 ?
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`v0', `%i3')
+
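+C Same 32x32 cross-product scheme as addmul_1 in this directory, but the
+C product is subtracted from the rp limb (subcc) and the borrow folded
+C into the carry limb via movlu.
+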
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_submul_1)
+	save	%sp, -176, %sp
+	mov	1, %o2
+	mov	%i0, %g2
+	srlx	%i3, 32, %o4
+	sllx	%o2, 32, %o2
+	srl	%i3, 0, %i3
+	mov	0, %g3
+	mov	0, %i0
+
+L(top):	ldx	[%i1+%g3], %g1
+	srl	%g1, 0, %g4
+	mulx	%g4, %i3, %o5
+	srlx	%g1, 32, %g1
+	mulx	%g1, %i3, %g5
+	mulx	%g4, %o4, %g4
+	mulx	%g1, %o4, %g1
+	srlx	%o5, 32, %o1
+	add	%g5, %o1, %o1
+	addcc	%o1, %g4, %g4
+	srl	%o5, 0, %o0
+	ldx	[%g2+%g3], %o5
+	sllx	%g4, 32, %o1
+	add	%g1, %o2, %l1
+	movlu	%xcc, %l1, %g1
+	add	%o1, %o0, %l0
+	addcc	%l0, %i0, %g5
+	srlx	%g4, 32, %i0
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0
+	subcc	%o5, %g5, %g5
+	stx	%g5, [%g2+%g3]
+	add	%i0, 1, %g4
+	movlu	%xcc, %g4, %i0
+	add	%i2, -1, %i2
+	add	%i0, %g1, %i0
+	brnz,pt	%i2, L(top)
+	 add	%g3, 8, %g3
+	return	%i7+8
+	 nop
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/add_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/add_n.asm
new file mode 100644
index 0000000..0170746
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/add_n.asm
@@ -0,0 +1,126 @@
+dnl  SPARC v9 mpn_add_n for T3/T4.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	 8
+C UltraSPARC T4:	 3
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n',  `%i3')
+define(`cy', `%i4')
+
+define(`u0_off', `%l2')
+define(`u1_off', `%l3')
+define(`loop_n', `%l6')
+define(`tmp', `%l7')
+
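+C Two-way unrolled and indexed off the vector ends: u0_off/u1_off point at
+C the last up limbs and loop_n is a negative byte offset that climbs toward
+C zero, so the loop branch can test the index register directly.  addxccc
+C and addxc are config.m4 macros providing a true 64-bit carry chain; on T4
+C they are assumed to map to single VIS3 instructions, with equivalent
+C multi-instruction sequences elsewhere.
+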
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_nc)
+	save	%sp, -176, %sp
+	b,a	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	save	%sp, -176, %sp
+
+	mov	0, cy
+L(ent):
+	subcc	n, 1, n
+	be	L(final_one)
+	 cmp	%g0, cy
+
+	ldx	[up + 0], %o4
+	sllx	n, 3, tmp
+
+	ldx	[vp + 0], %o5
+	add	up, tmp, u0_off
+
+	ldx	[up + 8], %g5
+	neg	tmp, loop_n
+
+	ldx	[vp + 8], %g1
+	add	u0_off, 8, u1_off
+
+	sub	loop_n, -(2 * 8), loop_n
+
+	brgez,pn loop_n, L(loop_tail)
+	 add	vp, (2 * 8), vp
+
+	b,a	L(top)
+	ALIGN(16)
+L(top):
+	addxccc(%o4, %o5, tmp)
+	ldx	[vp + 0], %o5
+
+	add	rp, (2 * 8), rp
+	ldx	[loop_n + u0_off], %o4
+
+	add	vp, (2 * 8), vp
+	stx	tmp, [rp - 16]
+
+	addxccc(%g1, %g5, tmp)
+	ldx	[vp - 8], %g1
+
+	ldx	[loop_n + u1_off], %g5
+	sub	loop_n, -(2 * 8), loop_n
+
+	brlz	loop_n, L(top)
+	 stx	tmp, [rp - 8]
+
+L(loop_tail):
+	addxccc(%o4, %o5, %g3)
+	add	loop_n, u0_off, up
+
+	addxccc(%g1, %g5, %g5)
+	stx	%g3, [rp + 0]
+
+	brgz,pt	loop_n, L(done)
+	 stx	%g5, [rp + 8]
+
+	add	rp, (2 * 8), rp
+L(final_one):
+	ldx	[up+0], %o4
+	ldx	[vp+0], %o5
+	addxccc(%o4, %o5, %g3)
+	stx	%g3, [rp+0]
+
+L(done):
+	addxc(%g0, %g0, %i0)
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/addmul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/addmul_1.asm
new file mode 100644
index 0000000..939811e
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/addmul_1.asm
@@ -0,0 +1,182 @@
+dnl  SPARC v9 mpn_addmul_1 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	26
+C UltraSPARC T4:	4.5
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`v0', `%i3')
+
+define(`u0',  `%l0')
+define(`u1',  `%l1')
+define(`u2',  `%l2')
+define(`u3',  `%l3')
+define(`r0',  `%l4')
+define(`r1',  `%l5')
+define(`r2',  `%l6')
+define(`r3',  `%l7')
+
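+C Four-way unrolled main loop; mulx/umulxhi form each 64x64->128 product
+C and %g5 is the carry limb between groups.  The entry paths (b01/b10/b11)
+C peel n mod 4 limbs so that L(top) always sees whole groups of four.
+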
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_addmul_1)
+	save	%sp, -176, %sp
+	ldx	[up+0], %g1
+
+	and	n, 3, %g3
+	brz	%g3, L(b0)
+	 addcc	%g0, %g0, %g5			C clear carry limb, flag
+	cmp	%g3, 2
+	bcs	%xcc, L(b01)
+	 nop
+	be	%xcc, L(b10)
+	 ldx	[up+8], %g5
+
+L(b11):	ldx	[up+16], u3
+	mulx	%g1, v0, %o2
+	umulxhi(%g1, v0, %o3)
+	ldx	[rp+0], r1
+	mulx	%g5, v0, %o4
+	ldx	[rp+8], r2
+	umulxhi(%g5, v0, %o5)
+	ldx	[rp+16], r3
+	mulx	u3, v0, %g4
+	umulxhi(u3, v0, %g5)
+	addcc	%o3, %o4, %o4
+	addxccc(%o5, %g4, %g4)
+	addxc(	%g0, %g5, %g5)
+	addcc	r1, %o2, r1
+	stx	r1, [rp+0]
+	addxccc(r2, %o4, r2)
+	stx	r2, [rp+8]
+	addxccc(r3, %g4, r3)
+	stx	r3, [rp+16]
+	add	n, -3, n
+	add	up, 24, up
+	brz	n, L(xit)
+	 add	rp, 24, rp
+	b	L(com)
+	 nop
+
+L(b10):	mulx	%g1, v0, %o4
+	ldx	[rp+0], r2
+	umulxhi(%g1, v0, %o5)
+	ldx	[rp+8], r3
+	mulx	%g5, v0, %g4
+	umulxhi(%g5, v0, %g5)
+	addcc	%o5, %g4, %g4
+	addxc(	%g0, %g5, %g5)
+	addcc	r2, %o4, r2
+	stx	r2, [rp+0]
+	addxccc(r3, %g4, r3)
+	stx	r3, [rp+8]
+	add	n, -2, n
+	add	up, 16, up
+	brz	n, L(xit)
+	 add	rp, 16, rp
+	b	L(com)
+	 nop
+
+L(b01):	ldx	[rp+0], r3
+	mulx	%g1, v0, %g4
+	umulxhi(%g1, v0, %g5)
+	addcc	r3, %g4, r3
+	stx	r3, [rp+0]
+	add	n, -1, n
+	add	up, 8, up
+	brz	n, L(xit)
+	 add	rp, 8, rp
+
+L(com):	ldx	[up+0], %g1
+L(b0):	ldx	[up+8], u1
+	ldx	[up+16], u2
+	ldx	[up+24], u3
+	mulx	%g1, v0, %o0
+	umulxhi(%g1, v0, %o1)
+	b	L(lo0)
+	 nop
+
+	ALIGN(16)
+L(top):	ldx	[up+0], u0
+	addxc(	%g0, %g5, %g5)		C propagate carry into carry limb
+	ldx	[up+8], u1
+	addcc	r0, %o0, r0
+	ldx	[up+16], u2
+	addxccc(r1, %o2, r1)
+	ldx	[up+24], u3
+	addxccc(r2, %o4, r2)
+	stx	r0, [rp-32]
+	addxccc(r3, %g4, r3)
+	stx	r1, [rp-24]
+	mulx	u0, v0, %o0
+	stx	r2, [rp-16]
+	umulxhi(u0, v0, %o1)
+	stx	r3, [rp-8]
+L(lo0):	mulx	u1, v0, %o2
+	ldx	[rp+0], r0
+	umulxhi(u1, v0, %o3)
+	ldx	[rp+8], r1
+	mulx	u2, v0, %o4
+	ldx	[rp+16], r2
+	umulxhi(u2, v0, %o5)
+	ldx	[rp+24], r3
+	mulx	u3, v0, %g4
+	addxccc(%g5, %o0, %o0)
+	umulxhi(u3, v0, %g5)
+	add	up, 32, up
+	addxccc(%o1, %o2, %o2)
+	add	rp, 32, rp
+	addxccc(%o3, %o4, %o4)
+	add	n, -4, n
+	addxccc(%o5, %g4, %g4)
+	brgz	n, L(top)
+	 nop
+
+	addxc(	%g0, %g5, %g5)
+	addcc	r0, %o0, r0
+	stx	r0, [rp-32]
+	addxccc(r1, %o2, r1)
+	stx	r1, [rp-24]
+	addxccc(r2, %o4, r2)
+	stx	r2, [rp-16]
+	addxccc(r3, %g4, r3)
+	stx	r3, [rp-8]
+L(xit):	addxc(	%g0, %g5, %i0)
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/aormul_2.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/aormul_2.asm
new file mode 100644
index 0000000..ccc6a44
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/aormul_2.asm
@@ -0,0 +1,228 @@
+dnl  SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		    cycles/limb      cycles/limb
+C		       mul_2           addmul_2
+C UltraSPARC T3:	22.5		 23.5
+C UltraSPARC T4:	 3.25		 3.75
+
+
+C The code is reasonably scheduled but also relies on OoO.  There was hope that
+C this could run at around 3.0 and 3.5 c/l respectively, on T4.  Two cycles per
+C iteration needs to be removed.
+C
+C We could almost use 2-way unrolling, but currently the wN registers live too
+C long.  By changing add x,w1,w1 to add x,w1,w0, i.e. migrating the values
+C downwards, 2-way unrolling should become possible.  With n-indexed
+C addressing it should run no slower.
+C
+C The rp loads to g1/g3 are very much over-scheduled.  Presumably, they could
+C be postponed a full way, and then just one register could be used.
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`vp', `%i3')
+
+define(`v0', `%o0')
+define(`v1', `%o1')
+
+define(`w0', `%o2')
+define(`w1', `%o3')
+define(`w2', `%o4')
+define(`w3', `%o5')
+
+ifdef(`OPERATION_mul_2',`
+      define(`AM2',      `')
+      define(`ADDX',	 `addcc`'$1')
+      define(`func',     `mpn_mul_2')
+')
+ifdef(`OPERATION_addmul_2',`
+      define(`AM2',      `$1')
+      define(`ADDX',	 `addxccc($1,$2,$3)')
+      define(`func',     `mpn_addmul_2')
+')
+
+
+MULFUNC_PROLOGUE(mpn_mul_2 mpn_addmul_2)
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	save	%sp, -176, %sp
+
+	ldx	[vp+0], v0		C load v0
+	and	n, 3, %g5
+	ldx	[vp+8], v1		C load v1
+	add	n, -6, n
+	ldx	[up+0], %g4
+	brz	%g5, L(b0)
+	 cmp	%g5, 2
+	bcs	L(b1)
+	 nop
+	be	L(b2)
+	 nop
+
+L(b3):
+AM2(`	ldx	[rp+0], %g1')
+	mulx	%g4, v0, w2
+	umulxhi(%g4, v0, w3)
+	ldx	[up+8], %i5
+	mulx	%g4, v1, %l3
+	umulxhi(%g4, v1, %l7)
+AM2(`	ldx	[rp+8], %g3')
+	add	up, -8, up
+	add	rp, -8, rp
+	b	L(lo3)
+	 mov	0, w0
+
+L(b2):
+AM2(`	ldx	[rp+0], %g3')
+	mulx	%g4, v0, w3
+	umulxhi(%g4, v0, w0)
+	ldx	[up+8], %i4
+	mulx	%g4, v1, %l1
+	umulxhi(%g4, v1, %l5)
+AM2(`	ldx	[rp+8], %g1')
+	add	rp, 16, rp
+	brlz	n, L(end)
+	 mov	0, w1
+	ba	L(top)
+	 add	up, 16, up
+
+L(b1):
+AM2(`	ldx	[rp+0], %g1')
+	mulx	%g4, v0, w0
+	umulxhi(%g4, v0, w1)
+	ldx	[up+8], %i5
+	mulx	%g4, v1, %l3
+	umulxhi(%g4, v1, %l7)
+AM2(`	ldx	[rp+8], %g3')
+	add	up, 8, up
+	add	rp, 8, rp
+	b	L(lo1)
+	 mov	0, w2
+
+L(b0):
+AM2(`	ldx	[rp+0], %g3')
+	mulx	%g4, v0, w1
+	umulxhi(%g4, v0, w2)
+	ldx	[up+8], %i4
+	mulx	%g4, v1, %l1
+	umulxhi(%g4, v1, %l5)
+AM2(`	ldx	[rp+8], %g1')
+	b	L(lo0)
+	 mov	0, w3
+
+	ALIGN(16)			C cycle
+L(top):	mulx	%i4, v0, %l2		C 0->5
+	umulxhi(%i4, v0, %l6)		C 0->5
+	ldx	[up+0], %i5		C 1->6
+AM2(`	addcc	w3, %g3, w3')		C 1
+	stx	w3, [rp-16]		C 2
+	ADDX(`	%l1, w0, w0')		C 2
+	addxccc(%l5, w1, w1)		C 3
+	mulx	%i4, v1, %l3		C 3->9
+	umulxhi(%i4, v1, %l7)		C 4->9
+AM2(`	ldx	[rp+0], %g3')		C 4
+	addcc	%l2, w0, w0		C 5
+	addxccc(%l6, w1, w1)		C 5
+	addxc(	%g0, %g0, w2)		C 6
+L(lo1):	mulx	%i5, v0, %l0		C 6
+	umulxhi(%i5, v0, %l4)		C 7
+	ldx	[up+8], %i4		C 7
+AM2(`	addcc	w0, %g1, w0')		C 8
+	stx	w0, [rp-8]		C 8
+	ADDX(`	%l3, w1, w1')		C 9
+	addxccc(%l7, w2, w2)		C 9
+	mulx	%i5, v1, %l1		C 10
+	umulxhi(%i5, v1, %l5)		C 10
+AM2(`	ldx	[rp+8], %g1')		C 11
+	addcc	%l0, w1, w1		C 11
+	addxccc(%l4, w2, w2)		C 12
+	addxc(	%g0, %g0, w3)		C 12
+L(lo0):	mulx	%i4, v0, %l2		C 13
+	umulxhi(%i4, v0, %l6)		C 13
+	ldx	[up+16], %i5		C 14
+AM2(`	addcc	w1, %g3, w1')		C 14
+	stx	w1, [rp+0]		C 15
+	ADDX(`	%l1, w2, w2')		C 15
+	addxccc(%l5, w3, w3)		C 16
+	mulx	%i4, v1, %l3		C 16
+	umulxhi(%i4, v1, %l7)		C 17
+AM2(`	ldx	[rp+16], %g3')		C 17
+	addcc	%l2, w2, w2		C 18
+	addxccc(%l6, w3, w3)		C 18
+	addxc(	%g0, %g0, w0)		C 19
+L(lo3):	mulx	%i5, v0, %l0		C 19
+	umulxhi(%i5, v0, %l4)		C 20
+	ldx	[up+24], %i4		C 20
+AM2(`	addcc	w2, %g1, w2')		C 21
+	stx	w2, [rp+8]		C 21
+	ADDX(`	%l3, w3, w3')		C 22
+	addxccc(%l7, w0, w0)		C 22
+	mulx	%i5, v1, %l1		C 23
+	umulxhi(%i5, v1, %l5)		C 23
+AM2(`	ldx	[rp+24], %g1')		C 24
+	addcc	%l0, w3, w3		C 24
+	addxccc(%l4, w0, w0)		C 25
+	addxc(	%g0, %g0, w1)		C 25
+	add	up, 32, up
+	add	rp, 32, rp
+	brgz	n, L(top)
+	 add	n, -4, n
+
+L(end):	mulx	%i4, v0, %l2
+	umulxhi(%i4, v0, %l6)
+AM2(`	addcc	w3, %g3, w3')
+	stx	w3, [rp-16]
+	ADDX(`	%l1, w0, w0')
+	addxccc(%l5, w1, w1)
+	mulx	%i4, v1, %l3
+	umulxhi(%i4, v1, %l7)
+	addcc	%l2, w0, w0
+	addxccc(%l6, w1, w1)
+	addxc(	%g0, %g0, w2)
+AM2(`	addcc	w0, %g1, w0')
+	stx	w0, [rp-8]
+	ADDX(`	%l3, w1, w1')
+	stx	w1, [rp+0]
+	addxc(%l7, w2, %i0)
+
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/aormul_4.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/aormul_4.asm
new file mode 100644
index 0000000..845f6d6
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/aormul_4.asm
@@ -0,0 +1,219 @@
+dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		    cycles/limb      cycles/limb
+C		       mul_4           addmul_4
+C UltraSPARC T3:	21.5		22.0
+C UltraSPARC T4:	 2.625		 2.75
+
+
+C The code is well-scheduled and relies on OoO very little.  There is hope that
+C this will run at around 2.5 and 2.75 c/l respectively, on T4.
+
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`vp', `%i3')
+
+define(`v0', `%g1')
+define(`v1', `%o7')
+define(`v2', `%g2')
+define(`v3', `%i3')
+
+define(`w0', `%o0')
+define(`w1', `%o1')
+define(`w2', `%o2')
+define(`w3', `%o3')
+define(`w4', `%o4')
+
+define(`r0', `%o5')
+
+define(`u0', `%i4')
+define(`u1', `%i5')
+
+define(`rp0', `rp')
+define(`rp1', `%g3')
+define(`rp2', `%g4')
+define(`up0', `up')
+define(`up1', `%g5')
+
+ifdef(`OPERATION_mul_4',`
+      define(`AM4',      `')
+      define(`ADDX',	 `addcc`'$1')
+      define(`func',     `mpn_mul_4')
+')
+ifdef(`OPERATION_addmul_4',`
+      define(`AM4',      `$1')
+      define(`ADDX',	 `addxccc($1,$2,$3)')
+      define(`func',     `mpn_addmul_4')
+')
+
+
+MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	save	%sp, -176, %sp
+
+	ldx	[up + 0], u1		C load up[0] early
+	andcc	n, 1, %g0		C is n odd?
+	ldx	[vp + 0], v0
+	sllx	n, 3, n
+	ldx	[vp + 8], v1
+	add	n, -28, n
+	ldx	[vp + 16], v2
+	add	rp, -16, rp
+	ldx	[vp + 24], v3
+	add	up, n, up0
+	add	rp, n, rp0
+	add	up0, 8, up1
+	add	rp0, 8, rp1
+	add	rp0, 16, rp2
+	mulx	u1, v0, %l0
+	mov	0, w0
+	mulx	u1, v1, %l1
+	mov	0, w1
+	mulx	u1, v2, %l2
+	mov	0, w2
+	mulx	u1, v3, %l3
+	mov	0, w3
+
+	be	L(evn)
+	 neg	n, n
+
+L(odd):	mov	u1, u0
+	ldx	[up1 + n], u1
+AM4(`	ldx	[rp2 + n], r0')
+	umulxhi(u0, v0, %l4)
+	umulxhi(u0, v1, %l5)
+	umulxhi(u0, v2, %l6)
+	umulxhi(u0, v3, %l7)
+	b	L(mid)
+	 add	n, 8, n
+
+L(evn):	ldx	[up1 + n], u0
+AM4(`	ldx	[rp2 + n], r0')
+	umulxhi(u1, v0, %l4)
+	umulxhi(u1, v1, %l5)
+	umulxhi(u1, v2, %l6)
+	umulxhi(u1, v3, %l7)
+	add	n, 16, n
+
+	ALIGN(16)
+L(top):	addcc	%l0, w0, w0
+	mulx	u0, v0, %l0	C w 0
+	addxccc(%l1, w1, w1)
+	mulx	u0, v1, %l1	C w 1
+	addxccc(%l2, w2, w2)
+	mulx	u0, v2, %l2	C w 2
+	addxccc(%l3, w3, w3)
+	mulx	u0, v3, %l3	C w 3
+	ldx	[up0 + n], u1
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp0 + n]
+	ADDX(`	%l4, w1, w0')
+	umulxhi(u0, v0, %l4)	C w 1
+AM4(`	ldx	[rp1 + n], r0')
+	addxccc(%l5, w2, w1)
+	umulxhi(u0, v1, %l5)	C w 2
+	addxccc(%l6, w3, w2)
+	umulxhi(u0, v2, %l6)	C w 3
+	addxc(	%l7, w4, w3)
+	umulxhi(u0, v3, %l7)	C w 4
+L(mid):	addcc	%l0, w0, w0
+	mulx	u1, v0, %l0	C w 1
+	addxccc(%l1, w1, w1)
+	mulx	u1, v1, %l1	C w 2
+	addxccc(%l2, w2, w2)
+	mulx	u1, v2, %l2	C w 3
+	addxccc(%l3, w3, w3)
+	mulx	u1, v3, %l3	C w 4
+	ldx	[up1 + n], u0
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp1 + n]
+	ADDX(`	%l4, w1, w0')
+	umulxhi(u1, v0, %l4)	C w 2
+AM4(`	ldx	[rp2 + n], r0')
+	addxccc(%l5, w2, w1)
+	umulxhi(u1, v1, %l5)	C w 3
+	addxccc(%l6, w3, w2)
+	umulxhi(u1, v2, %l6)	C w 4
+	addxc(	%l7, w4, w3)
+	umulxhi(u1, v3, %l7)	C w 5
+	brlz	n, L(top)
+	 add	n, 16, n
+
+L(end):	addcc	%l0, w0, w0
+	mulx	u0, v0, %l0
+	addxccc(%l1, w1, w1)
+	mulx	u0, v1, %l1
+	addxccc(%l2, w2, w2)
+	mulx	u0, v2, %l2
+	addxccc(%l3, w3, w3)
+	mulx	u0, v3, %l3
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp0 + n]
+	ADDX(`	%l4, w1, w0')
+	umulxhi(u0, v0, %l4)
+AM4(`	ldx	[rp1 + n], r0')
+	addxccc(%l5, w2, w1)
+	umulxhi(u0, v1, %l5)
+	addxccc(%l6, w3, w2)
+	umulxhi(u0, v2, %l6)
+	addxc(	%l7, w4, w3)
+	umulxhi(u0, v3, %l7)
+	addcc	%l0, w0, w0
+	addxccc(%l1, w1, w1)
+	addxccc(%l2, w2, w2)
+	addxccc(%l3, w3, w3)
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp1 + n]
+	ADDX(`	%l4, w1, w0')
+	addxccc(%l5, w2, w1)
+	addxccc(%l6, w3, w2)
+	stx	w0, [rp2 + n]
+	add	n, 16, n
+	stx	w1, [rp1 + n]
+	stx	w2, [rp2 + n]
+	addxc(	%l7, w4, %i0)
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm
new file mode 100644
index 0000000..1014b1b
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/aorslsh_n.asm
@@ -0,0 +1,147 @@
+dnl  SPARC v9 mpn_addlsh_n and mpn_sublsh_n for T3/T4/T5.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	11
+C UltraSPARC T4:	 4
+
+C For sublsh_n we combine the two shifted limbs using xnor, via the identity
+C (a xor not b) = (not (a xor b)), which equals (not (a or b)) when (a and b)
+C = 0, as it is in our usage.  This gives us the ones complement for free.
+C Unfortunately, the same trick will not work for rsblsh_n, which will instead
+C require a separate negation.
+C
+C FIXME: Add rsblsh_n to this file.
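
The identity is easy to machine-check. In a left-shift merge the two pieces occupy disjoint bit ranges, so or, xor, and plain addition coincide, and one xnor yields the complemented merge directly. A self-contained C check (any cnt in [1,63]):

    #include <assert.h>
    #include <stdint.h>

    /* Pieces of a left-shifted limb pair: lo << cnt has its low cnt bits
       clear, hi >> (64-cnt) has only its low cnt bits set, so a & b == 0. */
    void check_xnor_merge(uint64_t lo, uint64_t hi, unsigned cnt)
    {
        uint64_t a = lo << cnt;
        uint64_t b = hi >> (64 - cnt);
        assert((a & b) == 0);
        assert((a ^ ~b) == ~(a | b));    /* xnor == complement of the merge */
    }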
+
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n',  `%i3')
+define(`cnt',`%i4')
+
+define(`tnc',`%o5')
+
+ifdef(`OPERATION_addlsh_n',`
+  define(`INITCY', `subcc	%g0, 0, %g0')
+  define(`MERGE',  `or')
+  define(`func',   `mpn_addlsh_n')
+')
+ifdef(`OPERATION_sublsh_n',`
+  define(`INITCY', `subcc	%g0, 1, %g0')
+  define(`MERGE',  `xnor')
+  define(`func',   `mpn_sublsh_n')
+')
+
+define(`rp0',  `rp')
+define(`rp1',  `%o2')
+define(`up0',  `up')
+define(`up1',  `%o3')
+define(`vp0',  `vp')
+define(`vp1',  `%o4')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_sublsh_n)
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	save	%sp, -176, %sp
+	mov	64, tnc
+	sub	tnc, cnt, tnc
+
+	andcc	n, 1, %g0
+	sllx	n, 3, n
+	add	n, -16, n
+	add	up, n, up0
+	add	vp, n, vp0
+	add	rp, n, rp0
+	add	up0, 8, up1
+	add	vp0, 8, vp1
+	add	rp0, -8, rp1
+	add	rp0, -16, rp0
+	neg	n, n
+	be	L(evn)
+	 INITCY
+
+L(odd):	ldx	[vp0 + n], %l1
+	mov	0, %l2
+	ldx	[up0 + n], %l5
+	sllx	%l1, cnt, %g3
+	brgez	n, L(wd1)
+	 add	n, 8, n
+	ldx	[vp0 + n], %l0
+	b	L(lo1)
+	 sllx	%l1, cnt, %g3
+
+L(evn):	ldx	[vp0 + n], %l0
+	mov	0, %l3
+	ldx	[up0 + n], %l4
+	ldx	[vp1 + n], %l1
+	b	L(lo0)
+	 sllx	%l0, cnt, %g1
+
+L(top):	addxccc(%l6, %l4, %o0)
+	ldx	[vp0 + n], %l0
+	sllx	%l1, cnt, %g3
+	stx	%o0, [rp0 + n]
+L(lo1):	srlx	%l1, tnc, %l3
+	MERGE	%l2, %g3, %l7
+	ldx	[up0 + n], %l4
+	addxccc(%l7, %l5, %o1)
+	ldx	[vp1 + n], %l1
+	sllx	%l0, cnt, %g1
+	stx	%o1, [rp1 + n]
+L(lo0):	srlx	%l0, tnc, %l2
+	MERGE	%l3, %g1, %l6
+	ldx	[up1 + n], %l5
+	brlz,pt	n, L(top)
+	 add	n, 16, n
+
+	addxccc(%l6, %l4, %o0)
+	sllx	%l1, cnt, %g3
+	stx	%o0, [rp0 + n]
+L(wd1):	srlx	%l1, tnc, %l3
+	MERGE	%l2, %g3, %l7
+	addxccc(%l7, %l5, %o1)
+	stx	%o1, [rp1 + n]
+
+ifdef(`OPERATION_addlsh_n',
+`	addxc(	%l3, %g0, %i0)')
+ifdef(`OPERATION_sublsh_n',
+`	addxc(	%g0, %g0, %g1)
+	add	%g1, -1, %g1
+	sub	%l3, %g1, %i0')
+
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm
new file mode 100644
index 0000000..550860d
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/bdiv_dbm1c.asm
@@ -0,0 +1,147 @@
+dnl  SPARC T3/T4/T5 mpn_bdiv_dbm1c.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC T3:	25
+C UltraSPARC T4/T5:	 4
+
+C INPUT PARAMETERS
+define(`qp',  `%i0')
+define(`ap',  `%i1')
+define(`n',   `%i2')
+define(`bd',  `%i3')
+define(`h',   `%i4')
+
+define(`plo0',`%g4')  define(`plo1',`%g5')
+define(`phi0',`%l0')  define(`phi1',`%l1')
+define(`a0',  `%g1')  define(`a1',  `%g3')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_bdiv_dbm1c)
+	save	%sp, -176, %sp
+
+	and	n, 3, %g5
+	ldx	[ap + 0], %g2
+	add	n, -5, n
+	brz	%g5, L(b0)
+	 cmp	%g5, 2
+	bcs	%xcc, L(b1)
+	 nop
+	be	%xcc, L(b2)
+	 nop
+
+L(b3):	ldx	[ap + 8], a0
+	mulx	bd, %g2, plo1
+	umulxhi(bd, %g2, phi1)
+	ldx	[ap + 16], a1
+	add	qp, -24, qp
+	b	L(lo3)
+	 add	ap, -8, ap
+
+L(b2):	ldx	[ap + 8], a1
+	mulx	bd, %g2, plo0
+	umulxhi(bd, %g2, phi0)
+	brlz,pt	n, L(wd2)
+	 nop
+L(gt2):	ldx	[ap + 16], a0
+	add	ap, 16, ap
+	b	L(lo2)
+	 add	n, -1, n
+
+L(b1):	mulx	bd, %g2, plo1
+	umulxhi(bd, %g2, phi1)
+	brlz,pn	n, L(wd1)
+	 add	qp, -8, qp
+L(gt1):	ldx	[ap + 8], a0
+	ldx	[ap + 16], a1
+	b	L(lo1)
+	 add	ap, 8, ap
+
+L(b0):	ldx	[ap + 8], a1
+	mulx	bd, %g2, plo0
+	umulxhi(bd, %g2, phi0)
+	ldx	[ap + 16], a0
+	b	L(lo0)
+	 add	qp, -16, qp
+
+L(top):	ldx	[ap + 0], a0
+	sub	h, phi1, h
+L(lo2):	mulx	bd, a1, plo1
+	umulxhi(bd, a1, phi1)
+	subcc	h, plo0, h
+	addxc(	phi0, %g0, phi0)
+	stx	h, [qp + 0]
+	ldx	[ap + 8], a1
+	sub	h, phi0, h
+L(lo1):	mulx	bd, a0, plo0
+	umulxhi(bd, a0, phi0)
+	subcc	h, plo1, h
+	addxc(	phi1, %g0, phi1)
+	stx	h, [qp + 8]
+	ldx	[ap + 16], a0
+	sub	h, phi1, h
+L(lo0):	mulx	bd, a1, plo1
+	umulxhi(bd, a1, phi1)
+	subcc	h, plo0, h
+	addxc(	phi0, %g0, phi0)
+	stx	h, [qp + 16]
+	ldx	[ap + 24], a1
+	sub	h, phi0, h
+L(lo3):	mulx	bd, a0, plo0
+	umulxhi(bd, a0, phi0)
+	subcc	h, plo1, h
+	addxc(	phi1, %g0, phi1)
+	stx	h, [qp + 24]
+	add	ap, 32, ap
+	add	qp, 32, qp
+	brgz,pt	n, L(top)
+	 add	n, -4, n
+
+L(end):	sub	h, phi1, h
+L(wd2):	mulx	bd, a1, plo1
+	umulxhi(bd, a1, phi1)
+	subcc	h, plo0, h
+	addxc(	phi0, %g0, phi0)
+	stx	h, [qp + 0]
+	sub	h, phi0, h
+L(wd1):	subcc	h, plo1, h
+	addxc(	phi1, %g0, phi1)
+	stx	h, [qp + 8]
+	sub	h, phi1, %i0
+
+	ret
+	 restore
+EPILOGUE()
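
Per limb, the unrolled loop above forms the double-limb product ap[i]*bd, subtracts its low half from the running value h (the subcc), stores that as the quotient limb, then subtracts the high half plus the borrow (the addxc feeding the next sub). A C sketch of that step, with unsigned __int128 again standing in for the mulx/umulxhi pair:

    #include <stdint.h>
    #include <stddef.h>

    uint64_t bdiv_dbm1c_ref(uint64_t *qp, const uint64_t *ap, size_t n,
                            uint64_t bd, uint64_t h)
    {
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 p = (unsigned __int128)ap[i] * bd;
            uint64_t plo = (uint64_t)p, phi = (uint64_t)(p >> 64);
            uint64_t brw = h < plo;      /* borrow out of h - plo */
            h -= plo;
            qp[i] = h;                   /* quotient limb */
            h -= phi + brw;              /* fold in high half plus borrow */
        }
        return h;
    }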
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm
new file mode 100644
index 0000000..9847047
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/bdiv_q_1.asm
@@ -0,0 +1,137 @@
+dnl  SPARC T3/T4/T5 mpn_bdiv_q_1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC T3:	31
+C UltraSPARC T4/T5:	20-26  hits 20 early, then sharply drops
+
+C INPUT PARAMETERS
+define(`qp',  `%i0')
+define(`ap',  `%i1')
+define(`n',   `%i2')
+define(`d',   `%i3')
+define(`dinv',`%i4')
+define(`cnt', `%i5')
+
+define(`tnc', `%o2')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_bdiv_q_1)
+	save	%sp, -176, %sp
+	ldx	[ap], %o5
+	add	d, -1, %g1
+	andn	%g1, d, %g1
+	popc	%g1, cnt
+
+	srlx	d, cnt, d
+	srlx	d, 1, %g1
+	and	%g1, 127, %g1
+	LEA64(binvert_limb_table, g2, g4)
+	ldub	[%g2+%g1], %g1
+	add	%g1, %g1, %g2
+	mulx	%g1, %g1, %g1
+	mulx	%g1, d, %g1
+	sub	%g2, %g1, %g2
+	add	%g2, %g2, %g1
+	mulx	%g2, %g2, %g2
+	mulx	%g2, d, %g2
+	sub	%g1, %g2, %g1
+	add	%g1, %g1, %o7
+	mulx	%g1, %g1, %g1
+	mulx	%g1, d, %g1
+	add	n, -2, n
+	brz,pt	cnt, L(norm)
+	 sub	%o7, %g1, dinv
+
+	brlz,pt	n, L(edu)
+	 srlx	%o5, cnt, %o5
+	b	L(eee)
+	 mov	0, %g4
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	save	%sp, -176, %sp
+	ldx	[ap], %o5
+
+	brz,pt	cnt, L(norm)
+	 add	n, -2, n
+
+L(unorm):
+	brlz,pt	n, L(edu)
+	 srlx	%o5, cnt, %o5
+	mov	0, %g4
+L(eee):	sub	%g0, cnt, tnc
+
+L(tpu):	ldx	[ap+8], %g3
+	add	ap, 8, ap
+	sllx	%g3, tnc, %g5
+	or	%g5, %o5, %g5
+	srlx	%g3, cnt, %o5
+	subcc	%g5, %g4, %g4
+	mulx	%g4, dinv, %g1
+	stx	%g1, [qp]
+	add	qp, 8, qp
+	umulxhi(d, %g1, %g1)
+	addxc(	%g1, %g0, %g4)
+	brgz,pt	n, L(tpu)
+	 add	n, -1, n
+
+	sub	%o5, %g4, %o5
+L(edu):	mulx	%o5, dinv, %g1
+	return	%i7+8
+	 stx	%g1, [%o0]
+
+L(norm):
+	mulx	dinv, %o5, %g1
+	brlz,pt	n, L(edn)
+	 stx	%g1, [qp]
+	add	qp, 8, qp
+	addcc	%g0, 0, %g4
+
+L(tpn):	umulxhi(d, %g1, %g1)
+	ldx	[ap+8], %g5
+	add	ap, 8, ap
+	addxc(	%g1, %g0, %g1)
+	subcc	%g5, %g1, %g1
+	mulx	%g1, dinv, %g1
+	stx	%g1, [qp]
+	add	qp, 8, qp
+	brgz,pt	n, L(tpn)
+	 add	n, -1, n
+
+L(edn):	return	%i7+8
+	 nop
+EPILOGUE()
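
Two tricks in the mpn_bdiv_q_1 prologue above (repeated in dive_1.asm and mode1o.asm below) deserve unpacking. First, the add d,-1 / andn / popc run computes count_trailing_zeros: (d-1) & ~d sets exactly the trailing zero bits of d. Second, the ldub through binvert_limb_table followed by three mulx/sub rounds is Newton (Hensel) iteration for the inverse of d mod 2^64: from x with x*d == 1 (mod 2^k), the step x' = 2x - d*x^2 is correct mod 2^2k, so 8 table bits become 16, 32, then 64. A C sketch under those assumptions (table8 is a stand-in for GMP's internal binvert_limb_table; __builtin_popcountll is the GCC/Clang builtin):

    #include <stdint.h>

    extern const uint8_t table8[128];  /* table8[(d >> 1) & 127] * d == 1 mod 2^8 */

    uint64_t ctz64(uint64_t d)         /* the add/andn/popc trick */
    {
        return (uint64_t)__builtin_popcountll((d - 1) & ~d);
    }

    uint64_t binvert64(uint64_t d)     /* d odd; returns d^-1 mod 2^64 */
    {
        uint64_t x = table8[(d >> 1) & 127];   /*  8 good bits */
        x = 2*x - x*x*d;                       /* 16 good bits */
        x = 2*x - x*x*d;                       /* 32 good bits */
        x = 2*x - x*x*d;                       /* 64 good bits */
        return x;
    }

All arithmetic wraps mod 2^64, which is exactly what the iteration needs.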
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm
new file mode 100644
index 0000000..49ccaec
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/cnd_aors_n.asm
@@ -0,0 +1,145 @@
+dnl  SPARC v9 mpn_cnd_add_n and mpn_cnd_sub_n for T3/T4/T5.
+
+dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund.
+
+dnl  Copyright 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	 8.5
+C UltraSPARC T4:	 3
+
+C We use a double-pointer trick to allow indexed addressing.  Its setup
+C cost might be a problem in these functions, since we don't expect huge n
+C arguments.
+C
+C For sub we need ~(a & mask) = (~a | ~mask) but by complementing mask we can
+C instead do ~(a & ~mask) = (~a | mask), allowing us to use the orn insn.
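
Concretely, with the mask stored complemented (all-ones when the condition is false; see MAKEMASK for cnd_sub_n below), the operand feeding the carry-chain subtraction comes out in one instruction. A one-line C rendering of the identity:

    #include <stdint.h>

    /* cmask = ~mask.  The complemented subtrahend ~(v & mask) collapses to
       a single orn:  ~v | cmask. */
    static inline uint64_t cnd_sub_operand(uint64_t v, uint64_t cmask)
    {
        return ~v | cmask;               /* == ~(v & ~cmask) */
    }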
+
+C INPUT PARAMETERS
+define(`cnd', `%i0')
+define(`rp',  `%i1')
+define(`up',  `%i2')
+define(`vp',  `%i3')
+define(`n',   `%i4')
+
+define(`mask',   `cnd')
+define(`up0', `%l0')  define(`up1', `%l1')
+define(`vp0', `%l2')  define(`vp1', `%l3')
+define(`rp0', `%g4')  define(`rp1', `%g5')
+define(`u0',  `%l4')  define(`u1',  `%l5')
+define(`v0',  `%l6')  define(`v1',  `%l7')
+define(`x0',  `%g1')  define(`x1',  `%g3')
+define(`w0',  `%g1')  define(`w1',  `%g3')
+
+ifdef(`OPERATION_cnd_add_n',`
+  define(`LOGOP',   `and	$1, $2, $3')
+  define(`MAKEMASK',`cmp	%g0, $1
+		     addxc(	%g0, %g0, $2)
+		     neg	$2, $2')
+  define(`INITCY',  `addcc	%g0, 0, %g0')
+  define(`RETVAL',  `addxc(	%g0, %g0, %i0)')
+  define(`func',    `mpn_cnd_add_n')
+')
+ifdef(`OPERATION_cnd_sub_n',`
+  define(`LOGOP',   `orn	$2, $1, $3')
+  define(`MAKEMASK',`cmp	$1, 1
+		     addxc(	%g0, %g0, $2)
+		     neg	$2, $2')
+  define(`INITCY',  `subcc	%g0, 1, %g0')
+  define(`RETVAL',  `addxc(	%g0, %g0, %i0)
+		     xor	%i0, 1, %i0')
+  define(`func',    `mpn_cnd_sub_n')
+')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)
+	save	%sp, -176, %sp
+
+	MAKEMASK(cnd,mask)
+
+	andcc	n, 1, %g0
+	sllx	n, 3, n
+	add	n, -16, n
+	add	vp, n, vp0
+	add	up, n, up0
+	add	rp, n, rp0
+	neg	n, n
+	be	L(evn)
+	 INITCY
+
+L(odd):	ldx	[vp0 + n], v1
+	ldx	[up0 + n], u1
+	LOGOP(	v1, mask, x1)
+	addxccc(u1, x1, w1)
+	stx	w1, [rp0 + n]
+	add	n, 8, n
+	brgz	n, L(rtn)
+	 nop
+
+L(evn):	add	vp0, 8, vp1
+	add	up0, 8, up1
+	add	rp0, -24, rp1
+	ldx	[vp0 + n], v0
+	ldx	[vp1 + n], v1
+	ldx	[up0 + n], u0
+	ldx	[up1 + n], u1
+	add	n, 16, n
+	brgz	n, L(end)
+	 add	rp0, -16, rp0
+
+L(top):	LOGOP(	v0, mask, x0)
+	ldx	[vp0 + n], v0
+	LOGOP(	v1, mask, x1)
+	ldx	[vp1 + n], v1
+	addxccc(u0, x0, w0)
+	ldx	[up0 + n], u0
+	addxccc(u1, x1, w1)
+	ldx	[up1 + n], u1
+	stx	w0, [rp0 + n]
+	add	n, 16, n
+	brlez	n, L(top)
+	 stx	w1, [rp1 + n]
+
+L(end):	LOGOP(	v0, mask, x0)
+	LOGOP(	v1, mask, x1)
+	addxccc(u0, x0, w0)
+	addxccc(u1, x1, w1)
+	stx	w0, [rp0 + n]
+	stx	w1, [rp1 + 32]
+
+L(rtn):	RETVAL
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/dive_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/dive_1.asm
new file mode 100644
index 0000000..d7dbdf9
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/dive_1.asm
@@ -0,0 +1,129 @@
+dnl  SPARC T3/T4/T5 mpn_divexact_1.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC T3:	31
+C UltraSPARC T4/T5:	20-26  hits 20 early, then sharply drops
+
+C INPUT PARAMETERS
+define(`qp',  `%i0')
+define(`ap',  `%i1')
+define(`n',   `%i2')
+define(`d',   `%i3')
+
+define(`dinv',`%o4')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_divexact_1)
+	save	%sp, -176, %sp
+	cmp	n, 1
+	bne,pt	%xcc, L(gt1)
+	 ldx	[ap], %o5
+	udivx	%o5, d, %g1
+	stx	%g1, [qp]
+	return	%i7+8
+	 nop
+
+L(gt1):	add	d, -1, %g1
+	andn	%g1, d, %g1
+	popc	%g1, %i4		C i4 = count_trailing_zeros(d)
+
+	srlx	d, %i4, d
+	srlx	d, 1, %g1
+	and	%g1, 127, %g1
+
+	LEA64(binvert_limb_table, g2, g4)
+	ldub	[%g2+%g1], %g1
+	add	%g1, %g1, %g2
+	mulx	%g1, %g1, %g1
+	mulx	%g1, d, %g1
+	sub	%g2, %g1, %g2
+	add	%g2, %g2, %g1
+	mulx	%g2, %g2, %g2
+	mulx	%g2, d, %g2
+	sub	%g1, %g2, %g1
+	add	%g1, %g1, %o7
+	mulx	%g1, %g1, %g1
+	mulx	%g1, d, %g1
+	add	n, -2, n
+	brz,pt	%i4, L(norm)
+	 sub	%o7, %g1, dinv
+
+L(unnorm):
+	mov	0, %g4
+	sub	%g0, %i4, %o2
+	srlx	%o5, %i4, %o5
+L(top_unnorm):
+	ldx	[ap+8], %g3
+	add	ap, 8, ap
+	sllx	%g3, %o2, %g5
+	or	%g5, %o5, %g5
+	srlx	%g3, %i4, %o5
+	subcc	%g5, %g4, %g4
+	mulx	%g4, dinv, %g1
+	stx	%g1, [qp]
+	add	qp, 8, qp
+	umulxhi(d, %g1, %g1)
+	addxc(	%g1, %g0, %g4)
+	brgz,pt	n, L(top_unnorm)
+	 add	n, -1, n
+
+	sub	%o5, %g4, %g4
+	mulx	%g4, dinv, %g1
+	stx	%g1, [qp]
+	return	%i7+8
+	 nop
+
+L(norm):
+	mulx	dinv, %o5, %g1
+	stx	%g1, [qp]
+	add	qp, 8, qp
+	addcc	%g0, 0, %g4
+L(top_norm):
+	umulxhi(d, %g1, %g1)
+	ldx	[ap+8], %g5
+	add	ap, 8, ap
+	addxc(	%g1, %g0, %g1)
+	subcc	%g5, %g1, %g1
+	mulx	%g1, dinv, %g1
+	stx	%g1, [qp]
+	add	qp, 8, qp
+	brgz,pt	n, L(top_norm)
+	 add	n, -1, n
+
+	return	%i7+8
+	 nop
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/hamdist.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/hamdist.asm
new file mode 100644
index 0000000..20ed8bf
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/hamdist.asm
@@ -0,0 +1,78 @@
+dnl  SPARC v9 mpn_hamdist for T3/T4.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	18
+C UltraSPARC T4:	 3.5
+
+C INPUT PARAMETERS
+define(`up',   `%o0')
+define(`vp',   `%o1')
+define(`n',    `%o2')
+define(`pcnt', `%o5')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_hamdist)
+	subcc	n, 1, n
+	be	L(final_one)
+	 clr	pcnt
+L(top):
+	ldx	[up + 0], %g1
+	ldx	[vp + 0], %g2
+	ldx	[up + 8], %o4
+	ldx	[vp + 8], %g3
+	sub	n, 2, n
+	xor	%g1, %g2, %g1
+	add	up, 16, up
+	popc	%g1, %g2
+	add	vp, 16, vp
+	xor	%o4, %g3, %o4
+	add	pcnt, %g2, pcnt
+	popc	%o4, %g3
+	brgz	n, L(top)
+	 add	pcnt, %g3, pcnt
+	brlz,pt	n, L(done)
+	 nop
+L(final_one):
+	ldx	[up + 0], %g1
+	ldx	[vp + 0], %g2
+	xor	%g1,%g2, %g1
+	popc	%g1, %g2
+	add	pcnt, %g2, pcnt
+L(done):
+	retl
+	 mov	pcnt, %o0
+EPILOGUE()
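
Semantically this is just popcount of the xor, accumulated over all limbs; the asm unrolls two limbs per iteration so the popc latencies overlap. A reference rendering (the GCC/Clang builtin stands in for popc):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t hamdist_ref(const uint64_t *up, const uint64_t *vp, size_t n)
    {
        uint64_t cnt = 0;
        for (size_t i = 0; i < n; i++)
            cnt += (uint64_t)__builtin_popcountll(up[i] ^ vp[i]);
        return cnt;
    }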
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/invert_limb.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/invert_limb.asm
new file mode 100644
index 0000000..4da49cf
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/invert_limb.asm
@@ -0,0 +1,92 @@
+dnl  SPARC T3/T4/T5 mpn_invert_limb.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC T3:	 ?
+C UltraSPARC T4/T5:	 ?
+
+C INPUT PARAMETERS
+define(`d',  `%o0')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_invert_limb)
+	srlx	d, 54, %g1
+	LEA64(approx_tab, g2, g3)
+	and	%g1, 0x1fe, %g1
+	srlx	d, 24, %g4
+	lduh	[%g2+%g1], %g3
+	add	%g4, 1, %g4
+	sllx	%g3, 11, %g2
+	add	%g2, -1, %g2
+	mulx	%g3, %g3, %g3
+	mulx	%g3, %g4, %g3
+	srlx	%g3, 40, %g3
+	sub	%g2, %g3, %g2
+	sllx	%g2, 60, %g1
+	mulx	%g2, %g2, %g3
+	mulx	%g3, %g4, %g4
+	sub	%g1, %g4, %g1
+	srlx	%g1, 47, %g1
+	sllx	%g2, 13, %g2
+	add	%g1, %g2, %g1
+	and	d, 1, %g2
+	srlx	%g1, 1, %g4
+	sub	%g0, %g2, %g3
+	and	%g4, %g3, %g3
+	srlx	d, 1, %g4
+	add	%g4, %g2, %g2
+	mulx	%g1, %g2, %g2
+	sub	%g3, %g2, %g2
+	umulxhi(%g1, %g2, %g2)
+	srlx	%g2, 1, %g2
+	sllx	%g1, 31, %g1
+	add	%g2, %g1, %g1
+	mulx	%g1, d, %g3
+	umulxhi(d, %g1, %g4)
+	addcc	%g3, d, %g0
+	addxc(	%g4, d, %o0)
+	jmp	%o7+8
+	 sub	%g1, %o0, %o0
+EPILOGUE()
+
+	RODATA
+	ALIGN(2)
+	TYPE(	approx_tab, object)
+	SIZE(	approx_tab, 512)
+approx_tab:
+forloop(i,256,512-1,dnl
+`	.half	eval(0x7fd00/i)
+')dnl
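
The forloop emits 256 halfwords: approx_tab[i-256] = 0x7fd00/i for i in [256, 512). The constant is 2^19 - 3*2^8, the reciprocal seed from Moller and Granlund's division paper; the lduh in the prologue indexes the table by bits 55..62 of the normalized divisor. An equivalent C generator, for reference:

    #include <stdint.h>

    static uint16_t approx_tab[256];

    static void build_approx_tab(void)   /* mirrors the m4 forloop above */
    {
        for (unsigned i = 256; i < 512; i++)
            approx_tab[i - 256] = (uint16_t)(0x7fd00 / i);
    }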
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/missing.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/missing.asm
new file mode 100644
index 0000000..c79032d
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/missing.asm
@@ -0,0 +1,77 @@
+dnl  SPARC v9-2011 simulation support.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(__gmpn_umulh)
+	save	%sp, -176, %sp
+	ldx	[%sp+2047+176+256], %o0
+	ldx	[%sp+2047+176+256+8], %o1
+	rd	%ccr, %o4
+	srl	%o0, 0, %l4
+	srl	%o1, 0, %l1
+	srlx	%o1, 32, %o1
+	mulx	%o1, %l4, %l2
+	srlx	%o0, 32, %o0
+	mulx	%o0, %l1, %l3
+	mulx	%l1, %l4, %l1
+	srlx	%l1, 32, %l1
+	add	%l2, %l1, %l2
+	addcc	%l2, %l3, %l2
+	mulx	%o1, %o0, %o1
+	mov	0, %l1
+	movcs	%xcc, 1, %l1
+	sllx	%l1, 32, %l1
+	add	%o1, %l1, %o1
+	srlx	%l2, 32, %o0
+	add	%o1, %o0, %o0
+	stx	%o0, [%sp+2047+176+256]
+	wr	%o4, 0, %ccr
+	ret
+	 restore
+EPILOGUE()
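
__gmpn_umulh reconstructs the high word of a 64x64-bit product from four 32x32 products, where the low*low product contributes only its top word and one carry is materialized explicitly (the movcs). The same computation in C, as a sketch:

    #include <stdint.h>

    uint64_t umulh_ref(uint64_t a, uint64_t b)
    {
        uint64_t al = (uint32_t)a, ah = a >> 32;
        uint64_t bl = (uint32_t)b, bh = b >> 32;

        uint64_t mid  = bh * al + (al * bl >> 32);  /* cannot overflow */
        uint64_t mid2 = mid + ah * bl;              /* may wrap: catch it */
        uint64_t carry = (uint64_t)(mid2 < mid) << 32;
        return ah * bh + carry + (mid2 >> 32);
    }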
+
+PROLOGUE(__gmpn_lzcnt)
+	save	%sp, -176, %sp
+	ldx	[%sp+2047+176+256], %o0
+	brz,a	%o0, 2f
+	 mov	64, %o1
+	brlz	%o0, 2f
+	 mov	0, %o1
+1:	sllx	%o0, 1, %o0
+	brgz	%o0, 1b
+	 add	%o1, 1, %o1
+	stx	%o1, [%sp+2047+176+256]
+2:	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/missing.m4 b/third_party/gmp/mpn/sparc64/ultrasparct3/missing.m4
new file mode 100644
index 0000000..e5d6d8e
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/missing.m4
@@ -0,0 +1,88 @@
+dnl  SPARC v9-2011 simulation support.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl Usage addxccc(r1,r2,r3, t1)
+dnl  64-bit add with carry-in and carry-out
+dnl  FIXME: Register g2 must not be destination
+
+define(`addxccc',`dnl
+	add	%sp, -512, %sp
+	stx	%g2, [%sp+2047+256+16]
+	mov	0, %g2
+	movcs	%xcc, -1, %g2
+	addcc	%g2, 1, %g0
+	addccc	$1, $2, $3
+	ldx	[%sp+2047+256+16], %g2
+	sub	%sp, -512, %sp
+')
+
+
+dnl Usage addxc(r1,r2,r3, t1,t2)
+dnl  64-bit add with carry-in
+
+define(`addxc',`dnl
+	bcc	%xcc, 1f
+	 add	$1, $2, $3
+	add	$3, 1, $3
+1:
+')
+
+
+dnl Usage umulxhi(r1,r2,r3)
+dnl  64-bit multiply returning upper 64 bits
+dnl  Calls __gmpn_umulh using a non-standard calling convention
+
+define(`umulxhi',`dnl
+	add	%sp, -512, %sp
+	stx	$1, [%sp+2047+256]
+	stx	$2, [%sp+2047+256+8]
+	stx	%o7, [%sp+2047+256+16]
+	call	__gmpn_umulh
+	 nop
+	ldx	[%sp+2047+256+16], %o7
+	ldx	[%sp+2047+256], $3
+	sub	%sp, -512, %sp
+')
+
+dnl Usage lzcnt(r1,r2)
+dnl  Plain count leading zeros
+dnl  Calls __gmpn_lzcnt using a non-standard calling convention
+
+define(`lzcnt',`dnl
+	add	%sp, -512, %sp
+	stx	%o7, [%sp+2047+256+16]
+	call	__gmpn_lzcnt
+	 stx	$1, [%sp+2047+256]
+	ldx	[%sp+2047+256+16], %o7
+	ldx	[%sp+2047+256], $2
+	sub	%sp, -512, %sp
+')
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm
new file mode 100644
index 0000000..08facbd
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/mod_1_4.asm
@@ -0,0 +1,233 @@
+dnl  SPARC T3/T4/T5 mpn_mod_1s_4p.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC T3:	30
+C UltraSPARC T4/T5:	 4
+
+C INPUT PARAMETERS
+define(`ap',  `%o0')
+define(`n',   `%o1')
+define(`d',   `%o2')
+define(`cps', `%o3')
+
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_mod_1s_4p)
+	save	%sp, -176, %sp
+	ldx	[%i3+16], %o4
+	ldx	[%i3+24], %o3
+	ldx	[%i3+32], %o2
+	ldx	[%i3+40], %o1
+	ldx	[%i3+48], %o0
+
+	and	%i1, 3, %g3
+	sllx	%i1, 3, %g1
+	add	%i0, %g1, %i0
+	brz	%g3, L(b00)
+	 cmp	%g3, 2
+	bcs	%xcc, L(b01)
+	 nop
+	be	%xcc, L(b10)
+	 nop
+
+L(b11):	ldx	[%i0-16], %g2
+	mulx	%g2, %o4, %g5
+	umulxhi(%g2, %o4, %g3)
+	ldx	[%i0-24], %g4
+	addcc	%g5, %g4, %g5
+	addxc(	%g3, %g0, %g4)
+	ldx	[%i0-8], %g2
+	mulx	%g2, %o3, %g1
+	umulxhi(%g2, %o3, %g3)
+	addcc	%g1, %g5, %g1
+	addxc(	%g3, %g4, %g2)
+	ba,pt	%xcc, .L8
+	 add	%i0, -32, %i0
+
+L(b00):	ldx	[%i0-24], %g3
+	mulx	%g3, %o4, %g2
+	umulxhi(%g3, %o4, %g5)
+	ldx	[%i0-32], %g4
+	addcc	%g2, %g4, %g2
+	addxc(	%g5, %g0, %g3)
+	ldx	[%i0-16], %g4
+	mulx	%g4, %o3, %g5
+	umulxhi(%g4, %o3, %i5)
+	addcc	%g2, %g5, %g5
+	addxc(	%g3, %i5, %g4)
+	ldx	[%i0-8], %g2
+	mulx	%g2, %o2, %g1
+	umulxhi(%g2, %o2, %g3)
+	addcc	%g1, %g5, %g1
+	addxc(	%g3, %g4, %g2)
+	ba,pt	%xcc, .L8
+	 add	%i0, -40, %i0
+
+L(b01):	ldx	[%i0-8], %g1
+	mov	0, %g2
+	ba,pt	%xcc, .L8
+	 add	%i0, -16, %i0
+
+L(b10):	ldx	[%i0-8], %g2
+	ldx	[%i0-16], %g1
+	add	%i0, -24, %i0
+
+.L8:	add	%i1, -5, %g3
+	brlz,pn	%g3, L(end)
+	 nop
+
+L(top):	ldx	[%i0-16], %i4
+	mulx	%i4, %o4, %o5
+	umulxhi(%i4, %o4, %i1)
+	ldx	[%i0-24], %i5
+	addcc	%o5, %i5, %o5
+	addxc(	%i1, %g0, %i4)
+	ldx	[%i0-8], %i5
+	mulx	%i5, %o3, %o7
+	umulxhi(%i5, %o3, %i1)
+	addcc	%o5, %o7, %o7
+	addxc(	%i4, %i1, %i5)
+	ldx	[%i0+0], %g4
+	mulx	%g4, %o2, %i1
+	umulxhi(%g4, %o2, %i4)
+	addcc	%o7, %i1, %i1
+	addxc(	%i5, %i4, %g4)
+	mulx	%g1, %o1, %i5
+	umulxhi(%g1, %o1, %i4)
+	addcc	%i1, %i5, %i5
+	addxc(	%g4, %i4, %g5)
+	mulx	%g2, %o0, %g1
+	umulxhi(%g2, %o0, %g4)
+	addcc	%g1, %i5, %g1
+	addxc(	%g4, %g5, %g2)
+	add	%g3, -4, %g3
+	brgez,pt %g3, L(top)
+	 add	%i0, -32, %i0
+
+L(end):	mulx	%g2, %o4, %g5
+	umulxhi(%g2, %o4, %g3)
+	addcc	%g1, %g5, %g5
+	addxc(	%g3, %g0, %g2)
+	ldx	[%i3+8], %i0
+	ldx	[%i3], %g4
+	sub	%g0, %i0, %i5
+	srlx	%g5, %i5, %i5
+	sllx	%g2, %i0, %g2
+	or	%i5, %g2, %g1
+	mulx	%g1, %g4, %l7
+	umulxhi(%g1, %g4, %g3)
+	sllx	%g5, %i0, %g2
+	add	%g1, 1, %g1
+	addcc	%l7, %g2, %g5
+	addxc(	%g3, %g1, %g1)
+	mulx	%g1, %i2, %g1
+	sub	%g2, %g1, %g2
+	cmp	%g2, %g5
+	add	%i2, %g2, %g1
+	movlu	%xcc, %g2, %g1
+	subcc	%g1, %i2, %g2
+	movgeu	%xcc, %g2, %g1
+	return	%i7+8
+	 srlx	%g1, %o0, %o0
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps)
+	save	%sp, -176, %sp
+	lzcnt(	%i1, %i5)
+	sllx	%i1, %i5, %i1
+	call	mpn_invert_limb, 0
+	 mov	%i1, %o0
+	stx	%o0, [%i0]
+	sra	%i5, 0, %g1
+	stx	%g1, [%i0+8]
+	sub	%g0, %i5, %g2
+	srlx	%o0, %g2, %g2
+	mov	1, %g1
+	sllx	%g1, %i5, %g1
+	or	%g2, %g1, %g2
+	sub	%g0, %i1, %g1
+	mulx	%g2, %g1, %g2
+	srlx	%g2, %i5, %g1
+	stx	%g1, [%i0+16]
+
+	umulxhi(%o0, %g2, %g3)
+	add	%g2, %g3, %g3
+	xnor	%g0, %g3, %g3
+	mulx	%g3, %i1, %g3
+	mulx	%g2, %o0, %g2
+	cmp	%g2, %g3
+	add	%i1, %g3, %g1
+	movgeu	%xcc, %g3, %g1
+	srlx	%g1, %i5, %g2
+	stx	%g2, [%i0+24]
+
+	umulxhi(%o0, %g1, %g3)
+	add	%g1, %g3, %g3
+	xnor	%g0, %g3, %g3
+	mulx	%g3, %i1, %g3
+	mulx	%g1, %o0, %g1
+	cmp	%g1, %g3
+	add	%i1, %g3, %g2
+	movgeu	%xcc, %g3, %g2
+	srlx	%g2, %i5, %g1
+	stx	%g1, [%i0+32]
+
+	umulxhi(%o0, %g2, %g3)
+	add	%g2, %g3, %g3
+	xnor	%g0, %g3, %g3
+	mulx	%g3, %i1, %g3
+	mulx	%g2, %o0, %g2
+	cmp	%g2, %g3
+	add	%i1, %g3, %g1
+	movgeu	%xcc, %g3, %g1
+	srlx	%g1, %i5, %g2
+	stx	%g2, [%i0+40]
+
+	umulxhi(%o0, %g1, %g2)
+	add	%g1, %g2, %g2
+	xnor	%g0, %g2, %g2
+	mulx	%g2, %i1, %g2
+	mulx	%g1, %o0, %o0
+	cmp	%o0, %g2
+	add	%i1, %g2, %g3
+	movgeu	%xcc, %g2, %g3
+	srlx	%g3, %i5, %i5
+	stx	%i5, [%i0+48]
+
+	return	%i7+8
+	 nop
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm
new file mode 100644
index 0000000..8744280
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/mod_34lsub1.asm
@@ -0,0 +1,117 @@
+dnl  SPARC v9 mpn_mod_34lsub1 for T3/T4/T5.
+
+dnl  Copyright 2005, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C UltraSPARC T1:	 -
+C UltraSPARC T3:	 5
+C UltraSPARC T4:	 1.57
+
+C This is based on the powerpc64/mode64 code.
+
+C INPUT PARAMETERS
+define(`up', `%i0')
+define(`n',  `%i1')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_mod_34lsub1)
+	save	%sp, -176, %sp
+
+	mov	0, %g1
+	mov	0, %g3
+	mov	0, %g4
+	addcc	%g0, 0, %g5
+
+	add	n, -3, n
+	brlz	n, L(lt3)
+	 nop
+
+	add	n, -3, n
+	ldx	[up+0], %l5
+	ldx	[up+8], %l6
+	ldx	[up+16], %l7
+	brlz	n, L(end)
+	 add	up, 24, up
+
+	ALIGN(16)
+L(top):	addxccc(%g1, %l5, %g1)
+	ldx	[up+0], %l5
+	addxccc(%g3, %l6, %g3)
+	ldx	[up+8], %l6
+	addxccc(%g4, %l7, %g4)
+	ldx	[up+16], %l7
+	add	n, -3, n
+	brgez	n, L(top)
+	 add	up, 24, up
+
+L(end):	addxccc(%g1, %l5, %g1)
+	addxccc(%g3, %l6, %g3)
+	addxccc(%g4, %l7, %g4)
+	addxc(	%g5, %g0, %g5)
+
+L(lt3):	cmp	n, -2
+	blt	L(2)
+	 nop
+
+	ldx	[up+0], %l5
+	mov	0, %l6
+	beq	L(1)
+	 addcc	%g1, %l5, %g1
+
+	ldx	[up+8], %l6
+L(1):	addxccc(%g3, %l6, %g3)
+	addxccc(%g4, %g0, %g4)
+	addxc(	%g5, %g0, %g5)
+
+L(2):	sllx	%g1, 16, %l0
+	srlx	%l0, 16, %l0		C %l0 = %g1 mod 2^48
+	srlx	%g1, 48, %l3		C %l3 = %g1 div 2^48
+	srl	%g3, 0, %g1
+	sllx	%g1, 16, %l4		C %l4 = (%g3 mod 2^32) << 16
+	srlx	%g3, 32, %l5		C %l5 = %g3 div 2^32
+	sethi	%hi(0xffff0000), %g1
+	andn	%g4, %g1, %g1
+	sllx	%g1, 32, %l6		C %l6 = (%g4 mod 2^16) << 32
+	srlx	%g4, 16, %l7		C %l7 = %g4 div 2^16
+
+	add	%l0, %l3, %l0
+	add	%l4, %l5, %l4
+	add	%l6, %l7, %l6
+
+	add	%l0, %l4, %l0
+	add	%l6, %g5, %l6
+
+	add	%l0, %l6, %i0
+	ret
+	 restore
+EPILOGUE()
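
The arithmetic rests on 2^64 = 2^16 (mod 2^48 - 1): limb i carries weight 2^(16*(i mod 3)), which is why the loop keeps three accumulators and the tail splits each of them at the 48-bit boundary. A compact C model of the same congruence (result only partially reduced, like the real function; unsigned __int128 is a GCC/Clang extension, adequate for moderate n):

    #include <stdint.h>
    #include <stddef.h>

    /* Returns a value congruent to {up,n} mod 2^48 - 1. */
    uint64_t mod_34lsub1_ref(const uint64_t *up, size_t n)
    {
        unsigned __int128 s = 0;
        for (size_t i = 0; i < n; i++)
            s += (unsigned __int128)up[i] << (16 * (i % 3));
        while (s >> 48)                   /* fold at the 48-bit boundary */
            s = (s & 0xffffffffffffULL) + (s >> 48);
        return (uint64_t)s;
    }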
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/mode1o.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/mode1o.asm
new file mode 100644
index 0000000..494e1d3
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/mode1o.asm
@@ -0,0 +1,82 @@
+dnl  SPARC T3/T4/T5 mpn_modexact_1c_odd.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C UltraSPARC T3:	30
+C UltraSPARC T4/T5:	26
+
+C INPUT PARAMETERS
+define(`ap',  `%o0')
+define(`n',   `%o1')
+define(`d',   `%o2')
+define(`cy',  `%o3')
+
+define(`dinv',`%o5')
+define(`a0',  `%g1')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_modexact_1c_odd)
+	srlx	d, 1, %g1
+	and	%g1, 127, %g1
+
+	LEA64(binvert_limb_table, g2, g4)
+	ldub	[%g2+%g1], %g1
+	add	%g1, %g1, %g2
+	mulx	%g1, %g1, %g1
+	mulx	%g1, d, %g1
+	sub	%g2, %g1, %g2
+	add	%g2, %g2, %g1
+	mulx	%g2, %g2, %g2
+	mulx	%g2, d, %g2
+	sub	%g1, %g2, %g1
+	add	%g1, %g1, %o5
+	mulx	%g1, %g1, %g1
+	mulx	%g1, d, %g1
+	sub	%o5, %g1, dinv
+	add	n, -1, n
+
+L(top):	ldx	[ap], a0
+	add	ap, 8, ap
+	subcc	a0, cy, %g3
+	mulx	%g3, dinv, %g5
+	umulxhi(d, %g5, %g5)
+	addxc(	%g5, %g0, cy)
+	brnz,pt	n, L(top)
+	 add	n, -1, n
+
+	retl
+	 mov	cy, %o0
+EPILOGUE()
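
The loop above is Hensel-style exact division in remainder-only form: with dinv = d^-1 mod 2^64 (the same Newton setup sketched after bdiv_q_1.asm), q = (ap[i] - c) * dinv makes q*d match ap[i] - c in the low limb, so only the high limb of q*d, plus the borrow from the subtraction, survives as the next c. In C (the asm computes dinv itself; this sketch takes it as a parameter):

    #include <stdint.h>
    #include <stddef.h>

    uint64_t modexact_1c_odd_ref(const uint64_t *ap, size_t n,
                                 uint64_t d, uint64_t dinv, uint64_t c)
    {
        for (size_t i = 0; i < n; i++) {
            uint64_t brw = ap[i] < c;            /* subcc's borrow */
            uint64_t q = (ap[i] - c) * dinv;     /* q*d == ap[i]-c (mod 2^64) */
            c = (uint64_t)(((unsigned __int128)q * d) >> 64) + brw;
        }
        return c;                                /* the value returned in %o0 */
    }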
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/mul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/mul_1.asm
new file mode 100644
index 0000000..af05d62
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/mul_1.asm
@@ -0,0 +1,174 @@
+dnl  SPARC v9 mpn_mul_1 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	23
+C UltraSPARC T4:	 3
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`v0', `%i3')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_mul_1)
+	save	%sp, -176, %sp
+
+	and	n, 3, %g5
+	add	n, -4, n
+	brz	%g5, L(b0)
+	 cmp	%g5, 2
+	bcs	%xcc, L(b1)
+	 nop
+	be	%xcc, L(b2)
+	 nop
+
+L(b3):	addcc	%g0, %g0, %i5
+	ldx	[up+0], %l0
+	ldx	[up+8], %l1
+	ldx	[up+16], %l2
+	mulx	%l0, v0, %o0
+	umulxhi(%l0, v0, %o1)
+	brgz	n, L(gt3)
+	 add	rp, -8, rp
+	mulx	%l1, v0, %o2
+	umulxhi(%l1, v0, %o3)
+	b	L(wd3)
+	 nop
+L(gt3):	ldx	[up+24], %l3
+	mulx	%l1, v0, %o2
+	umulxhi(%l1, v0, %o3)
+	add	up, 24, up
+	b	L(lo3)
+	 add	n, -3, n
+
+L(b2):	addcc	%g0, %g0, %o1
+	ldx	[up+0], %l1
+	ldx	[up+8], %l2
+	brgz	n, L(gt2)
+	 add	rp, -16, rp
+	mulx	%l1, v0, %o2
+	umulxhi(%l1, v0, %o3)
+	mulx	%l2, v0, %o4
+	umulxhi(%l2, v0, %o5)
+	b	L(wd2)
+	 nop
+L(gt2):	ldx	[up+16], %l3
+	mulx	%l1, v0, %o2
+	umulxhi(%l1, v0, %o3)
+	ldx	[up+24], %l0
+	mulx	%l2, v0, %o4
+	umulxhi(%l2, v0, %o5)
+	add	up, 16, up
+	b	L(lo2)
+	 add	n, -2, n
+
+L(b1):	addcc	%g0, %g0, %o3
+	ldx	[up+0], %l2
+	brgz	n, L(gt1)
+	 nop
+	mulx	%l2, v0, %o4
+	stx	%o4, [rp+0]
+	umulxhi(%l2, v0, %i0)
+	ret
+	 restore
+L(gt1):	ldx	[up+8], %l3
+	ldx	[up+16], %l0
+	mulx	%l2, v0, %o4
+	umulxhi(%l2, v0, %o5)
+	ldx	[up+24], %l1
+	mulx	%l3, v0, %i4
+	umulxhi(%l3, v0, %i5)
+	add	rp, -24, rp
+	add	up, 8, up
+	b	L(lo1)
+	 add	n, -1, n
+
+L(b0):	addcc	%g0, %g0, %o5
+	ldx	[up+0], %l3
+	ldx	[up+8], %l0
+	ldx	[up+16], %l1
+	mulx	%l3, v0, %i4
+	umulxhi(%l3, v0, %i5)
+	ldx	[up+24], %l2
+	mulx	%l0, v0, %o0
+	umulxhi(%l0, v0, %o1)
+	b	L(lo0)
+	 nop
+
+	ALIGN(16)
+L(top):	ldx	[up+0], %l3	C 0
+	addxccc(%i4, %o5, %i4)	C 0
+	mulx	%l1, v0, %o2	C 1
+	stx	%i4, [rp+0]	C 1
+	umulxhi(%l1, v0, %o3)	C 2
+L(lo3):	ldx	[up+8], %l0	C 2
+	addxccc(%o0, %i5, %o0)	C 3
+	mulx	%l2, v0, %o4	C 3
+	stx	%o0, [rp+8]	C 4
+	umulxhi(%l2, v0, %o5)	C 4
+L(lo2):	ldx	[up+16], %l1	C 5
+	addxccc(%o2, %o1, %o2)	C 5
+	mulx	%l3, v0, %i4	C 6
+	stx	%o2, [rp+16]	C 6
+	umulxhi(%l3, v0, %i5)	C 7
+L(lo1):	ldx	[up+24], %l2	C 7
+	addxccc(%o4, %o3, %o4)	C 8
+	mulx	%l0, v0, %o0	C 8
+	stx	%o4, [rp+24]	C 9
+	umulxhi(%l0, v0, %o1)	C 9
+	add	rp, 32, rp	C 10
+L(lo0):	add	up, 32, up	C 10
+	brgz	n, L(top)	C 11
+	 add	n, -4, n	C 11
+
+L(end):	addxccc(%i4, %o5, %i4)
+	mulx	%l1, v0, %o2
+	stx	%i4, [rp+0]
+	umulxhi(%l1, v0, %o3)
+	addxccc(%o0, %i5, %o0)
+L(wd3):	mulx	%l2, v0, %o4
+	stx	%o0, [rp+8]
+	umulxhi(%l2, v0, %o5)
+	addxccc(%o2, %o1, %o2)
+L(wd2):	stx	%o2, [rp+16]
+	addxccc(%o4, %o3, %o4)
+	stx	%o4, [rp+24]
+	addxc(	%g0, %o5, %i0)
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/popcount.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/popcount.asm
new file mode 100644
index 0000000..de80f3c
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/popcount.asm
@@ -0,0 +1,70 @@
+dnl  SPARC v9 mpn_popcount for T3/T4.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	15
+C UltraSPARC T4:	 2.5
+
+C INPUT PARAMETERS
+define(`up',   `%o0')
+define(`n',    `%o1')
+define(`pcnt', `%o5')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_popcount)
+	subcc	n, 1, n
+	be	L(final_one)
+	 clr	pcnt
+L(top):
+	ldx	[up + 0], %g1
+	sub	n, 2, n
+	ldx	[up + 8], %o4
+	add	up, 16, up
+	popc	%g1, %g2
+	popc	%o4, %g3
+	add	pcnt, %g2, pcnt
+	brgz	n, L(top)
+	 add	pcnt, %g3, pcnt
+	brlz,pt	n, L(done)
+	 nop
+L(final_one):
+	ldx	[up + 0], %g1
+	popc	%g1, %g2
+	add	pcnt, %g2, pcnt
+L(done):
+	retl
+	 mov	pcnt, %o0
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..d46499f
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/sqr_diag_addlsh1.asm
@@ -0,0 +1,93 @@
+dnl  SPARC v9 mpn_sqr_diag_addlsh1 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	?
+C UltraSPARC T4:	>= 4.5
+
+
+define(`rp', `%i0')
+define(`tp', `%i1')
+define(`up', `%i2')
+define(`n',  `%i3')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	save	%sp, -176, %sp
+
+	ldx	[up+0], %g1
+	mulx	%g1, %g1, %o0
+	umulxhi(%g1, %g1, %g2)
+	stx	%o0, [rp+0]
+
+	ldx	[up+8], %g1
+	ldx	[tp+0], %g4
+	ldx	[tp+8], %g5
+	mulx	%g1, %g1, %o0
+	orcc	%g0, %g0, %o5
+	b	L(dm)
+	 add	n, -2, n
+
+	ALIGN(16)
+L(top):	ldx	[up+8], %g1
+	addcc	%g4, %o2, %o2
+	addxccc(%g5, %o0, %g3)
+	ldx	[tp+16], %g4
+	ldx	[tp+24], %g5
+	mulx	%g1, %g1, %o0
+	stx	%o2, [rp+8]
+	stx	%g3, [rp+16]
+	add	rp, 16, rp
+	add	tp, 16, tp
+L(dm):	add	%g2, %o5, %o2
+	umulxhi(%g1, %g1, %g2)
+	addxccc(%g4, %g4, %g4)
+	addxccc(%g5, %g5, %g5)
+	add	up, 8, up
+	addxc(	%g0, %g0, %o5)
+	brnz	n, L(top)
+	 add	n, -1, n
+
+	addcc	%o2, %g4, %g4
+	addxccc(%o0, %g5, %g5)
+	stx	%g4, [rp+8]
+	stx	%g5, [rp+16]
+	addxc(	%o5, %g2, %g2)
+	stx	%g2, [rp+24]
+
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/sub_n.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/sub_n.asm
new file mode 100644
index 0000000..0e4bc93
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/sub_n.asm
@@ -0,0 +1,144 @@
+dnl  SPARC v9 mpn_sub_n for T3/T4.
+
+dnl  Contributed to the GNU project by David Miller.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	 8
+C UltraSPARC T4:	 3
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`vp', `%i2')
+define(`n',  `%i3')
+define(`cy', `%i4')
+
+define(`u0_off', `%l0')
+define(`u1_off', `%l1')
+define(`v0_off', `%l2')
+define(`v1_off', `%l3')
+define(`r0_off', `%l4')
+define(`r1_off', `%l5')
+define(`loop_n', `%l6')
+define(`tmp', `%l7')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_sub_nc)
+	save	%sp, -176, %sp
+	ba,pt	%xcc, L(ent)
+	 xor	cy, 1, cy
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	save	%sp, -176, %sp
+	mov	1, cy
+L(ent):
+	subcc	n, 1, n
+	be	L(final_one)
+	 cmp	%g0, cy
+
+	ldx	[up + 0], %o4
+	sllx	n, 3, tmp
+
+	ldx	[vp + 0], %o5
+	add	up, tmp, u0_off
+
+	ldx	[up + 8], %g5
+	add	vp, tmp, v0_off
+
+	ldx	[vp + 8], %g1
+	add	rp, tmp, r0_off
+
+	neg	tmp, loop_n
+	add	u0_off, 8, u1_off
+
+	add	v0_off, 8, v1_off
+	sub	loop_n, -(2 * 8), loop_n
+
+	sub	r0_off, 16, r0_off
+	brgez,pn loop_n, L(loop_tail)
+	 sub	r0_off, 8, r1_off
+
+	b,a	L(top)
+	ALIGN(16)
+L(top):
+	xnor	%o5, 0, tmp
+	ldx	[loop_n + v0_off], %o5
+
+	addxccc(%o4, tmp, %g3)
+	ldx	[loop_n + u0_off], %o4
+
+	xnor	%g1, 0, %g1
+	stx	%g3, [loop_n + r0_off]
+
+	addxccc(%g5, %g1, tmp)
+	ldx	[loop_n + v1_off], %g1
+
+	ldx	[loop_n + u1_off], %g5
+	sub	loop_n, -(2 * 8), loop_n
+
+	brlz	loop_n, L(top)
+	 stx	tmp, [loop_n + r1_off]
+
+L(loop_tail):
+	xnor	%o5, 0, tmp
+	xnor	%g1, 0, %g1
+
+	addxccc(%o4, tmp, %g3)
+	add	loop_n, u0_off, up
+
+	addxccc(%g5, %g1, %g5)
+	add	loop_n, r0_off, rp
+
+	stx	%g3, [rp + 0]
+	add	loop_n, v0_off, vp
+
+	brgz,pt	loop_n, L(done)
+	 stx	%g5, [rp + 8]
+
+	add	rp, (2 * 8), rp
+
+L(final_one):
+	ldx	[up+0], %o4
+	ldx	[vp+0], %o5
+	xnor	%o5, %g0, %o5
+	addxccc(%o4, %o5, %g3)
+	stx	%g3, [rp+0]
+
+L(done):
+	clr	%i0
+	movcc	%xcc, 1, %i0
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct3/submul_1.asm b/third_party/gmp/mpn/sparc64/ultrasparct3/submul_1.asm
new file mode 100644
index 0000000..5635d1b
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct3/submul_1.asm
@@ -0,0 +1,170 @@
+dnl  SPARC v9 mpn_submul_1 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by David Miller and Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		   cycles/limb
+C UltraSPARC T3:	26
+C UltraSPARC T4:	 4.5
+
+C INPUT PARAMETERS
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n',  `%i2')
+define(`v0', `%i3')
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(mpn_submul_1)
+	save	%sp, -176, %sp
+	ldx	[up+0], %g1
+
+	and	n, 3, %g5
+	add	n, -4, n
+	brz	%g5, L(b00)
+	 cmp	%g5, 2
+	bcs	%xcc, L(b01)
+	 nop
+	bne	%xcc, L(b11)
+	 ldx	[up+8], %g4
+
+L(b10):	add	up, 16, up
+	addcc	%g0, 0, %g3
+	mulx	%g1, v0, %l4
+	umulxhi(%g1, v0, %l5)
+	ldx	[rp+0], %o2
+	mulx	%g4, v0, %l6
+	umulxhi(%g4, v0, %l7)
+	brlz	n, L(wd2)
+	 nop
+L(gt2):	ldx	[up+0], %o0
+	b	L(lo2)
+	 nop
+
+L(b00):	add	rp, -16, rp
+	addcc	%g0, 0, %g3
+	ldx	[up+8], %o1
+	mulx	%g1, v0, %l0
+	umulxhi(%g1, v0, %l1)
+	ldx	[up+16], %o0
+	ldx	[rp+16], %o2
+	mulx	%o1, v0, %l2
+	umulxhi(%o1, v0, %l3)
+	b	L(lo0)
+	 nop
+
+L(b01):	add	up, 8, up
+	add	rp, -8, rp
+	addcc	%g0, 0, %g3
+	ldx	[rp+8], %o3
+	mulx	%g1, v0, %l6
+	umulxhi(%g1, v0, %l7)
+	brlz	n, L(wd1)
+	 nop
+	ldx	[up+0], %o0
+	ldx	[up+8], %o1
+	mulx	%o0, v0, %l0
+	umulxhi(%o0, v0, %l1)
+	b	L(lo1)
+	 nop
+
+L(b11):	add	up, 24, up
+	add	rp, 8, rp
+	addcc	%g0, 0, %g3
+	mulx	%g1, v0, %l2
+	umulxhi(%g1, v0, %l3)
+	ldx	[up-8], %o1
+	ldx	[rp-8], %o3
+	mulx	%g4, v0, %l4
+	umulxhi(%g4, v0, %l5)
+	brlz	n, L(end)
+	 nop
+
+	ALIGN(16)
+L(top):	ldx	[up+0], %o0
+	addxccc(%g3, %l2, %g1)
+	ldx	[rp+0], %o2
+	addxc(	%g0, %l3, %g3)
+	mulx	%o1, v0, %l6
+	subcc	%o3, %g1, %g4
+	umulxhi(%o1, v0, %l7)
+	stx	%g4, [rp-8]
+L(lo2):	ldx	[up+8], %o1
+	addxccc(%g3, %l4, %g1)
+	ldx	[rp+8], %o3
+	addxc(	%g0, %l5, %g3)
+	mulx	%o0, v0, %l0
+	subcc	%o2, %g1, %g4
+	umulxhi(%o0, v0, %l1)
+	stx	%g4, [rp+0]
+L(lo1):	ldx	[up+16], %o0
+	addxccc(%g3, %l6, %g1)
+	ldx	[rp+16], %o2
+	addxc(	%g0, %l7, %g3)
+	mulx	%o1, v0, %l2
+	subcc	%o3, %g1, %g4
+	umulxhi(%o1, v0, %l3)
+	stx	%g4, [rp+8]
+L(lo0):	ldx	[up+24], %o1
+	addxccc(%g3, %l0, %g1)
+	ldx	[rp+24], %o3
+	addxc(	%g0, %l1, %g3)
+	mulx	%o0, v0, %l4
+	subcc	%o2, %g1, %g4
+	umulxhi(%o0, v0, %l5)
+	stx	%g4, [rp+16]
+	add	n, -4, n
+	add	up, 32, up
+	brgez	n, L(top)
+	 add	rp, 32, rp
+
+L(end):	addxccc(%g3, %l2, %g1)
+	ldx	[rp+0], %o2
+	addxc(	%g0, %l3, %g3)
+	mulx	%o1, v0, %l6
+	subcc	%o3, %g1, %g4
+	umulxhi(%o1, v0, %l7)
+	stx	%g4, [rp-8]
+L(wd2):	addxccc(%g3, %l4, %g1)
+	ldx	[rp+8], %o3
+	addxc(	%g0, %l5, %g3)
+	subcc	%o2, %g1, %g4
+	stx	%g4, [rp+0]
+L(wd1):	addxccc(%g3, %l6, %g1)
+	addxc(	%g0, %l7, %g3)
+	subcc	%o3, %g1, %g4
+	stx	%g4, [rp+8]
+	addxc(	%g0, %g3, %i0)
+	ret
+	 restore
+EPILOGUE()
diff --git a/third_party/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h b/third_party/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h
new file mode 100644
index 0000000..2fecdba
--- /dev/null
+++ b/third_party/gmp/mpn/sparc64/ultrasparct45/gmp-mparam.h
@@ -0,0 +1,173 @@
+/* Sparc64 T4-T5 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600 MHz ultrasparct5 running GNU/Linux */
+/* FFT tuning limit = 0.5 M */
+/* Generated by tuneup.c, 2019-10-01, gcc 7.4 */
+
+#define DIVREM_1_NORM_THRESHOLD              3
+#define DIVREM_1_UNNORM_THRESHOLD            3
+#define MOD_1_1P_METHOD                      2  /* 0.34% faster than 1 */
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_1N_PI1_METHOD                 2  /* 27.84% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD            2
+#define DIV_QR_2_PI2_THRESHOLD               5
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           19
+
+#define DIV_1_VS_MUL_1_PERCENT             654
+
+#define MUL_TOOM22_THRESHOLD                40
+#define MUL_TOOM33_THRESHOLD               129
+#define MUL_TOOM44_THRESHOLD               372
+#define MUL_TOOM6H_THRESHOLD               494
+#define MUL_TOOM8H_THRESHOLD               656
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     126
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     247
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     225
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     219
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     188
+
+#define SQR_BASECASE_THRESHOLD              20
+#define SQR_TOOM2_THRESHOLD                 59
+#define SQR_TOOM3_THRESHOLD                107
+#define SQR_TOOM4_THRESHOLD                298
+#define SQR_TOOM6_THRESHOLD                399
+#define SQR_TOOM8_THRESHOLD                562
+
+#define MULMID_TOOM42_THRESHOLD             48
+
+#define MULMOD_BNM1_THRESHOLD               25
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             555  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    555, 5}, {     29, 6}, {     31, 7}, {     31, 8}, \
+    {     17, 7}, {     36, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     43, 8}, {     29, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     35, 9}, {     19, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     27, 8}, \
+    {     57,10}, {     15, 8}, {     61, 9}, {     31, 8}, \
+    {     67, 9}, {     35, 8}, {     71, 9}, {     39, 8}, \
+    {     81, 9}, {     43,10}, {     23, 9}, {     59,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     87,10}, {     47, 9}, {     99,10}, {     55, 9}, \
+    {    115,11}, {     31,10}, {     63, 9}, {    131,10}, \
+    {     87,11}, {     47,10}, {    111, 9}, {    223,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    159,11}, {     95,10}, {    191,11}, {    111,12}, \
+    {     63,11}, {    143,10}, {    287,11}, {    159,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 75
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             372  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    372, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,11}, \
+    {    159,10}, {    319, 9}, {    639,12}, {     95,11}, \
+    {    191,10}, {    383, 9}, {    767,11}, {    207,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 75
+#define SQR_FFT_THRESHOLD                 3776
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  35
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 168
+#define SQRLO_SQR_THRESHOLD               7511
+
+#define DC_DIV_QR_THRESHOLD                 36
+#define DC_DIVAPPR_Q_THRESHOLD             103
+#define DC_BDIV_QR_THRESHOLD                28
+#define DC_BDIV_Q_THRESHOLD                 88
+
+#define INV_MULMOD_BNM1_THRESHOLD           78
+#define INV_NEWTON_THRESHOLD               181
+#define INV_APPR_THRESHOLD                 118
+
+#define BINV_NEWTON_THRESHOLD              296
+#define REDC_1_TO_REDC_2_THRESHOLD           4
+#define REDC_2_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1970
+#define MU_DIVAPPR_Q_THRESHOLD            1970
+#define MUPI_DIV_QR_THRESHOLD               82
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               1970
+
+#define POWM_SEC_TABLE  1,58,102,1509
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        29
+#define SET_STR_DC_THRESHOLD               686
+#define SET_STR_PRECOMPUTE_THRESHOLD      2717
+
+#define FAC_DSC_THRESHOLD                  336
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         32
+#define HGCD2_DIV1_METHOD                    1  /* 0.66% faster than 3 */
+#define HGCD_THRESHOLD                      57
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                288
+#define JACOBI_BASE_METHOD                   4  /* 2.50% faster than 3 */
diff --git a/third_party/gmp/mpn/thumb/add_n.asm b/third_party/gmp/mpn/thumb/add_n.asm
new file mode 100644
index 0000000..08ed60b
--- /dev/null
+++ b/third_party/gmp/mpn/thumb/add_n.asm
@@ -0,0 +1,63 @@
+dnl  ARM/Thumb mpn_add_n.
+
+dnl  Copyright 1997, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',	r0)
+define(`up',	r1)
+define(`vp',	r2)
+define(`n',	r3)
+
+ASM_START()
+	.thumb
+PROLOGUE(mpn_add_nc)
+	push	{r4, r5, r6}
+	ldr	r6, [sp, #12]		C init carry save register
+	sub	r6, #1
+	b	L(top)
+EPILOGUE()
+PROLOGUE(mpn_add_n)
+	push	{r4, r5, r6}
+	neg	r6, n			C init carry save register
+
+L(top):	ldmia	up!, {r4}		C load next limb from S1
+	cmp	n, r6			C tricky carry restore
+	ldmia	vp!, {r5}		C load next limb from S2
+	adc	r4, r5
+	stmia	rp!, {r4}		C store result limb to RES
+	sbc	r6, r6			C save negated carry
+	sub	n, #1
+	bne	L(top)
+
+	add	r0, r6, #1
+	pop	{r4, r5, r6}
+	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/thumb/sub_n.asm b/third_party/gmp/mpn/thumb/sub_n.asm
new file mode 100644
index 0000000..a385720
--- /dev/null
+++ b/third_party/gmp/mpn/thumb/sub_n.asm
@@ -0,0 +1,63 @@
+dnl  ARM/Thumb mpn_sub_n.
+
+dnl  Copyright 1997, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rp',	r0)
+define(`up',	r1)
+define(`vp',	r2)
+define(`n',	r3)
+
+ASM_START()
+	.thumb
+PROLOGUE(mpn_sub_nc)
+	push	{r4, r5, r6}
+	ldr	r6, [sp, #12]		C init carry save register
+	neg	r6, r6
+	b	L(top)
+EPILOGUE()
+PROLOGUE(mpn_sub_n)
+	push	{r4, r5, r6}
+	mov	r6, n			C init carry save register
+
+L(top):	ldmia	up!, {r4}		C load next limb from S1
+	cmp	n, r6			C tricky carry restore
+	ldmia	vp!, {r5}		C load next limb from S2
+	sbc	r4, r5
+	stmia	rp!, {r4}		C store result limb to RES
+	sbc	r6, r6			C save negated carry
+	sub	n, #1
+	bne	L(top)
+
+	neg	r0, r6
+	pop	{r4, r5, r6}
+	bx	lr
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/add_n.asm b/third_party/gmp/mpn/vax/add_n.asm
new file mode 100644
index 0000000..0a0bf78
--- /dev/null
+++ b/third_party/gmp/mpn/vax/add_n.asm
@@ -0,0 +1,64 @@
+dnl  VAX mpn_add_n -- Add two limb vectors of the same length > 0 and store sum
+dnl  in a third limb vector.
+
+dnl  Copyright 1999, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+	.word	0x0
+	movl	16(ap), r0
+	movl	12(ap), r1
+	movl	8(ap), r2
+	movl	4(ap), r3
+	mnegl	r0, r5
+	addl2	$3, r0
+	ashl	$-2, r0, r0	C unroll loop count
+	bicl2	$-4, r5		C mask out low 2 bits
+	movaq	(r5)[r5], r5	C 9x
+	jmp	L(top)[r5]
+
+L(top):	movl	(r2)+, r4
+	adwc	(r1)+, r4
+	movl	r4, (r3)+
+	movl	(r2)+, r4
+	adwc	(r1)+, r4
+	movl	r4, (r3)+
+	movl	(r2)+, r4
+	adwc	(r1)+, r4
+	movl	r4, (r3)+
+	movl	(r2)+, r4
+	adwc	(r1)+, r4
+	movl	r4, (r3)+
+	sobgtr	r0, L(top)
+
+	adwc	r0, r0
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/addmul_1.asm b/third_party/gmp/mpn/vax/addmul_1.asm
new file mode 100644
index 0000000..8a6f636
--- /dev/null
+++ b/third_party/gmp/mpn/vax/addmul_1.asm
@@ -0,0 +1,124 @@
+dnl  VAX mpn_addmul_1 -- Multiply a limb vector with a limb and add the result
+dnl  to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	.word	0xfc0
+	movl	12(ap), r4
+	movl	8(ap), r8
+	movl	4(ap), r9
+	clrl	r3
+	incl	r4
+	ashl	$-1, r4, r7
+	clrl	r11
+	movl	16(ap), r6
+	jlss	L(v0_big)
+	jlbc	r4, L(1)
+
+C Loop for v0 < 0x80000000
+L(tp1):	movl	(r8)+, r1
+	jlss	L(1n0)
+	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	$0, r3
+	addl2	r2, (r9)+
+	adwc	$0, r3
+L(1):	movl	(r8)+, r1
+	jlss	L(1n1)
+L(1p1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	$0, r11
+	addl2	r10, (r9)+
+	adwc	$0, r11
+
+	sobgtr	r7, L(tp1)
+	movl	r11, r0
+	ret
+
+L(1n0):	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r6, r3
+	addl2	r2, (r9)+
+	adwc	$0, r3
+	movl	(r8)+, r1
+	jgeq	L(1p1)
+L(1n1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r6, r11
+	addl2	r10, (r9)+
+	adwc	$0, r11
+
+	sobgtr	r7, L(tp1)
+	movl	r11, r0
+	ret
+
+L(v0_big):
+	jlbc	r4, L(2)
+
+C Loop for v0 >= 0x80000000
+L(tp2):	movl	(r8)+, r1
+	jlss	L(2n0)
+	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r1, r3
+	addl2	r2, (r9)+
+	adwc	$0, r3
+L(2):	movl	(r8)+, r1
+	jlss	L(2n1)
+L(2p1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r1, r11
+	addl2	r10, (r9)+
+	adwc	$0, r11
+
+	sobgtr	r7, L(tp2)
+	movl	r11, r0
+	ret
+
+L(2n0):	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r6, r3
+	addl2	r2, (r9)+
+	adwc	r1, r3
+	movl	(r8)+, r1
+	jgeq	L(2p1)
+L(2n1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r6, r11
+	addl2	r10, (r9)+
+	adwc	r1, r11
+
+	sobgtr	r7, L(tp2)
+	movl	r11, r0
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/elf.m4 b/third_party/gmp/mpn/vax/elf.m4
new file mode 100644
index 0000000..e04f0ba
--- /dev/null
+++ b/third_party/gmp/mpn/vax/elf.m4
@@ -0,0 +1,54 @@
+divert(-1)
+
+dnl  m4 macros for VAX assembler.
+
+dnl  Copyright 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+defreg(r0,`%r``''0')
+defreg(r1,`%r``''1')
+defreg(r2,`%r``''2')
+defreg(r3,`%r``''3')
+defreg(r4,`%r``''4')
+defreg(r5,`%r``''5')
+defreg(r6,`%r``''6')
+defreg(r7,`%r``''7')
+defreg(r8,`%r``''8')
+defreg(r9,`%r``''9')
+defreg(r10,`%r``''10')
+defreg(r11,`%r``''11')
+defreg(r12,`%r``''12')
+defreg(r13,`%r``''13')
+defreg(r14,`%r``''14')
+defreg(r15,`%r``''15')
+defreg(ap,`%a``''p')
+
+define(`foo', blablabla)
+
+divert
diff --git a/third_party/gmp/mpn/vax/gmp-mparam.h b/third_party/gmp/mpn/vax/gmp-mparam.h
new file mode 100644
index 0000000..9f20b9b
--- /dev/null
+++ b/third_party/gmp/mpn/vax/gmp-mparam.h
@@ -0,0 +1,60 @@
+/* VAX gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/* These numbers were measured manually using the tune/speed program.
+   The standard tune/tuneup takes too long.  (VAX 8800) */
+
+#define MUL_TOOM22_THRESHOLD             14
+#define MUL_TOOM33_THRESHOLD            110
+
+#define SQR_BASECASE_THRESHOLD            6
+#define SQR_TOOM2_THRESHOLD              42
+#define SQR_TOOM3_THRESHOLD             250
+
+/* #define DIV_SB_PREINV_THRESHOLD         */
+/* #define DIV_DC_THRESHOLD                */
+/* #define POWM_THRESHOLD                  */
+
+/* #define GCD_ACCEL_THRESHOLD             */
+/* #define JACOBI_BASE_METHOD              */
+
+/* #define DIVREM_1_NORM_THRESHOLD         */
+/* #define DIVREM_1_UNNORM_THRESHOLD       */
+/* #define MOD_1_NORM_THRESHOLD            */
+/* #define MOD_1_UNNORM_THRESHOLD          */
+/* #define USE_PREINV_DIVREM_1             */
+/* #define USE_PREINV_MOD_1                */
+/* #define DIVREM_2_THRESHOLD              */
+/* #define DIVEXACT_1_THRESHOLD            */
+/* #define MODEXACT_1_ODD_THRESHOLD        */
+
+/* #define GET_STR_DC_THRESHOLD            */
+/* #define GET_STR_PRECOMPUTE_THRESHOLD    */
+#define SET_STR_THRESHOLD              3400
diff --git a/third_party/gmp/mpn/vax/lshift.asm b/third_party/gmp/mpn/vax/lshift.asm
new file mode 100644
index 0000000..941e999
--- /dev/null
+++ b/third_party/gmp/mpn/vax/lshift.asm
@@ -0,0 +1,59 @@
+dnl  VAX mpn_lshift -- left shift.
+
+dnl  Copyright 1999-2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+	.word	0x1c0
+	movl	4(ap), r7
+	movl	8(ap), r6
+	movl	12(ap), r1
+	movl	16(ap), r8
+
+	moval	(r6)[r1], r6
+	moval	(r7)[r1], r7
+	clrl	r3
+	movl	-(r6), r2
+	ashq	r8, r2, r4
+	movl	r5, r0
+	movl	r2, r3
+	decl	r1
+	jeql	L(end)
+
+L(top):	movl	-(r6), r2
+	ashq	r8, r2, r4
+	movl	r5, -(r7)
+	movl	r2, r3
+	sobgtr	r1, L(top)
+
+L(end):	movl	r4, -4(r7)
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/mul_1.asm b/third_party/gmp/mpn/vax/mul_1.asm
new file mode 100644
index 0000000..8e4dcd2
--- /dev/null
+++ b/third_party/gmp/mpn/vax/mul_1.asm
@@ -0,0 +1,118 @@
+dnl  VAX mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+dnl  in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	.word	0xfc0
+	movl	12(ap), r4
+	movl	8(ap), r8
+	movl	4(ap), r9
+	clrl	r3
+	incl	r4
+	ashl	$-1, r4, r7
+	clrl	r11
+	movl	16(ap), r6
+	jlss	L(v0_big)
+	jlbc	r4, L(1)
+
+C Loop for v0 < 0x80000000
+L(tp1):	movl	(r8)+, r1
+	jlss	L(1n0)
+	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	$0, r3
+	movl	r2, (r9)+
+L(1):	movl	(r8)+, r1
+	jlss	L(1n1)
+L(1p1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	$0, r11
+	movl	r10, (r9)+
+
+	sobgtr	r7, L(tp1)
+	movl	r11, r0
+	ret
+
+L(1n0):	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r6, r3
+	movl	r2, (r9)+
+	movl	(r8)+, r1
+	jgeq	L(1p1)
+L(1n1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r6, r11
+	movl	r10, (r9)+
+
+	sobgtr	r7, L(tp1)
+	movl	r11, r0
+	ret
+
+L(v0_big):
+	jlbc	r4, L(2)
+
+C Loop for v0 >= 0x80000000
+L(tp2):	movl	(r8)+, r1
+	jlss	L(2n0)
+	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r1, r3
+	movl	r2, (r9)+
+L(2):	movl	(r8)+, r1
+	jlss	L(2n1)
+L(2p1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r1, r11
+	movl	r10, (r9)+
+
+	sobgtr	r7, L(tp2)
+	movl	r11, r0
+	ret
+
+L(2n0):	emul	r1, r6, $0, r2
+	addl2	r1, r3
+	addl2	r11, r2
+	adwc	r6, r3
+	movl	r2, (r9)+
+	movl	(r8)+, r1
+	jgeq	L(2p1)
+L(2n1):	emul	r1, r6, $0, r10
+	addl2	r1, r11
+	addl2	r3, r10
+	adwc	r6, r11
+	movl	r10, (r9)+
+
+	sobgtr	r7, L(tp2)
+	movl	r11, r0
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/rshift.asm b/third_party/gmp/mpn/vax/rshift.asm
new file mode 100644
index 0000000..00b2daa
--- /dev/null
+++ b/third_party/gmp/mpn/vax/rshift.asm
@@ -0,0 +1,57 @@
+dnl  VAX mpn_rshift -- right shift.
+
+dnl  Copyright 1999-2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	.word	0x1c0
+	movl	4(ap), r7
+	movl	8(ap), r6
+	movl	12(ap), r1
+	movl	16(ap), r8
+
+	movl	(r6)+, r2
+	subl3	r8, $32, r8
+	ashl	r8, r2, r0
+	decl	r1
+	jeql	L(end)
+
+L(top):	movl	(r6)+, r3
+	ashq	r8, r2, r4
+	movl	r5, (r7)+
+	movl	r3, r2
+	sobgtr	r1, L(top)
+
+L(end):	clrl	r3
+	ashq	r8, r2, r4
+	movl	r5, (r7)
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/sub_n.asm b/third_party/gmp/mpn/vax/sub_n.asm
new file mode 100644
index 0000000..2844ef2
--- /dev/null
+++ b/third_party/gmp/mpn/vax/sub_n.asm
@@ -0,0 +1,64 @@
+dnl  VAX mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl  store difference in a third limb vector.
+
+dnl  Copyright 1999, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+	.word	0x0
+	movl	16(ap), r0
+	movl	12(ap), r1
+	movl	8(ap), r2
+	movl	4(ap), r3
+	mnegl	r0, r5
+	addl2	$3, r0
+	ashl	$-2, r0, r0	C unroll loop count
+	bicl2	$-4, r5		C mask out low 2 bits
+	movaq	(r5)[r5], r5	C 9x
+	jmp	L(top)[r5]
+
+L(top):	movl	(r2)+, r4
+	sbwc	(r1)+, r4
+	movl	r4, (r3)+
+	movl	(r2)+, r4
+	sbwc	(r1)+, r4
+	movl	r4, (r3)+
+	movl	(r2)+, r4
+	sbwc	(r1)+, r4
+	movl	r4, (r3)+
+	movl	(r2)+, r4
+	sbwc	(r1)+, r4
+	movl	r4, (r3)+
+	sobgtr	r0, L(top)
+
+	adwc	r0, r0
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/vax/submul_1.asm b/third_party/gmp/mpn/vax/submul_1.asm
new file mode 100644
index 0000000..60d47fc
--- /dev/null
+++ b/third_party/gmp/mpn/vax/submul_1.asm
@@ -0,0 +1,124 @@
+dnl  VAX mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl  result from a second limb vector.
+
+dnl  Copyright 1992, 1994, 1996, 2000, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	.word	0xfc0
+	movl	12(ap), r4
+	movl	8(ap), r8
+	movl	4(ap), r9
+	clrl	r3
+	incl	r4
+	ashl	$-1, r4, r7
+	clrl	r11
+	movl	16(ap), r6
+	jlss	L(v0_big)
+	jlbc	r4, L(1)
+
+C Loop for v0 < 0x80000000
+L(tp1):	movl	(r8)+, r1
+	jlss	L(1n0)
+	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	$0, r3
+	subl2	r2, (r9)+
+	adwc	$0, r3
+L(1):	movl	(r8)+, r1
+	jlss	L(1n1)
+L(1p1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	$0, r11
+	subl2	r10, (r9)+
+	adwc	$0, r11
+
+	sobgtr	r7, L(tp1)
+	movl	r11, r0
+	ret
+
+L(1n0):	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r6, r3
+	subl2	r2, (r9)+
+	adwc	$0, r3
+	movl	(r8)+, r1
+	jgeq	L(1p1)
+L(1n1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r6, r11
+	subl2	r10, (r9)+
+	adwc	$0, r11
+
+	sobgtr	r7, L(tp1)
+	movl	r11, r0
+	ret
+
+L(v0_big):
+	jlbc	r4, L(2)
+
+C Loop for v0 >= 0x80000000
+L(tp2):	movl	(r8)+, r1
+	jlss	L(2n0)
+	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r1, r3
+	subl2	r2, (r9)+
+	adwc	$0, r3
+L(2):	movl	(r8)+, r1
+	jlss	L(2n1)
+L(2p1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r1, r11
+	subl2	r10, (r9)+
+	adwc	$0, r11
+
+	sobgtr	r7, L(tp2)
+	movl	r11, r0
+	ret
+
+L(2n0):	emul	r1, r6, $0, r2
+	addl2	r11, r2
+	adwc	r6, r3
+	subl2	r2, (r9)+
+	adwc	r1, r3
+	movl	(r8)+, r1
+	jgeq	L(2p1)
+L(2n1):	emul	r1, r6, $0, r10
+	addl2	r3, r10
+	adwc	r6, r11
+	subl2	r10, (r9)+
+	adwc	r1, r11
+
+	sobgtr	r7, L(tp2)
+	movl	r11, r0
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/README b/third_party/gmp/mpn/x86/README
new file mode 100644
index 0000000..8d7ac90
--- /dev/null
+++ b/third_party/gmp/mpn/x86/README
@@ -0,0 +1,525 @@
+Copyright 1999-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                      X86 MPN SUBROUTINES
+
+
+This directory contains mpn functions for various 80x86 chips.
+
+
+CODE ORGANIZATION
+
+	x86               i386, generic
+	x86/i486          i486
+	x86/pentium       Intel Pentium (P5, P54)
+	x86/pentium/mmx   Intel Pentium with MMX (P55)
+	x86/p6            Intel Pentium Pro
+	x86/p6/mmx        Intel Pentium II, III
+	x86/p6/p3mmx      Intel Pentium III
+	x86/k6            \ AMD K6
+	x86/k6/mmx        /
+	x86/k6/k62mmx     AMD K6-2
+	x86/k7            \ AMD Athlon
+	x86/k7/mmx        /
+	x86/pentium4      \
+	x86/pentium4/mmx  | Intel Pentium 4
+	x86/pentium4/sse2 /
+
+
+The top-level x86 directory contains blended style code, meant to be
+reasonable on all x86s.
+
+
+
+STATUS
+
+The code is well-optimized for AMD and Intel chips, but there's nothing
+specific for Cyrix chips, nor for actual 80386 and 80486 chips.
+
+
+
+ASM FILES
+
+The x86 .asm files are BSD style assembler code, first put through m4 for
+macro processing.  The generic mpn/asm-defs.m4 is used, together with
+mpn/x86/x86-defs.m4.  See comments in those files.
+
+The code is meant for use with GNU "gas" or a system "as".  There's no
+support for assemblers that demand Intel style code.
+
+
+
+STACK FRAME
+
+m4 macros are used to define the parameters passed on the stack, and these
+act like comments on what the stack frame looks like too.  For example,
+mpn_mul_1() has the following.
+
+        defframe(PARAM_MULTIPLIER, 16)
+        defframe(PARAM_SIZE,       12)
+        defframe(PARAM_SRC,         8)
+        defframe(PARAM_DST,         4)
+
+PARAM_MULTIPLIER becomes `FRAME+16(%esp)', and the others similarly.  The
+return address is at offset 0, but there's not normally any need to access
+that.
+
+FRAME is redefined as necessary through the code so it's the number of bytes
+pushed on the stack, and hence the offsets in the parameter macros stay
+correct.  At the start of a routine FRAME should be zero.
+
+        deflit(`FRAME',0)
+	...
+	deflit(`FRAME',4)
+	...
+	deflit(`FRAME',8)
+	...
+
+Helper macros FRAME_pushl(), FRAME_popl(), FRAME_addl_esp() and
+FRAME_subl_esp() exist to adjust FRAME for the effect of those instructions,
+and can be used instead of explicit definitions if preferred.
+defframe_pushl() is a combination FRAME_pushl() and defframe().
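+
+For example (a sketch, not a fixed rule of the sources), the adjustment can
+sit on the same line as the push it accounts for,
+
+	pushl	%ebx	FRAME_pushl()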
+
+There's generally some slackness in redefining FRAME.  If new values aren't
+going to get used then the redefinitions are omitted to keep from cluttering
+up the code.  This happens for instance at the end of a routine, where there
+might be just four pops and then a ret, so FRAME isn't getting used.
+
+Local variables and saved registers can be similarly defined, with negative
+offsets representing stack space below the initial stack pointer.  For
+example,
+
+	defframe(SAVE_ESI,   -4)
+	defframe(SAVE_EDI,   -8)
+	defframe(VAR_COUNTER,-12)
+
+	deflit(STACK_SPACE, 12)
+
+Here STACK_SPACE gets used in a "subl $STACK_SPACE, %esp" to allocate the
+space, and that instruction must be followed by a redefinition of FRAME
+(setting it equal to STACK_SPACE) to reflect the change in %esp.
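+
+Putting these pieces together, a prologue using that space might look like
+the following (a sketch assembled from the definitions above),
+
+	deflit(`FRAME',0)
+	subl	$STACK_SPACE, %esp
+	deflit(`FRAME',STACK_SPACE)
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI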
+
+Definitions for pushed registers are only put in when they're going to be
+used.  If registers are just saved and restored with pushes and pops then
+definitions aren't made.
+
+
+
+ASSEMBLER EXPRESSIONS
+
+Only addition and subtraction seem to be universally available; certainly
+that's all the Solaris 8 "as" seems to accept.  If expressions are wanted
+then m4 eval() should be used.
+
+In particular note that a "/" anywhere in a line starts a comment in Solaris
+"as", and in some configurations of gas too.
+
+	addl	$32/2, %eax           <-- wrong
+
+	addl	$eval(32/2), %eax     <-- right
+
+Binutils gas/config/tc-i386.c has a choice between "/" being a comment
+anywhere in a line, or only at the start.  FreeBSD patches 2.9.1 to select
+the latter, and from 2.9.5 it's the default for GNU/Linux too.
+
+
+
+ASSEMBLER COMMENTS
+
+Solaris "as" doesn't support "#" commenting, using /* */ instead.  For that
+reason "C" commenting is used (see asm-defs.m4) and the intermediate ".s"
+files have no comments.
+
+Any comments before include(`../config.m4') must use m4 "dnl", since it's
+only after the include that "C" is available.  By convention "dnl" is also
+used for comments about m4 macros.
+
+
+
+TEMPORARY LABELS
+
+Temporary numbered labels like "1:" used as "1f" or "1b" are available in
+"gas" and Solaris "as", but not in SCO "as".  Normal L() labels should be
+used instead, possibly with a counter to make them unique, see jadcl0() in
+x86-defs.m4 for instance.  A separate counter for each macro makes it
+possible to nest them, for instance movl_text_address() can be used within
+an ASSERT().
+
+"1:" etc must be avoided in gcc __asm__ blocks too.  "%=" for generating a
+unique number looks like a good alternative, but is that actually a
+documented feature?  In any case this problem doesn't currently arise.
+
+
+
+ZERO DISPLACEMENTS
+
+In a couple of places addressing modes like 0(%ebx) with a byte-sized zero
+displacement are wanted, rather than (%ebx) with no displacement.  These are
+either for computed jumps or to get desirable code alignment.  Explicit
+.byte sequences are used to ensure the assembler doesn't turn 0(%ebx) into
+(%ebx).  The Zdisp() macro in x86-defs.m4 is used for this.
+
+Current gas 2.9.5 or recent 2.9.1 leave 0(%ebx) as written, but old gas
+1.92.3 changes it.  In general changing would be the sort of "optimization"
+an assembler might perform, hence explicit ".byte"s are used where
+necessary.
+
+
+
+SHLD/SHRD INSTRUCTIONS
+
+The %cl count forms of double shift instructions like "shldl %cl,%eax,%ebx"
+must be written "shldl %eax,%ebx" for some assemblers.  gas takes either,
+Solaris "as" doesn't allow %cl, gcc generates %cl for gas and NeXT (which is
+gas), and omits %cl elsewhere.
+
+For GMP an autoconf test GMP_ASM_X86_SHLDL_CL is used to determine whether
+%cl should be used, and the macros shldl, shrdl, shldw and shrdw in
+mpn/x86/x86-defs.m4 pass through or omit %cl as necessary.  See the comments
+with those macros for usage.
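+
+For example, following the macro descriptions above, a double shift written
+
+	shldl(	%cl, %edx, %eax)
+
+comes out as "shldl %cl, %edx, %eax" or "shldl %edx, %eax" according to the
+configure test.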
+
+
+
+IMUL INSTRUCTION
+
+GCC config/i386/i386.md (cvs rev 1.187, 21 Oct 00) under *mulsi3_1 notes
+that the following two forms produce identical object code
+
+	imul	$12, %eax
+	imul	$12, %eax, %eax
+
+but that the former isn't accepted by some assemblers, in particular the SCO
+OSR5 COFF assembler.  GMP follows GCC and uses only the latter form.
+
+(This applies only to immediate operands; the three operand form is only
+valid with an immediate.)
+
+
+
+DIRECTION FLAG
+
+The x86 calling conventions say that the direction flag should be clear at
+function entry and exit.  (See iBCS2 and SVR4 ABI books, references below.)
+Although this has been so since the year dot, it's not absolutely clear
+whether it's universally respected.  Since it's better to be safe than
+sorry, GMP follows glibc and does a "cld" if it depends on the direction
+flag being clear.  This happens only in a few places.
+
+
+
+POSITION INDEPENDENT CODE
+
+  Coding Style
+
+    Defining the symbol PIC in m4 processing selects SVR4 / ELF style
+    position independent code.  This is necessary for shared libraries
+    because they can be mapped into different processes at different virtual
+    addresses.  Actually, relocations are allowed but text pages with
+    relocations aren't shared, defeating the purpose of a shared library.
+
+    The GOT is used to access global data, and the PLT is used for
+    functions.  The use of the PLT adds a fixed cost to every function call,
+    and the GOT adds a cost to any function accessing global variables.
+    These are small but might be noticeable when working with small
+    operands.
+
+  Scope
+
+    It's intended, as a matter of policy, that references within libgmp are
+    resolved within libgmp.  Certainly there's no need for an application to
+    replace any internals, and we take the view that there's no value in an
+    application subverting anything documented either.
+
+    Resolving references within libgmp in theory means calls can be made with a
+    plain PC-relative call instruction, which is faster and smaller than going
+    through the PLT, and data references can be similarly PC-relative, saving a
+    GOT entry and fetch from there.  Unfortunately the normal linker behaviour
+    doesn't allow us to do this.
+
+    By default an R_386_PC32 PC-relative reference, either for a call or for
+    data, is left in libgmp.so by the linker so that it can be resolved at
+    runtime to a location in the application or another shared library.  This
+    means a text segment relocation which we don't want.
+
+  -Bsymbolic
+
+    Under the "-Bsymbolic" option, the linker resolves references to symbols
+    within libgmp.so.  This gives us the desired effect for R_386_PC32,
+    ie. it's resolved at link time.  It also resolves R_386_PLT32 calls
+    directly to their target without creating a PLT entry (though if this is
+    done to normal compiler-generated code it still leaves a setup of %ebx
+    to _GLOBAL_OFFSET_TABLE_ which may then be unnecessary).
+
+    Unfortunately -Bsymbolic does bad things to global variables defined in
+    a shared library but accessed by non-PIC code from the mainline (or a
+    static library).
+
+    The problem is that the mainline needs a fixed data address to avoid
+    text segment relocations, so space is allocated in its data segment and
+    the value from the variable is copied from the shared library's data
+    segment when the library is loaded.  Under -Bsymbolic, however,
+    references in the shared library are still resolved to the shared
+    library data area.  Not surprisingly it bombs badly to have mainline
+    code and library code accessing different locations for what should be
+    one variable.
+
+    Note that this -Bsymbolic effect for the shared library is not just for
+    R_386_PC32 offsets which might have been cooked up in assembler, but is
+    done also for the contents of GOT entries.  -Bsymbolic simply applies a
+    general rule that symbols are resolved first from the local module.
+
+  Visibility Attributes
+
+    GCC __attribute__ ((visibility ("protected"))), which is available in
+    recent versions, eg. 3.3, is probably what we'd like to use.  It makes
+    gcc generate plain PC-relative calls to indicated functions, and directs
+    the linker to resolve references to the given function within the link
+    module.
+
+    Unfortunately, as of debian binutils 2.13.90.0.16 at least, the
+    resulting libgmp.so comes out with text segment relocations, references
+    are not resolved at link time.  If the gcc description is to be believed
+this is not how it should work.  If a symbol cannot be overridden
+    by another module then surely references within that module can be
+    resolved immediately (ie. at link time).
+
+  Present
+
+    In any case, all this means that we have no optimizations we can
+    usefully make to function or variable usages, neither for assembler nor
+    C code.  Perhaps in the future the visibility attribute will work as
+    we'd like.
+
+
+
+
+GLOBAL OFFSET TABLE
+
+The magic _GLOBAL_OFFSET_TABLE_ used by code establishing the address of the
+GOT sometimes requires an extra underscore prefix.  SVR4 systems and NetBSD
+don't need a prefix, OpenBSD does need one.  Note that NetBSD and OpenBSD
+are both a.out underscore systems, so the prefix for _GLOBAL_OFFSET_TABLE_
+is not simply the same as the prefix for ordinary globals.
+
+In any case in the asm code we write _GLOBAL_OFFSET_TABLE_ and let a macro
+in x86-defs.m4 add an extra underscore if required (according to a configure
+test).
+
+Old gas 1.92.3 which comes with FreeBSD 2.2.8 gets a segmentation fault when
+asked to assemble the following,
+
+        L1:
+            addl  $_GLOBAL_OFFSET_TABLE_+[.-L1], %ebx
+
+It seems that using the label in the same instruction it refers to is the
+problem, since a nop in between works.  But the simplest workaround is to
+follow gcc and omit the +[.-L1] since it does nothing,
+
+            addl  $_GLOBAL_OFFSET_TABLE_, %ebx
+
+Current gas 2.10 generates incorrect object code when %eax is used in such a
+construction (with or without +[.-L1]),
+
+            addl  $_GLOBAL_OFFSET_TABLE_, %eax
+
+The R_386_GOTPC gets a displacement of 2 rather than the 1 appropriate for
+the 1 byte opcode of "addl $n,%eax".  The best workaround is just to use any
+other register, since then it's a two byte opcode+mod/rm.  GCC for example
+always uses %ebx (which is needed for calls through the PLT).
+
+A similar problem occurs in an leal (again with or without a +[.-L1]),
+
+            leal  _GLOBAL_OFFSET_TABLE_(%edi), %ebx
+
+This time the R_386_GOTPC gets a displacement of 0 rather than the 2
+appropriate for the opcode and mod/rm, making this form unusable.
+
+
+
+
+SIMPLE LOOPS
+
+The overheads in setting up for an unrolled loop can mean that at small
+sizes a simple loop is faster.  Making small sizes go fast is important,
+even if it adds a cycle or two to bigger sizes.  To this end various
+routines choose between a simple loop and an unrolled loop according to
+operand size.  The path to the simple loop, or to special case code for
+small sizes, is always as fast as possible.
+
+Adding a simple loop requires a conditional jump to choose between the
+simple and unrolled code.  The size of a branch misprediction penalty
+affects whether a simple loop is worthwhile.
+
+The convention is for an m4 definition UNROLL_THRESHOLD to set the crossover
+point, with sizes < UNROLL_THRESHOLD using the simple loop, sizes >=
+UNROLL_THRESHOLD using the unrolled loop.  If position independent code adds
+a couple of cycles to an unrolled loop setup, the threshold will vary with
+PIC or non-PIC.  Something like the following is typical.
+
+	deflit(UNROLL_THRESHOLD, ifdef(`PIC',10,8))
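+
+The choice itself is then just a compare and branch, along the following
+lines (a sketch only, with the size assumed in %ecx and an arbitrary
+label),
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	jae	L(unroll)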
+
+There's no automated way to determine the threshold.  Setting it to a small
+value and then to a big value makes it possible to measure the simple and
+unrolled loops each over a range of sizes, from which the crossover point
+can be determined.  Alternatively, just adjust the threshold up or down
+until there are no more speedups.
+
+
+
+UNROLLED LOOP CODING
+
+The x86 addressing modes allow a byte displacement of -128 to +127, making
+it possible to access 256 bytes, which is 64 limbs, without adjusting
+pointer registers within the loop.  Dword sized displacements can be used
+too, but they increase code size, and unrolling to 64 ought to be enough.
+
+When unrolling to the full 64 limbs/loop, the limb at the top of the loop
+will have a displacement of -128, so pointers have to have a corresponding
++128 added before entering the loop.  When unrolling to 32 limbs/loop
+displacements 0 to 127 can be used with 0 at the top of the loop and no
+adjustment needed to the pointers.
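+
+For instance the bias and the first limb access might look as follows (a
+sketch only),
+
+	addl	$128, %esi		C bias src pointer for 64 limbs/loop
+	...
+top:	movl	-128(%esi), %eax	C first limb, at displacement -128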
+
+Where 64 limbs/loop is supported, the +128 adjustment is done only when 64
+limbs/loop is selected.  Usually the gain in speed using 64 instead of 32 or
+16 is small, so support for 64 limbs/loop is generally only for comparison.
+
+
+
+COMPUTED JUMPS
+
+When working from least significant limb to most significant limb (most
+routines) the computed jump and pointer calculations in preparation for an
+unrolled loop are as follows.
+
+	S = operand size in limbs
+	N = number of limbs per loop (UNROLL_COUNT)
+	L = log2 of unrolling (UNROLL_LOG2)
+	M = mask for unrolling (UNROLL_MASK)
+	C = code bytes per limb in the loop
+	B = bytes per limb (4 for x86)
+
+	computed jump            (-S & M) * C + entrypoint
+	subtract from pointers   (-S & M) * B
+	initial loop counter     (S-1) >> L
+	displacements            0 to B*(N-1)
+
+The loop counter is decremented at the end of each loop, and the looping
+stops when the decrement takes the counter to -1.  The displacements are
+for the addressing mode accessing each limb, eg. a load with
+"movl disp(%ebx), %eax".
+
+Usually the multiply by "C" can be handled without an imul, using instead an
+leal, or a shift and subtract.
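+
+For instance with N=8, and hence C=9 code bytes per limb as in
+mpn/x86/aors_n.asm, the non-PIC preparation might look as follows (a
+sketch only, "entrypoint" being the address jumped to for a multiple of
+8 limbs),
+
+	movl	size, %eax
+	negl	%eax
+	andl	$7, %eax			C -S & M
+	leal	entrypoint(%eax,%eax,8), %eax	C times C=9, plus entrypoint
+	jmp	*%eax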
+
+When working from most significant to least significant limb (eg. mpn_lshift
+and mpn_copyd), the calculations change as follows.
+
+	add to pointers          (-S & M) * B
+	displacements            0 to -B*(N-1)
+
+
+
+OLD GAS 1.92.3
+
+This version comes with FreeBSD 2.2.8 and has a couple of gremlins that
+affect GMP code.
+
+Firstly, an expression involving two forward references to labels comes out
+as zero.  For example,
+
+		addl	$bar-foo, %eax
+	foo:
+		nop
+	bar:
+
+This should lead to "addl $1, %eax", but it comes out as "addl $0, %eax".
+When only one forward reference is involved, it works correctly, as for
+example,
+
+	foo:
+		addl	$bar-foo, %eax
+		nop
+	bar:
+
+Secondly, an expression involving two labels can't be used as the
+displacement for an leal.  For example,
+
+	foo:
+		nop
+	bar:
+		leal	bar-foo(%eax,%ebx,8), %ecx
+
+A slightly cryptic error is given, "Unimplemented segment type 0 in
+parse_operand".  When only one label is used it's ok, and the label can be a
+forward reference too, as for example,
+
+		leal	foo(%eax,%ebx,8), %ecx
+		nop
+	foo:
+
+These problems only affect PIC computed jump calculations.  The workarounds
+are just to do an leal without a displacement and then an addl, and to make
+sure the code is placed so that there's at most one forward reference in the
+addl.
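+
+For instance the leal example above can be rewritten,
+
+	foo:
+		nop
+	bar:
+		leal	(%eax,%ebx,8), %ecx
+		addl	$bar-foo, %ecx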
+
+
+
+REFERENCES
+
+"Intel Architecture Software Developer's Manual", volumes 1, 2a, 2b, 3a, 3b,
+2006, order numbers 253665 through 253669.  Available on-line,
+
+	ftp://download.intel.com/design/Pentium4/manuals/25366518.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366618.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366718.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366818.pdf
+	ftp://download.intel.com/design/Pentium4/manuals/25366918.pdf
+
+
+"System V Application Binary Interface", Unix System Laboratories Inc, 1992,
+published by Prentice Hall, ISBN 0-13-880410-9.  And the "Intel386 Processor
+Supplement", AT&T, 1991, ISBN 0-13-877689-X.  These have details of calling
+conventions and ELF shared library PIC coding.  Versions of both available
+on-line,
+
+	http://www.sco.com/developer/devspecs
+
+"Intel386 Family Binary Compatibility Specification 2", Intel Corporation,
+published by McGraw-Hill, 1991, ISBN 0-07-031219-2.  (Same as the above 386
+ABI supplement.)
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/aors_n.asm b/third_party/gmp/mpn/x86/aors_n.asm
new file mode 100644
index 0000000..5d359f5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/aors_n.asm
@@ -0,0 +1,202 @@
+dnl  x86 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1992, 1994-1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P5	3.375
+C P6	3.125
+C K6	3.5
+C K7	2.25
+C P4	8.75
+
+
+ifdef(`OPERATION_add_n',`
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                    mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function_nc)
+deflit(`FRAME',0)
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%edx
+	movl	PARAM_SIZE,%ecx
+
+	movl	%ecx,%eax
+	shrl	$3,%ecx			C compute count for unrolled loop
+	negl	%eax
+	andl	$7,%eax			C get index where to start loop
+	jz	L(oopgo)		C necessary special case for 0
+	incl	%ecx			C adjust loop count
+	shll	$2,%eax			C adjustment for pointers...
+	subl	%eax,%edi		C ... since they are offset ...
+	subl	%eax,%esi		C ... by a constant when we ...
+	subl	%eax,%edx		C ... enter the loop
+	shrl	$2,%eax			C restore previous value
+
+ifdef(`PIC',`
+	C Calculate start address in loop for PIC.  Due to limitations in
+	C old gas, LF(M4_function_n,oop)-L(0a)-3 cannot be put into the leal
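+	C (The call pushes the address of L(0a); the addl then fetches it
+	C from the stack, giving an absolute address at run time.)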
+	call	L(0a)
+L(0a):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$L(oop)-L(0a)-3,%eax
+	addl	$4,%esp
+',`
+	C Calculate start address in loop for non-PIC.
+	leal	L(oop)-3(%eax,%eax,8),%eax
+')
+
+	C These lines initialize carry from the 5th parameter.  Should be
+	C possible to simplify.
+	pushl	%ebp		FRAME_pushl()
+	movl	PARAM_CARRY,%ebp
+	shrl	%ebp			C shift bit 0 into carry
+	popl	%ebp		FRAME_popl()
+
+	jmp	*%eax			C jump into loop
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(M4_function_n)
+deflit(`FRAME',0)
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%edx
+	movl	PARAM_SIZE,%ecx
+
+	movl	%ecx,%eax
+	shrl	$3,%ecx			C compute count for unrolled loop
+	negl	%eax
+	andl	$7,%eax			C get index where to start loop
+	jz	L(oop)			C necessary special case for 0
+	incl	%ecx			C adjust loop count
+	shll	$2,%eax			C adjustment for pointers...
+	subl	%eax,%edi		C ... since they are offset ...
+	subl	%eax,%esi		C ... by a constant when we ...
+	subl	%eax,%edx		C ... enter the loop
+	shrl	$2,%eax			C restore previous value
+
+ifdef(`PIC',`
+	C Calculate start address in loop for PIC.  Due to limitations in
+	C some assemblers, L(oop)-L(0b)-3 cannot be put into the leal
+	call	L(0b)
+L(0b):	leal	(%eax,%eax,8),%eax
+	addl	(%esp),%eax
+	addl	$L(oop)-L(0b)-3,%eax
+	addl	$4,%esp
+',`
+	C Calculate start address in loop for non-PIC.
+	leal	L(oop)-3(%eax,%eax,8),%eax
+')
+	jmp	*%eax			C jump into loop
+
+L(oopgo):
+	pushl	%ebp		FRAME_pushl()
+	movl	PARAM_CARRY,%ebp
+	shrl	%ebp			C shift bit 0 into carry
+	popl	%ebp		FRAME_popl()
+
+	ALIGN(16)
+L(oop):	movl	(%esi),%eax
+	M4_inst	(%edx),%eax
+	movl	%eax,(%edi)
+	movl	4(%esi),%eax
+	M4_inst	4(%edx),%eax
+	movl	%eax,4(%edi)
+	movl	8(%esi),%eax
+	M4_inst	8(%edx),%eax
+	movl	%eax,8(%edi)
+	movl	12(%esi),%eax
+	M4_inst	12(%edx),%eax
+	movl	%eax,12(%edi)
+	movl	16(%esi),%eax
+	M4_inst	16(%edx),%eax
+	movl	%eax,16(%edi)
+	movl	20(%esi),%eax
+	M4_inst	20(%edx),%eax
+	movl	%eax,20(%edi)
+	movl	24(%esi),%eax
+	M4_inst	24(%edx),%eax
+	movl	%eax,24(%edi)
+	movl	28(%esi),%eax
+	M4_inst	28(%edx),%eax
+	movl	%eax,28(%edi)
+	leal	32(%edi),%edi
+	leal	32(%esi),%esi
+	leal	32(%edx),%edx
+	decl	%ecx
+	jnz	L(oop)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/aorsmul_1.asm b/third_party/gmp/mpn/x86/aorsmul_1.asm
new file mode 100644
index 0000000..54a8905
--- /dev/null
+++ b/third_party/gmp/mpn/x86/aorsmul_1.asm
@@ -0,0 +1,156 @@
+dnl  x86 __gmpn_addmul_1 (for 386 and 486) -- Multiply a limb vector with a
+dnl  limb and add the result to a second limb vector.
+
+dnl  Copyright 1992, 1994, 1997, 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				14.75
+C P6 model 0-8,10-12		 7.5
+C P6 model 9  (Banias)		 6.7
+C P6 model 13 (Dothan)		 6.75
+C P4 model 0  (Willamette)	24.0
+C P4 model 1  (?)		24.0
+C P4 model 2  (Northwood)	24.0
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom
+C AMD K6			12.5
+C AMD K7			 5.25
+C AMD K8
+C AMD K10
+
+
+ifdef(`OPERATION_addmul_1',`
+      define(M4_inst,        addl)
+      define(M4_function_1,  mpn_addmul_1)
+
+',`ifdef(`OPERATION_submul_1',`
+      define(M4_inst,        subl)
+      define(M4_function_1,  mpn_submul_1)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult);
+
+define(PARAM_MULTIPLIER, `FRAME+16(%esp)')
+define(PARAM_SIZE,       `FRAME+12(%esp)')
+define(PARAM_SRC,        `FRAME+8(%esp)')
+define(PARAM_DST,        `FRAME+4(%esp)')
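+
+C Strategy: size%4 limbs are handled one at a time in L(oop0), then the
+C rest four at a time in the unrolled L(oop), with %ebx carrying the
+C cylimb between steps.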
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ecx
+
+	xorl	%ebx,%ebx
+	andl	$3,%ecx
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	M4_inst	%eax,(%edi)
+	adcl	%edx,%ebx	C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_SIZE,%ecx
+	shrl	$2,%ecx
+	jz	L(end)
+
+	ALIGN(8)
+L(oop):	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebx,(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebp,4(%edi)
+	adcl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	M4_inst	%ebx,8(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	M4_inst	%ebp,12(%edi)
+	adcl	$0,%ebx		C propagate carry into cylimb
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oop)
+
+L(end):	movl	%ebx,%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/aorrlsh1_n.asm b/third_party/gmp/mpn/x86/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..cd1a650
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl  Intel Atom mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 31)
+
+ifdef(`OPERATION_addlsh1_n', `
+	define(M4_inst,        adc)
+	define(M4_opp,         sub)
+	define(M4_function,    mpn_addlsh1_n)
+	define(M4_function_c,  mpn_addlsh1_nc)
+',`ifdef(`OPERATION_rsblsh1_n', `
+	define(M4_inst,        sbb)
+	define(M4_opp,         add)
+	define(M4_function,    mpn_rsblsh1_n)
+	define(M4_function_c,  mpn_rsblsh1_nc)
+',`m4_error(`Need OPERATION_addlsh1_n or OPERATION_rsblsh1_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86/atom/aorrlsh2_n.asm b/third_party/gmp/mpn/x86/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..10f4419
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl  Intel Atom mpn_addlsh2_n/mpn_rsblsh2_n -- rp[] = (vp[] << 2) +- up[]
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(M4_inst,        adcl)
+	define(M4_opp,         subl)
+	define(M4_function,    mpn_addlsh2_n)
+	define(M4_function_c,  mpn_addlsh2_nc)
+',`ifdef(`OPERATION_rsblsh2_n', `
+	define(M4_inst,        sbbl)
+	define(M4_opp,         addl)
+	define(M4_function,    mpn_rsblsh2_n)
+	define(M4_function_c,  mpn_rsblsh2_nc)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_rsblsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n mpn_rsblsh2_nc)
+
+include_mpn(`x86/atom/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86/atom/aorrlshC_n.asm b/third_party/gmp/mpn/x86/atom/aorrlshC_n.asm
new file mode 100644
index 0000000..71cfe49
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorrlshC_n.asm
@@ -0,0 +1,156 @@
+dnl  Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_signed_limb_t carry);
+
+C				cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CORB,	20)
+defframe(PARAM_SIZE,	16)
+defframe(PARAM_DBLD,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+dnl  re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_DBLD')
+define(SAVE_VP,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
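+dnl  M = 2^LSH, used as the lea scale factor so that lea (%eax,%ecx,M)
+dnl  forms %eax + (%ecx << LSH) in one instruction.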
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+	movl	PARAM_CORB, %eax
+	movl	%eax, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %eax
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	xor	%eax, %eax
+	xor	%edx, %edx
+L(start_nc):
+	push	rp			FRAME_pushl()
+
+	mov	PARAM_SIZE, %ecx	C size
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	incl	%ecx			C size + 1
+	mov	PARAM_SRC, up
+	mov	vp, SAVE_VP
+	shr	%ecx			C (size+1)\2
+	mov	PARAM_DBLD, vp
+	mov	%ebp, SAVE_EBP
+	mov	%ecx, VAR_COUNT
+	jnc	L(entry)		C size odd
+
+	shr	%edx			C size even
+	mov	(vp), %ecx
+	lea	4(vp), vp
+	lea	(%eax,%ecx,M), %edx
+	mov	%ecx, %eax
+	lea	-4(up), up
+	lea	-4(rp), rp
+	jmp	L(enteven)
+
+	ALIGN(16)
+L(oop):
+	lea	(%eax,%ecx,M), %ebp
+	shr	$RSH, %ecx
+	mov	4(vp), %eax
+	shr	%edx
+	lea	8(vp), vp
+	M4_inst	(up), %ebp
+	lea	(%ecx,%eax,M), %edx
+	mov	%ebp, (rp)
+L(enteven):
+	M4_inst	4(up), %edx
+	lea	8(up), up
+	mov	%edx, 4(rp)
+	adc	%edx, %edx
+	shr	$RSH, %eax
+	lea	8(rp), rp
+L(entry):
+	mov	(vp), %ecx
+	decl	VAR_COUNT
+	jnz	L(oop)
+
+	lea	(%eax,%ecx,M), %ebp
+	shr	$RSH, %ecx
+	shr	%edx
+	mov	SAVE_VP, vp
+	M4_inst	(up), %ebp
+	mov	%ecx, %eax
+	mov	SAVE_UP, up
+	M4_inst	$0, %eax
+	mov	%ebp, (rp)
+	mov	SAVE_EBP, %ebp
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/aors_n.asm b/third_party/gmp/mpn/x86/atom/aors_n.asm
new file mode 100644
index 0000000..45ec287
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aors_n.asm
@@ -0,0 +1,159 @@
+dnl  Intel Atom mpn_add_n/mpn_sub_n -- rp[] = up[] +- vp[].
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 3
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+ifdef(`OPERATION_add_n', `
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+	define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+	define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                         mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                   mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size.  The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation.  Note values other than 1 or 0 here will lead to garbage
+C results.
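+C
+C The loop below handles two limbs per iteration, with separate entry
+C paths for odd and even sizes.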
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebx')
+define(`cy',  `%ecx')
+define(`r1',  `%ecx')
+define(`r2',  `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_n)
+	xor	cy, cy			C carry
+L(start):
+	mov	PARAM_SIZE, %eax	C size
+	mov	rp, SAVE_RP
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC1, up
+	shr	%eax			C size >> 1
+	mov	vp, SAVE_VP
+	mov	PARAM_SRC2, vp
+	jz	L(one)			C size == 1
+	jc	L(three)		C size % 2 == 1
+
+	shr	cy
+	mov	(up), r2
+	lea	4(up), up
+	lea	4(vp), vp
+	lea	-4(rp), rp
+	jmp	L(entry)
+L(one):
+	shr	cy
+	mov	(up), r1
+	jmp	L(end)
+L(three):
+	shr	cy
+	mov	(up), r1
+
+	ALIGN(16)
+L(oop):
+	M4_inst	(vp), r1
+	lea	8(up), up
+	mov	-4(up), r2
+	lea	8(vp), vp
+	mov	r1, (rp)
+L(entry):
+	M4_inst	-4(vp), r2
+	lea	8(rp), rp
+	dec	%eax
+	mov	(up), r1
+	mov	r2, -4(rp)
+	jnz	L(oop)
+
+L(end):					C %eax is zero here
+	mov	SAVE_UP, up
+	M4_inst	(vp), r1
+	mov	SAVE_VP, vp
+	mov	r1, (rp)
+	adc	%eax, %eax
+	mov	SAVE_RP, rp
+	ret
+EPILOGUE()
+
+PROLOGUE(M4_function_nc)
+	mov	PARAM_CARRY, cy		C carry
+	jmp	L(start)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/aorslshC_n.asm b/third_party/gmp/mpn/x86/atom/aorslshC_n.asm
new file mode 100644
index 0000000..75ace65
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/aorslshC_n.asm
@@ -0,0 +1,247 @@
+dnl  Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C				mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C				mp_signed_limb_t borrow);
+
+defframe(PARAM_CORB,	16)
+defframe(PARAM_SIZE,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t borrow);
+
+C if src1 == dst, _ip1 is used
+
+C					cycles/limb
+C				dst!=src1,src2	dst==src1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 7		 6
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(GPARAM_CORB,	20)
+defframe(GPARAM_SIZE,	16)
+defframe(GPARAM_SRC2,	12)
+
+dnl  re-use parameter space
+define(SAVE_EBP,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_UP,`PARAM_DST')
+
+define(M, eval(m4_lshift(1,LSH)))
+define(`rp',  `%edi')
+define(`up',  `%esi')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_ip_function_c)
+deflit(`FRAME',0)
+	movl	PARAM_CORB, %ecx
+	movl	%ecx, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %ecx
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_ip_function)
+deflit(`FRAME',0)
+
+	xor	%ecx, %ecx
+	xor	%edx, %edx
+L(start_nc):
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SIZE, %ebx	C size
+L(inplace):
+	incl	%ebx			C size + 1
+	shr	%ebx			C (size+1)\2
+	mov	%ebp, SAVE_EBP
+	jnc	L(entry)		C size odd
+
+	add	%edx, %edx		C size even
+	mov	%ecx, %ebp
+	mov	(up), %ecx
+	lea	-4(rp), rp
+	lea	(%ebp,%ecx,M), %eax
+	lea	4(up), up
+	jmp	L(enteven)
+
+	ALIGN(16)
+L(oop):
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	mov	4(up), %ecx
+	add	%edx, %edx
+	lea	8(up), up
+	M4_inst	%ebp, (rp)
+	lea	(%eax,%ecx,M), %eax
+
+L(enteven):
+	M4_inst	%eax, 4(rp)
+	lea	8(rp), rp
+
+	sbb	%edx, %edx
+	shr	$RSH, %ecx
+
+L(entry):
+	mov	(up), %eax
+	decl	%ebx
+	jnz	L(oop)
+
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	shr	%edx
+	M4_inst	%ebp, (rp)
+	mov	SAVE_UP, up
+	adc	$0, %eax
+	mov	SAVE_EBP, %ebp
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+
+PROLOGUE(M4_function_c)
+deflit(`FRAME',0)
+	movl	GPARAM_CORB, %ecx
+	movl	%ecx, %edx
+	shr	$LSH, %edx
+	andl	$1, %edx
+	M4_opp	%edx, %ecx
+	jmp	L(generic_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	xor	%ecx, %ecx
+	xor	%edx, %edx
+L(generic_nc):
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	cmp	rp, up
+	mov	%ebx, SAVE_EBX
+	jne	L(general)
+	mov	GPARAM_SIZE, %ebx	C size
+	mov	GPARAM_SRC2, up
+	jmp	L(inplace)
+
+L(general):
+	mov	GPARAM_SIZE, %eax	C size
+	mov	%ebx, SAVE_EBX
+	incl	%eax			C size + 1
+	mov	up, %ebx		C vp
+	mov	GPARAM_SRC2, up		C up
+	shr	%eax			C (size+1)\2
+	mov	%ebp, SAVE_EBP
+	mov	%eax, GPARAM_SIZE
+	jnc	L(entry2)		C size odd
+
+	add	%edx, %edx		C size even
+	mov	%ecx, %ebp
+	mov	(up), %ecx
+	lea	-4(rp), rp
+	lea	-4(%ebx), %ebx
+	lea	(%ebp,%ecx,M), %eax
+	lea	4(up), up
+	jmp	L(enteven2)
+
+	ALIGN(16)
+L(oop2):
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	mov	4(up), %ecx
+	add	%edx, %edx
+	lea	8(up), up
+	mov	(%ebx), %edx
+	M4_inst	%ebp, %edx
+	lea	(%eax,%ecx,M), %eax
+	mov	%edx, (rp)
+L(enteven2):
+	mov	4(%ebx), %edx
+	lea	8(%ebx), %ebx
+	M4_inst	%eax, %edx
+	mov	%edx, 4(rp)
+	sbb	%edx, %edx
+	shr	$RSH, %ecx
+	lea	8(rp), rp
+L(entry2):
+	mov	(up), %eax
+	decl	GPARAM_SIZE
+	jnz	L(oop2)
+
+	lea	(%ecx,%eax,M), %ebp
+	shr	$RSH, %eax
+	shr	%edx
+	mov	(%ebx), %edx
+	M4_inst	%ebp, %edx
+	mov	%edx, (rp)
+	mov	SAVE_UP, up
+	adc	$0, %eax
+	mov	SAVE_EBP, %ebp
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/bdiv_q_1.asm b/third_party/gmp/mpn/x86/atom/bdiv_q_1.asm
new file mode 100644
index 0000000..31e908e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/bdiv_q_1.asm
@@ -0,0 +1,35 @@
+dnl  Intel Atom mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel
+dnl  division by 1-limb divisor, returning quotient only.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`x86/pentium/bdiv_q_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/cnd_add_n.asm b/third_party/gmp/mpn/x86/atom/cnd_add_n.asm
new file mode 100644
index 0000000..50bf2ad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/cnd_add_n.asm
@@ -0,0 +1,113 @@
+dnl  X86 mpn_cnd_add_n optimised for Intel Atom.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 ?
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 ?
+C P4 model 3-4 (Prescott)	 ?
+C Intel Atom			 4.67
+C AMD K6			 ?
+C AMD K7			 ?
+C AMD K8			 ?
+
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+define(`n',   `%ecx')
+define(`cnd', `20(%esp)')
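+
+C The instructions marked (1) below turn cnd into a full limb mask:
+C neg sets carry iff cnd is non-zero, then sbb %eax,%eax gives -1 or 0.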
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+
+	mov	cnd, %eax		C make cnd into a mask (1)
+	mov	24(%esp), rp
+	neg	%eax			C make cnd into a mask (1)
+	mov	28(%esp), up
+	sbb	%eax, %eax		C make cnd into a mask (1)
+	mov	32(%esp), vp
+	mov	%eax, cnd		C make cnd into a mask (1)
+	mov	36(%esp), n
+
+	xor	%edx, %edx
+
+	shr	$1, n
+	jnc	L(top)
+
+	mov	0(vp), %eax
+	and	cnd, %eax
+	lea	4(vp), vp
+	add	0(up), %eax
+	lea	4(rp), rp
+	lea	4(up), up
+	sbb	%edx, %edx
+	mov	%eax, -4(rp)
+	inc	n
+	dec	n
+	je	L(end)
+
+L(top):	sbb	%edx, %edx
+	mov	0(vp), %eax
+	and	cnd, %eax
+	lea	8(vp), vp
+	lea	8(rp), rp
+	mov	-4(vp), %ebx
+	and	cnd, %ebx
+	add	%edx, %edx
+	adc	0(up), %eax
+	lea	8(up), up
+	mov	%eax, -8(rp)
+	adc	-4(up), %ebx
+	dec	n
+	mov	%ebx, -4(rp)
+	jne	L(top)
+
+L(end):	mov	$0, %eax
+	adc	%eax, %eax
+
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/cnd_sub_n.asm b/third_party/gmp/mpn/x86/atom/cnd_sub_n.asm
new file mode 100644
index 0000000..221bedc
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/cnd_sub_n.asm
@@ -0,0 +1,124 @@
+dnl  X86 mpn_cnd_sub_n optimised for Intel Atom.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 ?
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 ?
+C P4 model 3-4 (Prescott)	 ?
+C Intel Atom			 5.67
+C AMD K6			 ?
+C AMD K7			 ?
+C AMD K8			 ?
+
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+define(`n',   `%ecx')
+define(`cnd', `20(%esp)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_cnd_sub_n)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+
+	mov	cnd, %eax		C make cnd into a mask (1)
+	mov	24(%esp), rp
+	neg	%eax			C make cnd into a mask (1)
+	mov	28(%esp), up
+	sbb	%eax, %eax		C make cnd into a mask (1)
+	mov	32(%esp), vp
+	mov	%eax, cnd		C make cnd into a mask (1)
+	mov	36(%esp), n
+
+	xor	%edx, %edx
+
+	inc	n
+	shr	n
+	jnc	L(ent)
+
+	mov	0(vp), %eax
+	and	cnd, %eax
+	lea	4(vp), vp
+	mov	0(up), %edx
+	sub	%eax, %edx
+	lea	4(rp), rp
+	lea	4(up), up
+	mov	%edx, -4(rp)
+	sbb	%edx, %edx		C save cy
+
+L(ent):	mov	0(vp), %ebx
+	and	cnd, %ebx
+	add	%edx, %edx		C restore cy
+	mov	0(up), %edx
+	dec	n
+	je	L(end)
+
+L(top):	sbb	%ebx, %edx
+	mov	4(vp), %eax
+	mov	%edx, 0(rp)
+	sbb	%edx, %edx		C save cy
+	mov	8(vp), %ebx
+	lea	8(up), up
+	and	cnd, %ebx
+	and	cnd, %eax
+	add	%edx, %edx		C restore cy
+	mov	-4(up), %edx
+	lea	8(rp), rp
+	sbb	%eax, %edx
+	mov	%edx, -4(rp)
+	dec	n
+	mov	0(up), %edx
+	lea	8(vp), vp
+	jne	L(top)
+
+L(end):	sbb	%ebx, %edx
+	mov	%edx, 0(rp)
+
+	mov	$0, %eax
+	adc	%eax, %eax
+
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/dive_1.asm b/third_party/gmp/mpn/x86/atom/dive_1.asm
new file mode 100644
index 0000000..71036a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/dive_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86/pentium/dive_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/gmp-mparam.h b/third_party/gmp/mpn/x86/atom/gmp-mparam.h
new file mode 100644
index 0000000..e025bb7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* Intel Atom/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 1600 MHz Diamondville (Atom 330) */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     17
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 72.60% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           35
+
+#define DIV_1_VS_MUL_1_PERCENT             236
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               178
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     126
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     115
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                366
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    404, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159,11}, {     95,10}, {    191, 9}, {    383,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    351, 9}, {    703,10}, \
+    {    367, 9}, {    735,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    735,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    735,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1279,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    831,11}, {   1663,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1215,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1599,13}, {    895,12}, {   1791,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1407,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3839,15}, \
+    {   1023,14}, {   2047,13}, {   4223,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 158
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    368, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     15, 6}, {     31, 7}, {     17, 6}, \
+    {     35, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    159, 8}, \
+    {    319,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511, 9}, {    271,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,10}, {    303, 9}, {    607,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1599,13}, {    895,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,14}, \
+    {    767,13}, {   1663,12}, {   3455,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4351,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,15}, {   1023,14}, \
+    {   2047,13}, {   4351,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  56
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                 111
+#define SQRLO_SQR_THRESHOLD               6654
+
+#define DC_DIV_QR_THRESHOLD                 67
+#define DC_DIVAPPR_Q_THRESHOLD             252
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                172
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               250
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              276
+#define REDC_1_TO_REDC_N_THRESHOLD          68
+
+#define MU_DIV_QR_THRESHOLD               1334
+#define MU_DIVAPPR_Q_THRESHOLD            1442
+#define MUPI_DIV_QR_THRESHOLD              116
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1341
+
+#define POWM_SEC_TABLE  1,16,98,376,1259
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        23
+#define SET_STR_DC_THRESHOLD               298
+#define SET_STR_PRECOMPUTE_THRESHOLD      1037
+
+#define FAC_DSC_THRESHOLD                  171
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 3.71% faster than 1 */
+#define HGCD_THRESHOLD                     128
+#define HGCD_APPR_THRESHOLD                186
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   465
+#define GCDEXT_DC_THRESHOLD                339
+#define JACOBI_BASE_METHOD                   3  /* 2.58% faster than 2 */
+
+/* Tuneup completed successfully, took 214190 seconds */
diff --git a/third_party/gmp/mpn/x86/atom/logops_n.asm b/third_party/gmp/mpn/x86/atom/logops_n.asm
new file mode 100644
index 0000000..3cb6d73
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/logops_n.asm
@@ -0,0 +1,151 @@
+dnl  Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C				   cycles/limb
+C				op	nop	opn
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 3	 3.5	 3.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4_inst',     `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n,     , andl,    )
+M4_choose_op( andn_n,    , andl, yes)
+M4_choose_op( nand_n, yes, andl,    )
+M4_choose_op( ior_n,     ,  orl,    )
+M4_choose_op( iorn_n,    ,  orl, yes)
+M4_choose_op( nior_n, yes,  orl,    )
+M4_choose_op( xor_n,     , xorl,    )
+M4_choose_op( xnor_n, yes, xorl,    )
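+
+dnl  The second column complements the result (nand, nior, xnor), the
+dnl  fourth complements one input beforehand (andn, iorn).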
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size);
+C
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebx')
+define(`cnt', `%eax')
+define(`r1',  `%ecx')
+define(`r2',  `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function)
+	mov	PARAM_SIZE, cnt		C size
+	mov	rp, SAVE_RP
+	mov	PARAM_DST, rp
+	mov	up, SAVE_UP
+	mov	PARAM_SRC1, up
+	shr	cnt			C size >> 1
+	mov	vp, SAVE_VP
+	mov	PARAM_SRC2, vp
+	mov	(up), r1
+	jz	L(end)			C size == 1
+	jnc	L(even)			C size % 2 == 0
+
+	ALIGN(16)
+L(oop):
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(r1)')
+	M4_inst	(vp), r1
+	lea	8(up), up
+	mov	-4(up), r2
+M4post(`	notl_or_xorl_GMP_NUMB_MASK(r1)')
+	lea	8(vp), vp
+	mov	r1, (rp)
+L(entry):
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(r2)')
+	M4_inst	-4(vp), r2
+	lea	8(rp), rp
+M4post(`	notl_or_xorl_GMP_NUMB_MASK(r2)')
+	dec	cnt
+	mov	(up), r1
+	mov	r2, -4(rp)
+	jnz	L(oop)
+
+L(end):
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(r1)')
+	mov	SAVE_UP, up
+	M4_inst	(vp), r1
+M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)')
+	mov	SAVE_VP, vp
+	mov	r1, (rp)
+	mov	SAVE_RP, rp
+	ret
+
+L(even):
+	mov	r1, r2
+	lea	4(up), up
+	lea	4(vp), vp
+	lea	-4(rp), rp
+	jmp	L(entry)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/lshift.asm b/third_party/gmp/mpn/x86/atom/lshift.asm
new file mode 100644
index 0000000..f2c70dd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/lshift.asm
@@ -0,0 +1,218 @@
+dnl  Intel Atom mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C			unsigned cnt);
+
+C				  cycles/limb
+C				cnt!=1	cnt==1
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 5	 2.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`cnt',  `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_lshift)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,cnt
+	jne	L(normal)
+	cmpl	rp, up
+	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal	(up,%edx,4),%eax
+	cmpl	%eax,rp
+	jnc	L(special)		C jump if res_ptr >= s_ptr + size
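+
+C The L(special) path shifts by self-addition (add/adc), letting the
+C carry chain do the work; per the table above this is roughly twice as
+C fast on Atom, but it reads limbs in ascending order and so is safe
+C only when the destination does not overlap the source from above.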
+
+L(normal):
+	lea	-4(up,%edx,4), up
+	mov	%ebx, SAVE_EBX
+	lea	-4(rp,%edx,4), rp
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shl	%cl, %ebx
+	neg	cnt
+	shr	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	shr	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	-4(up), %edx
+	shr	%cl, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	4(rp), rp
+	lea	-4(up), up
+	jz	L(end)
+	push	%eax			FRAME_pushl()
+
+	ALIGN(8)
+L(top):	shl	%cl, %ebp
+	or	%ebp, %edx
+	shl	%cl, %ebx
+	neg	cnt
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, -4(rp)
+	shr	%cl, %eax
+	lea	-8(rp), rp
+L(lo1):	mov	-8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	lea	-8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax			FRAME_popl()
+L(end):
+	shl	%cl, %ebp
+	shl	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	mov	%edx, -4(rp)
+	mov	%ebx, -8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+
+L(special):
+deflit(`FRAME',4)
+	lea	3(%edx), %eax		C size + 3
+	dec	%edx			C size - 1
+	mov	(up), %ecx
+	shr	$2, %eax		C (size + 3) / 4
+	and	$3, %edx		C (size - 1) % 4
+	jz	L(goloop)		C jump if  size == 1 (mod 4)
+	shr	%edx
+	jnc	L(odd)			C jump if  size == 3 (mod 4)
+
+	add	%ecx, %ecx
+	lea	4(up), up
+	mov	%ecx, (rp)
+	mov	(up), %ecx
+	lea	4(rp), rp
+
+	dec	%edx
+	jnz	L(goloop)		C jump if  size == 0 (mod 4)
+L(odd):	lea	-8(up), up
+	lea	-8(rp), rp
+	jmp	L(sentry)		C reached if size == 2 or 3 (mod 4)
+
+L(sloop):
+	adc	%ecx, %ecx
+	mov	4(up), %edx
+	mov	%ecx, (rp)
+	adc	%edx, %edx
+	mov	8(up), %ecx
+	mov	%edx, 4(rp)
+L(sentry):
+	adc	%ecx, %ecx
+	mov	12(up), %edx
+	mov	%ecx, 8(rp)
+	adc	%edx, %edx
+	lea	16(up), up
+	mov	%edx, 12(rp)
+	lea	16(rp), rp
+	mov	(up), %ecx
+L(goloop):
+	decl	%eax
+	jnz	L(sloop)
+
+L(squit):
+	adc	%ecx, %ecx
+	mov	%ecx, (rp)
+	adc	%eax, %eax
+
+	mov	SAVE_UP, up
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/lshiftc.asm b/third_party/gmp/mpn/x86/atom/lshiftc.asm
new file mode 100644
index 0000000..5be53ed
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl  Intel Atom mpn_lshiftc -- mpn left shift with complement.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_lshiftc (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C			 unsigned cnt);
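+C
+C This is essentially the lshift.asm pipeline with each result limb
+C complemented (not) before it is stored; the cnt == 1 special case of
+C plain lshift is absent here.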
+
+C				cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 5.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`cnt',  `%ecx')
+
+ASM_START()
+	TEXT
+
+PROLOGUE(mpn_lshiftc)
+deflit(`FRAME',0)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+
+	lea	-4(up,%edx,4), up
+	mov	%ebx, SAVE_EBX
+	lea	-4(rp,%edx,4), rp
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shl	%cl, %ebx
+	neg	cnt
+	shr	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	not	%ebx
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	shr	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	-4(up), %edx
+	shr	%cl, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	4(rp), rp
+	lea	-4(up), up
+	jz	L(end)
+	push	%eax			FRAME_pushl()
+
+L(top):	shl	%cl, %ebp
+	or	%ebp, %edx
+	shl	%cl, %ebx
+	neg	cnt
+	not	%edx
+	mov	-4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, -4(rp)
+	shr	%cl, %eax
+	lea	-8(rp), rp
+L(lo1):	mov	-8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shr	%cl, %edx
+	not	%eax
+	lea	-8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax			FRAME_popl()
+L(end):
+	shl	%cl, %ebp
+	shl	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	not	%edx
+	not	%ebx
+	mov	%edx, -4(rp)
+	mov	%ebx, -8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/mmx/copyd.asm b/third_party/gmp/mpn/x86/atom/mmx/copyd.asm
new file mode 100644
index 0000000..b80fb03
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mmx/copyd.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
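+C Thin wrapper: MULFUNC_PROLOGUE declares the entry point this file
+C provides, and include_mpn below reuses the K7 MMX implementation,
+C which evidently also suits Atom.  The other one-line files in this
+C directory follow the same pattern.
+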
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86/k7/mmx/copyd.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mmx/copyi.asm b/third_party/gmp/mpn/x86/atom/mmx/copyi.asm
new file mode 100644
index 0000000..49b6b8d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mmx/copyi.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86/k7/mmx/copyi.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mmx/hamdist.asm b/third_party/gmp/mpn/x86/atom/mmx/hamdist.asm
new file mode 100644
index 0000000..3fe8253
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mmx/hamdist.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mod_34lsub1.asm b/third_party/gmp/mpn/x86/atom/mod_34lsub1.asm
new file mode 100644
index 0000000..6d57ba3
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mod_34lsub1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_34lsub1)
+include_mpn(`x86/p6/mod_34lsub1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/mode1o.asm b/third_party/gmp/mpn/x86/atom/mode1o.asm
new file mode 100644
index 0000000..c9ee6bd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/mode1o.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_modexact_1_odd mpn_modexact_1c_odd)
+include_mpn(`x86/pentium/mode1o.asm')
diff --git a/third_party/gmp/mpn/x86/atom/rshift.asm b/third_party/gmp/mpn/x86/atom/rshift.asm
new file mode 100644
index 0000000..1cb5dbe
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/rshift.asm
@@ -0,0 +1,152 @@
+dnl  Intel Atom mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Converted from AMD64 by Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C			unsigned cnt);
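+C
+C Essentially the mirror image of the lshift.asm pipeline: limbs are
+C processed from low addresses upwards and the return value is the
+C bits shifted out of the low limb.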
+
+C				cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CNT, 16)
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_UP,`PARAM_CNT')
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`cnt',  `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+deflit(`FRAME',0)
+PROLOGUE(mpn_rshift)
+	mov	PARAM_CNT, cnt
+	mov	PARAM_SIZE, %edx
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+	mov	%ebx, SAVE_EBX
+
+	shr	%edx
+	mov	(up), %eax
+	mov	%edx, VAR_COUNT
+	jnc	L(evn)
+
+	mov	%eax, %ebx
+	shr	%cl, %ebx
+	neg	cnt
+	shl	%cl, %eax
+	test	%edx, %edx
+	jnz	L(gt1)
+	mov	%ebx, (rp)
+	jmp	L(quit)
+
+L(gt1):	mov	%ebp, SAVE_EBP
+	push	%eax
+	mov	4(up), %eax
+	mov	%eax, %ebp
+	shl	%cl, %eax
+	jmp	L(lo1)
+
+L(evn):	mov	%ebp, SAVE_EBP
+	neg	cnt
+	mov	%eax, %ebp
+	mov	4(up), %edx
+	shl	%cl, %eax
+	mov	%edx, %ebx
+	shl	%cl, %edx
+	neg	cnt
+	decl	VAR_COUNT
+	lea	-4(rp), rp
+	lea	4(up), up
+	jz	L(end)
+	push	%eax			FRAME_pushl()
+
+	ALIGN(8)
+L(top):	shr	%cl, %ebp
+	or	%ebp, %edx
+	shr	%cl, %ebx
+	neg	cnt
+	mov	4(up), %eax
+	mov	%eax, %ebp
+	mov	%edx, 4(rp)
+	shl	%cl, %eax
+	lea	8(rp), rp
+L(lo1):	mov	8(up), %edx
+	or	%ebx, %eax
+	mov	%edx, %ebx
+	shl	%cl, %edx
+	lea	8(up), up
+	neg	cnt
+	mov	%eax, (rp)
+	decl	VAR_COUNT
+	jg	L(top)
+
+	pop	%eax			FRAME_popl()
+L(end):
+	shr	%cl, %ebp
+	shr	%cl, %ebx
+	or	%ebp, %edx
+	mov	SAVE_EBP, %ebp
+	mov	%edx, 4(rp)
+	mov	%ebx, 8(rp)
+
+L(quit):
+	mov	SAVE_UP, up
+	mov	SAVE_EBX, %ebx
+	pop	rp			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/aorsmul_1.asm b/third_party/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
new file mode 100644
index 0000000..969a14a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/aorsmul_1.asm
@@ -0,0 +1,174 @@
+dnl  x86-32 mpn_addmul_1 and mpn_submul_1 optimised for Intel Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 -
+C P6 model 0-8,10-12		 -
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 8
+C AMD K6
+C AMD K7			 -
+C AMD K8
+C AMD K10
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n',  `%ecx')
+
+ifdef(`OPERATION_addmul_1',`
+	define(ADDSUB,  add)
+	define(func_1,  mpn_addmul_1)
+	define(func_1c, mpn_addmul_1c)')
+ifdef(`OPERATION_submul_1',`
+	define(ADDSUB,  sub)
+	define(func_1,  mpn_submul_1)
+	define(func_1c, mpn_submul_1c)')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
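+
+C One loop body serves both operations: ADDSUB expands to add or sub,
+C so mpn_addmul_1 and mpn_submul_1 differ only in whether each product
+C limb is added to or subtracted from rp[].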
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_1)
+	xor	%edx, %edx
+L(ent):	push	%edi
+	push	%esi
+	push	%ebx
+	mov	16(%esp), rp
+	mov	20(%esp), up
+	mov	24(%esp), n
+	movd	28(%esp), %mm7
+	test	$1, n
+	jz	L(fi0or2)
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	shr	$2, n
+	jnc	L(fi1)
+
+L(fi3):	lea	-8(up), up
+	lea	-8(rp), rp
+	movd	12(up), %mm1
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	add	$1, n			C increment and clear carry
+	jmp	L(lo3)
+
+L(fi1):	movd	%mm0, %ebx
+	jz	L(wd1)
+	movd	4(up), %mm1
+	pmuludq	%mm7, %mm1
+	jmp	L(lo1)
+
+L(fi0or2):
+	movd	(up), %mm1
+	pmuludq	%mm7, %mm1
+	shr	$2, n
+	movd	4(up), %mm0
+	jc	L(fi2)
+	lea	-4(up), up
+	lea	-4(rp), rp
+	movd	%mm1, %eax
+	pmuludq	%mm7, %mm0
+	jmp	L(lo0)
+
+L(fi2):	lea	4(up), up
+	add	$1, n			C increment and clear carry
+	movd	%mm1, %eax
+	lea	-12(rp), rp
+	jmp	L(lo2)
+
+C	ALIGN(16)			C alignment seems irrelevant
+L(top):	movd	4(up), %mm1
+	adc	$0, %edx
+	ADDSUB	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+L(lo1):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	ADDSUB	%ebx, (rp)
+L(lo0):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	ADDSUB	%eax, 4(rp)
+L(lo3):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	ADDSUB	%ebx, 8(rp)
+L(lo2):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	dec	n
+	jnz	L(top)
+
+L(end):	adc	n, %edx			C n is zero here
+	ADDSUB	%eax, 12(rp)
+	movd	%mm0, %ebx
+	lea	16(rp), rp
+L(wd1):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	n, %eax
+	ADDSUB	%ebx, (rp)
+	emms
+	adc	n, %eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
+PROLOGUE(func_1c)
+	mov	20(%esp), %edx		C carry
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..782e914
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_bdiv_dbm1c.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_dbm1c)
+include_mpn(`x86/pentium4/sse2/bdiv_dbm1c.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/divrem_1.asm b/third_party/gmp/mpn/x86/atom/sse2/divrem_1.asm
new file mode 100644
index 0000000..f84709a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/divrem_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_preinv_divrem_1 mpn_divrem_1c mpn_divrem_1)
+include_mpn(`x86/pentium4/sse2/divrem_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mod_1_1.asm b/third_party/gmp/mpn/x86/atom/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ae6581d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom/SSE2 mpn_mod_1_1.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mod_1_4.asm b/third_party/gmp/mpn/x86/atom/sse2/mod_1_4.asm
new file mode 100644
index 0000000..31faa3f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom/SSE2 mpn_mod_1_4.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mul_1.asm b/third_party/gmp/mpn/x86/atom/sse2/mul_1.asm
new file mode 100644
index 0000000..aa3bb97
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mul_1.asm
@@ -0,0 +1,124 @@
+dnl  Intel Atom mpn_mul_1.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 -
+C P6 model 0-8,10-12		 -
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 7.5
+C AMD K6			 -
+C AMD K7			 -
+C AMD K8
+C AMD K10
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL,  16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+define(`rp', `%edx')
+define(`up', `%esi')
+define(`n',  `%ecx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_mul_1c)
+	movd	PARAM_CARRY, %mm6	C carry
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(8)			C for compact code
+PROLOGUE(mpn_mul_1)
+	pxor	%mm6, %mm6
+L(ent):	push	%esi			FRAME_pushl()
+	mov	PARAM_SRC, up
+	mov	PARAM_SIZE, %eax	C size
+	movd	PARAM_MUL, %mm7
+	movd	(up), %mm0
+	mov	%eax, n
+	and	$3, %eax
+	pmuludq	%mm7, %mm0
+	mov	PARAM_DST, rp
+	jz	L(lo0)
+	cmp	$2, %eax
+	lea	-16(up,%eax,4),up
+	lea	-16(rp,%eax,4),rp
+	jc	L(lo1)
+	jz	L(lo2)
+	jmp	L(lo3)
+
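+C The loop is unrolled four ways; the dispatch above enters it at one
+C of four points according to n mod 4, with up and rp pre-biased so
+C that all four entries share the same displacements.
+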
+	ALIGN(16)
+L(top):	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+L(lo0):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+L(lo3):	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+L(lo2):	paddq	%mm0, %mm6
+	movd	12(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 8(rp)
+	psrlq	$32, %mm6
+L(lo1):	paddq	%mm0, %mm6
+	sub	$4, n
+	movd	%mm6, 12(rp)
+	lea	16(up), up
+	ja	L(top)
+
+	psrlq	$32, %mm6
+	movd	%mm6, %eax
+	emms
+	pop	%esi			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/mul_basecase.asm b/third_party/gmp/mpn/x86/atom/sse2/mul_basecase.asm
new file mode 100644
index 0000000..97d3aeb
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/mul_basecase.asm
@@ -0,0 +1,501 @@
+dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result in
+dnl  a third limb vector.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C    4 large loops into one; we could use it for the outer loop branch.
+C  * Optimise code outside of inner loops.
+C  * Write combined addmul_1 feed-in and wind-down code, and use it when
+C    iterating each outer loop.  ("Overlapping software pipelining")
+C  * Postpone push of ebx until we know vn > 1.  Perhaps use caller-saves regs
+C    for inlined mul_1, allowing us to postpone all pushes.
+C  * Perhaps write special code for vn <= un < M, for some small M.
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xn,
+C                        mp_srcptr yp, mp_size_t yn);
+C
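+C The first limb of vp is handled by an inlined mul_1 writing
+C rp[0..un]; each remaining limb of vp then gets an inlined addmul_1
+C pass.  Four copies of each loop cover the un mod 4 feed-in cases.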
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`un',  `%ecx')
+define(`vp',  `%ebp')
+define(`vn',  `36(%esp)')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+	mov	20(%esp), rp
+	mov	24(%esp), up
+	mov	28(%esp), un
+	mov	32(%esp), vp
+
+	movd	(up), %mm0
+	movd	(vp), %mm7
+	pmuludq	%mm7, %mm0
+	pxor	%mm6, %mm6
+
+	mov	un, %eax
+	and	$3, %eax
+	jz	L(of0)
+	cmp	$2, %eax
+	jc	L(of1)
+	jz	L(of2)
+
+C ================================================================
+	jmp	L(m3)
+	ALIGN(16)
+L(lm3):	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(m3):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, 8(rp)
+	lea	16(up), up
+	ja	L(lm3)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 12(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-8(rp), rp
+
+L(ol3):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	16(rp,un,4), rp
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	sar	$2, un
+	movd	4(up), %mm1
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	-8(up), up
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a3)
+
+L(la3):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+L(a3):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la3)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol3)
+	jmp	L(done)
+
+C ================================================================
+	ALIGN(16)
+L(lm0):	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+L(of0):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	12(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, 12(rp)
+	lea	16(up), up
+	ja	L(lm0)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 16(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-4(rp), rp
+
+L(ol0):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	20(rp,un,4), rp
+
+	movd	(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	4(up), %mm0
+	lea	-4(up), up
+	movd	%mm1, %eax
+	pmuludq	%mm7, %mm0
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a0)
+
+L(la0):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+L(a0):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la0)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol0)
+	jmp	L(done)
+
+C ================================================================
+	ALIGN(16)
+L(lm1):	movd	-12(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -12(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of1):	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, (rp)
+	lea	16(up), up
+	ja	L(lm1)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 4(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-16(rp), rp
+
+L(ol1):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	24(rp,un,4), rp
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	sar	$2, un
+	movd	%mm0, %ebx
+	movd	4(up), %mm1
+	pmuludq	%mm7, %mm1
+	xor	%edx, %edx	C zero edx and CF
+	inc	un
+	jmp	L(a1)
+
+L(la1):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+L(a1):	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la1)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol1)
+	jmp	L(done)
+
+C ================================================================
+	ALIGN(16)
+L(lm2):	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of2):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	sub	$4, un
+	movd	%mm6, 4(rp)
+	lea	16(up), up
+	ja	L(lm2)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 8(rp)
+
+	decl	vn
+	jz	L(done)
+	lea	-12(rp), rp
+
+L(ol2):	mov	28(%esp), un
+	neg	un
+	lea	4(vp), vp
+	movd	(vp), %mm7	C read next V limb
+	mov	24(%esp), up
+	lea	12(rp,un,4), rp
+
+	movd	(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	4(up), %mm0
+	lea	4(up), up
+	movd	%mm1, %eax
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(lo2)
+
+L(la2):	movd	4(up), %mm1
+	adc	$0, %edx
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%ebx, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	movd	%mm0, %ebx
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%eax, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %edx
+	movd	%mm1, %eax
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%ebx, 8(rp)
+L(lo2):	psrlq	$32, %mm1
+	adc	%edx, %eax
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	jnz	L(la2)
+
+	adc	un, %edx	C un is zero here
+	add	%eax, 12(rp)
+	movd	%mm0, %ebx
+	psrlq	$32, %mm0
+	adc	%edx, %ebx
+	movd	%mm0, %eax
+	adc	un, %eax
+	add	%ebx, 16(rp)
+	adc	un, %eax
+	mov	%eax, 20(rp)
+
+	decl	vn
+	jnz	L(ol2)
+C	jmp	L(done)
+
+C ================================================================
+L(done):
+	emms
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/sse2/popcount.asm b/third_party/gmp/mpn/x86/atom/sse2/popcount.asm
new file mode 100644
index 0000000..7847aec
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl  Intel Atom mpn_popcount -- population count.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86/atom/sse2/sqr_basecase.asm b/third_party/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..af19ed8
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sse2/sqr_basecase.asm
@@ -0,0 +1,634 @@
+dnl  x86 mpn_sqr_basecase -- square an mpn number, optimised for Intel Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Check if 'jmp N(%esp)' is well-predicted enough to allow us to combine the
+C    4 large loops into one; we could use it for the outer loop branch.
+C  * Optimise code outside of inner loops.
+C  * Write combined addmul_1 feed-in and wind-down code, and use it when
+C    iterating each outer loop.  ("Overlapping software pipelining")
+C  * Perhaps use caller-saves regs for inlined mul_1, allowing us to postpone
+C    all pushes.
+C  * Perhaps write special code for n < M, for some small M.
+C  * Replace inlined addmul_1 with smaller code from aorsmul_1.asm, or perhaps
+C    with even less pipelined code.
+C  * We run the outer loop until we have a 2-limb by 1-limb addmul_1 left.
+C    Consider breaking out earlier, saving the high cost of short loops.
+
+C void mpn_sqr_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xn);
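+C
+C Strategy: compute the off-diagonal triangular product into rp[1..]
+C with mul_1/addmul_1-style passes, then a final pass doubles it in
+C place while adding in the diagonal squares up[i]^2.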
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`n',   `%ecx')
+
+define(`un',  `%ebp')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	push	%edi
+	push	%esi
+	mov	12(%esp), rp
+	mov	16(%esp), up
+	mov	20(%esp), n
+
+	lea	4(rp), rp	C write triangular product starting at rp[1]
+	dec	n
+	movd	(up), %mm7
+
+	jz	L(one)
+	lea	4(up), up
+	push	%ebx
+	push	%ebp
+	mov	n, %eax
+
+	movd	(up), %mm0
+	neg	n
+	pmuludq	%mm7, %mm0
+	pxor	%mm6, %mm6
+	mov	n, un
+
+	and	$3, %eax
+	jz	L(of0)
+	cmp	$2, %eax
+	jc	L(of1)
+	jz	L(of2)
+
+C ================================================================
+	jmp	L(m3)
+	ALIGN(16)
+L(lm3):	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(m3):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, 8(rp)
+	lea	16(up), up
+	js	L(lm3)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 12(rp)
+
+	inc	n
+C	jz	L(done)
+	lea	-12(up), up
+	lea	4(rp), rp
+	jmp	L(ol2)
+
+C ================================================================
+	ALIGN(16)
+L(lm0):	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+L(of0):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 4(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	12(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, 8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, 12(rp)
+	lea	16(up), up
+	js	L(lm0)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 16(rp)
+
+	inc	n
+C	jz	L(done)
+	lea	-8(up), up
+	lea	8(rp), rp
+	jmp	L(ol3)
+
+C ================================================================
+	ALIGN(16)
+L(lm1):	movd	-12(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -12(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of1):	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, (rp)
+	lea	16(up), up
+	js	L(lm1)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 4(rp)
+
+	inc	n
+	jz	L(done)		C goes away when we add special n=2 code
+	lea	-20(up), up
+	lea	-4(rp), rp
+	jmp	L(ol0)
+
+C ================================================================
+	ALIGN(16)
+L(lm2):	movd	-8(up), %mm0
+	pmuludq	%mm7, %mm0
+	psrlq	$32, %mm6
+	lea	16(rp), rp
+	paddq	%mm0, %mm6
+	movd	-4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -8(rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, -4(rp)
+	psrlq	$32, %mm6
+L(of2):	paddq	%mm0, %mm6
+	movd	4(up), %mm0
+	pmuludq	%mm7, %mm0
+	movd	%mm6, (rp)
+	psrlq	$32, %mm6
+	paddq	%mm0, %mm6
+	add	$4, un
+	movd	%mm6, 4(rp)
+	lea	16(up), up
+	js	L(lm2)
+
+	psrlq	$32, %mm6
+	movd	%mm6, 8(rp)
+
+	inc	n
+C	jz	L(done)
+	lea	-16(up), up
+C  lea	(rp), rp
+C	jmp	L(ol1)
+
+C ================================================================
+
+L(ol1):	lea	4(up,n,4), up
+	movd	(up), %mm7	C read next U invariant limb
+	lea	8(rp,n,4), rp
+	mov	n, un
+
+	movd	4(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	%mm1, %ebx
+	inc	un
+	jz	L(re1)
+
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a1)
+
+L(la1):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+L(a1):	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la1)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+
+C ================================================================
+
+L(ol0):	lea	(up,n,4), up
+	movd	4(up), %mm7	C read next U invariant limb
+	lea	4(rp,n,4), rp
+	mov	n, un
+
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	sar	$2, un
+	movd	12(up), %mm1
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a0)
+
+L(la0):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+L(a0):	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la0)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+
+C ================================================================
+
+L(ol3):	lea	12(up,n,4), up
+	movd	-8(up), %mm7	C read next U invariant limb
+	lea	(rp,n,4), rp	C put rp back
+	mov	n, un
+
+	movd	-4(up), %mm1
+	pmuludq	%mm7, %mm1
+	sar	$2, un
+	movd	%mm1, %ebx
+	movd	(up), %mm0
+	xor	%edx, %edx	C zero edx and CF
+	jmp	L(a3)
+
+L(la3):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+L(a3):	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la3)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+
+C ================================================================
+
+L(ol2):	lea	8(up,n,4), up
+	movd	-4(up), %mm7	C read next U invariant limb
+	lea	12(rp,n,4), rp
+	mov	n, un
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	xor	%edx, %edx
+	sar	$2, un
+	movd	4(up), %mm1
+	test	un, un		C clear carry
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	inc	un
+	jnz	L(a2)
+	jmp	L(re2)
+
+L(la2):	adc	$0, %edx
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+L(a2):	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	movd	8(up), %mm0
+	pmuludq	%mm7, %mm0
+	adc	$0, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	movd	%mm0, %eax
+	movd	12(up), %mm1
+	pmuludq	%mm7, %mm1
+	adc	$0, %edx
+	add	%ebx, 4(rp)
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	lea	16(up), up
+	movd	(up), %mm0
+	adc	$0, %edx
+	add	%eax, 8(rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %edx
+	pmuludq	%mm7, %mm0
+	inc	un
+	movd	4(up), %mm1
+	jnz	L(la2)
+
+	adc	un, %edx	C un is zero here
+	add	%ebx, 12(rp)
+	movd	%mm0, %eax
+	pmuludq	%mm7, %mm1
+	lea	16(rp), rp
+	psrlq	$32, %mm0
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	adc	un, %eax
+	add	%ebx, 4(rp)
+	adc	un, %eax
+	mov	%eax, 8(rp)
+
+	inc	n
+	jmp	L(ol1)
+
+C ================================================================
+L(re2):	psrlq	$32, %mm0
+	movd	(up), %mm7	C read next U invariant limb
+	adc	%edx, %eax
+	movd	%mm0, %edx
+	movd	%mm1, %ebx
+	adc	un, %edx
+	add	%eax, (rp)
+	lea	4(rp), rp
+	psrlq	$32, %mm1
+	adc	%edx, %ebx
+	movd	%mm1, %eax
+	movd	4(up), %mm1
+	adc	un, %eax
+	add	%ebx, (rp)
+	pmuludq	%mm7, %mm1
+	adc	un, %eax
+	mov	%eax, 4(rp)
+	movd	%mm1, %ebx
+
+L(re1):	psrlq	$32, %mm1
+	add	%ebx, 4(rp)
+	movd	%mm1, %eax
+	adc	un, %eax
+	xor	n, n		C make the n == 0 assumption below hold
+	mov	%eax, 8(rp)
+
+L(done):			C n is zero here
+	mov	24(%esp), up
+	mov	28(%esp), %eax
+
+	movd	(up), %mm0
+	inc	%eax
+	pmuludq	%mm0, %mm0
+	lea	4(up), up
+	mov	20(%esp), rp
+	shr	%eax
+	movd	%mm0, (rp)
+	psrlq	$32, %mm0
+	lea	-12(rp), rp
+	mov	%eax, 28(%esp)
+	jnc	L(odd)
+
+	movd	%mm0, %ebp
+	movd	(up), %mm0
+	lea	8(rp), rp
+	pmuludq	%mm0, %mm0
+	lea	-4(up), up
+	add	8(rp), %ebp
+	movd	%mm0, %edx
+	adc	12(rp), %edx
+	rcr	n
+	jmp	L(ent)
+
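+C The doubling carry appears to be parked in n across the loop: rcr
+C saves CF into the top bit of n, and the adc n, n at L(top) shifts it
+C back out, since the loop body clobbers the flags.
+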
+C	ALIGN(16)		C alignment seems irrelevant
+L(top):	movd	(up), %mm1
+	adc	n, n
+	movd	%mm0, %eax
+	pmuludq	%mm1, %mm1
+	movd	4(up), %mm0
+	adc	(rp), %eax
+	movd	%mm1, %ebx
+	pmuludq	%mm0, %mm0
+	psrlq	$32, %mm1
+	adc	4(rp), %ebx
+	movd	%mm1, %ebp
+	movd	%mm0, %edx
+	adc	8(rp), %ebp
+	adc	12(rp), %edx
+	rcr	n		C FIXME: isn't this awfully slow on atom???
+	adc	%eax, (rp)
+	adc	%ebx, 4(rp)
+L(ent):	lea	8(up), up
+	adc	%ebp, 8(rp)
+	psrlq	$32, %mm0
+	adc	%edx, 12(rp)
+L(odd):	decl	28(%esp)
+	lea	16(rp), rp
+	jnz	L(top)
+
+L(end):	adc	n, n
+	movd	%mm0, %eax
+	adc	n, %eax
+	mov	%eax, (rp)
+
+L(rtn):	emms
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+L(one):	pmuludq	%mm7, %mm7
+	movq	%mm7, -4(rp)
+	emms
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/atom/sublsh1_n.asm b/third_party/gmp/mpn/x86/atom/sublsh1_n.asm
new file mode 100644
index 0000000..d3e7e5b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sublsh1_n.asm
@@ -0,0 +1,34 @@
+dnl  Intel Atom mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n_ip1)
+include_mpn(`x86/k7/sublsh1_n.asm')
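
Editor's note: the wrapper above only supplies the MULFUNC_PROLOGUE and reuses
the k7 loop. As a plain C stand-in for what mpn_sublsh1_n computes, assuming
the usual convention that the return value combines the subtraction borrow
with the bit shifted out of vp's top limb (sublsh1_ref and limb_t are
illustrative, not GMP's API):

    #include <stdint.h>

    typedef uint32_t limb_t;

    /* rp[] = up[] - (vp[] << 1); returns borrow plus shifted-out bit */
    static limb_t sublsh1_ref(limb_t *rp, const limb_t *up,
                              const limb_t *vp, int n)
    {
        limb_t shift_in = 0;
        uint64_t borrow = 0;
        for (int i = 0; i < n; i++) {
            limb_t s = (vp[i] << 1) | shift_in;  /* vp << 1, limb by limb */
            shift_in = vp[i] >> 31;              /* bit spilling upward */
            uint64_t d = (uint64_t)up[i] - s - borrow;
            rp[i] = (limb_t)d;
            borrow = (d >> 32) & 1;              /* 1 iff the subtract wrapped */
        }
        return (limb_t)borrow + shift_in;
    }
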
diff --git a/third_party/gmp/mpn/x86/atom/sublsh2_n.asm b/third_party/gmp/mpn/x86/atom/sublsh2_n.asm
new file mode 100644
index 0000000..79405cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/atom/sublsh2_n.asm
@@ -0,0 +1,57 @@
+dnl  Intel Atom mpn_addlsh2_n/mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2).
+
+dnl  Contributed to the GNU project by Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(M4_inst,		adcl)
+	define(M4_opp,		subl)
+	define(M4_function,	mpn_addlsh2_n)
+	define(M4_function_c,	mpn_addlsh2_nc)
+	define(M4_ip_function_c, mpn_addlsh2_nc_ip1)
+	define(M4_ip_function,	mpn_addlsh2_n_ip1)
+',`ifdef(`OPERATION_sublsh2_n', `
+	define(M4_inst,		sbbl)
+	define(M4_opp,		addl)
+	define(M4_function,	mpn_sublsh2_n)
+	define(M4_function_c,	mpn_sublsh2_nc)
+	define(M4_ip_function_c, mpn_sublsh2_nc_ip1)
+	define(M4_ip_function,	mpn_sublsh2_n_ip1)
+',`m4_error(`Need OPERATION_addlsh2_n or OPERATION_sublsh2_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n mpn_sublsh2_nc mpn_sublsh2_n_ip1 mpn_sublsh2_nc_ip1)
+
+include_mpn(`x86/atom/aorslshC_n.asm')
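
Editor's note: define(LSH, 2)/define(RSH, 30) encode the per-limb shift split:
limb i of vp << LSH is built from vp[i] << LSH plus the RSH = 32 - LSH bits
spilled by vp[i-1]. A minimal sketch of just that composition, with
illustrative names; the add/subtract against up[] then proceeds limb by limb
with adcl/sbbl as selected by M4_inst above:

    #include <stdint.h>

    typedef uint32_t limb_t;
    #define LSH 2
    #define RSH 30   /* LSH + RSH == GMP_LIMB_BITS */

    /* rp[] = vp[] << LSH; returns the bits shifted out of the top limb */
    static limb_t lshC_ref(limb_t *rp, const limb_t *vp, int n)
    {
        limb_t prev = 0;
        for (int i = 0; i < n; i++) {
            rp[i] = (vp[i] << LSH) | (prev >> RSH);
            prev = vp[i];
        }
        return prev >> RSH;
    }
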
diff --git a/third_party/gmp/mpn/x86/bd1/gmp-mparam.h b/third_party/gmp/mpn/x86/bd1/gmp-mparam.h
new file mode 100644
index 0000000..254cfea
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bd1/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-3800 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-27, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 59.59% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             245
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                89
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               351
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     101
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     111
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                 87
+#define SQR_TOOM4_THRESHOLD                216
+#define SQR_TOOM6_THRESHOLD                294
+#define SQR_TOOM8_THRESHOLD                442
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               22
+#define SQRMOD_BNM1_THRESHOLD               26
+
+#define MUL_FFT_MODF_THRESHOLD             636  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    636, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63, 7}, {   1023, 8}, {    543,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335,11}, {    191,10}, \
+    {    399,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    639,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    895,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,10}, {   4479,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,11}, \
+    {   4479,13}, {   1151,12}, {   2495,11}, {   4991,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,12}, {   4991,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3327,13}, \
+    {   6911,14}, {   3839,13}, {   7935,16} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    565, 5}, {     29, 6}, {     15, 5}, {     32, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    159,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,11}, {    191,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1727,12}, {    895,11}, \
+    {   1791,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,10}, {   4479,12}, {   1215,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2239,11}, {   4479,13}, {   1151,12}, \
+    {   2495,11}, {   4991,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,12}, {   4991,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3327,13}, {   6783,14}, {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 152
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  31
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  33
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             198
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                126
+
+#define INV_MULMOD_BNM1_THRESHOLD           82
+#define INV_NEWTON_THRESHOLD               212
+#define INV_APPR_THRESHOLD                 202
+
+#define BINV_NEWTON_THRESHOLD              238
+#define REDC_1_TO_REDC_N_THRESHOLD          55
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              110
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1528
+
+#define POWM_SEC_TABLE  1,20,96,386,1221,2698
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               100
+#define SET_STR_PRECOMPUTE_THRESHOLD       762
+
+#define FAC_DSC_THRESHOLD                  118
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    4  /* 1.22% faster than 3 */
+#define HGCD_THRESHOLD                      67
+#define HGCD_APPR_THRESHOLD                150
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   483
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   4  /* 5.07% faster than 1 */
+
+/* Tuneup completed successfully, took 65358 seconds */
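
Editor's note: these constants are crossover points consumed by size dispatch
in GMP's generic code, and MUL_FFT_TABLE3 is read as (size, k) pairs selecting
the FFT split depth. A hedged sketch of the dispatch shape, using bd1's values
above; the enum, constants, and function names are placeholders, not GMP
internals. The same reading applies to the other gmp-mparam.h files below.

    #include <stddef.h>

    typedef enum { MUL_BASECASE, MUL_TOOM22, MUL_TOOM33_UP, MUL_FFT } mul_algo;

    /* crossover points, as #defined above for bd1 */
    enum { T22 = 32, T33 = 89, TFFT = 6784 };

    static mul_algo choose_mul(size_t n)   /* n = operand size in limbs */
    {
        if (n < T22)  return MUL_BASECASE;   /* schoolbook below 32 limbs */
        if (n < T33)  return MUL_TOOM22;     /* then Karatsuba/Toom-2 */
        if (n < TFFT) return MUL_TOOM33_UP;  /* Toom-3 .. Toom-8 bands */
        return MUL_FFT;                      /* Schönhage-Strassen FFT above */
    }

    struct fft_entry { int size; int k; };

    /* split depth k: last table entry whose size bound has been passed */
    static int choose_fft_k(const struct fft_entry *tab, int len, int n)
    {
        if (len <= 0)
            return 0;
        int k = tab[0].k;
        for (int i = 0; i < len && tab[i].size <= n; i++)
            k = tab[i].k;
        return k;
    }
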
diff --git a/third_party/gmp/mpn/x86/bd2/gmp-mparam.h b/third_party/gmp/mpn/x86/bd2/gmp-mparam.h
new file mode 100644
index 0000000..6893da7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bd2/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 4000-4200 MHz Piledriver Vishera */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 40.87% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              5
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           24
+
+#define DIV_1_VS_MUL_1_PERCENT             254
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               151
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               351
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     100
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     110
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 44
+#define SQR_TOOM3_THRESHOLD                 93
+#define SQR_TOOM4_THRESHOLD                212
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                466
+
+#define MULMID_TOOM42_THRESHOLD             66
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             595  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    595, 5}, {     27, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    143, 7}, {   1215, 9}, \
+    {    319, 8}, {    639, 9}, {    335, 8}, {    671, 9}, \
+    {    351,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    271,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335,11}, {    191,10}, {    399,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    895,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,13}, {    639,12}, \
+    {   1471,11}, {   2943,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2431,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1535,12}, {   3135,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,11}, {  15871,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,13}, {   7935,12}, {  15871,16} }
+#define MUL_FFT_TABLE3_SIZE 155
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             555  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    555, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191, 6}, {   3071, 5}, {   6399, 6}, \
+    {   3455, 7}, {   1791, 8}, {    959,10}, {    255, 9}, \
+    {    511,10}, {    271,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    927,13}, \
+    {    255,12}, {    511,11}, {   1055,10}, {   2111,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2111,12}, {   1087,11}, {   2239,10}, {   4479,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1855,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,12}, {   7935,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,13}, {   7935,16} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  34
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                  43
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 75
+#define DC_DIVAPPR_Q_THRESHOLD             200
+#define DC_BDIV_QR_THRESHOLD                71
+#define DC_BDIV_Q_THRESHOLD                119
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               266
+#define INV_APPR_THRESHOLD                 214
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          71
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1597
+
+#define POWM_SEC_TABLE  1,22,96,289,1259
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               173
+#define SET_STR_PRECOMPUTE_THRESHOLD       454
+
+#define FAC_DSC_THRESHOLD                   90
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 5.80% faster than 3 */
+#define HGCD_THRESHOLD                      74
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   456
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   4  /* 17.07% faster than 1 */
+
+/* Tuneup completed successfully, took 53914 seconds */
diff --git a/third_party/gmp/mpn/x86/bd4/gmp-mparam.h b/third_party/gmp/mpn/x86/bd4/gmp-mparam.h
new file mode 100644
index 0000000..6c20d0f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bd4/gmp-mparam.h
@@ -0,0 +1,225 @@
+/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3800-4200 MHz Excavator/Bristol Ridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        27
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        50
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 28.45% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              13
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           28
+
+#define DIV_1_VS_MUL_1_PERCENT             314
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      69
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     103
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     121
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 42
+#define SQR_TOOM3_THRESHOLD                 89
+#define SQR_TOOM4_THRESHOLD                208
+#define SQR_TOOM6_THRESHOLD                306
+#define SQR_TOOM8_THRESHOLD                454
+
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    143, 6}, {   2303, 5}, \
+    {   4735, 4}, {   9471, 5}, {   4863, 7}, {   1279, 9}, \
+    {    335, 8}, {    671, 9}, {    351, 8}, {    703,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671, 8}, \
+    {   1343,10}, {    351, 9}, {    703,10}, {    367, 9}, \
+    {    735,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799, 8}, {   1599,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671, 9}, {   1343,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215, 9}, {   2431,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471, 9}, {   2943,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,10}, {   1919,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,10}, \
+    {   2943,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,10}, {   3455,12}, {    959,11}, \
+    {   1919,10}, {   3839,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,11}, {   2943,10}, {   5887,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2559,13}, \
+    {   1407,12}, {   2943,11}, {   5887,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3839,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,12}, \
+    {   7935,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,13}, {   7935,16} }
+#define MUL_FFT_TABLE3_SIZE 192
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    476, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    735,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415, 9}, {    863,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671, 9}, {   1343,11}, {    351,10}, {    735,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1055,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2559,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 176
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  54
+#define MULLO_MUL_N_THRESHOLD            10950
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  77
+#define SQRLO_SQR_THRESHOLD               9449
+
+#define DC_DIV_QR_THRESHOLD                 84
+#define DC_DIVAPPR_Q_THRESHOLD             252
+#define DC_BDIV_QR_THRESHOLD                79
+#define DC_BDIV_Q_THRESHOLD                 80
+
+#define INV_MULMOD_BNM1_THRESHOLD           71
+#define INV_NEWTON_THRESHOLD               254
+#define INV_APPR_THRESHOLD                 266
+
+#define BINV_NEWTON_THRESHOLD              294
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1387
+#define MU_BDIV_Q_THRESHOLD               1528
+
+#define POWM_SEC_TABLE  1,16,96,480,960
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               264
+#define SET_STR_PRECOMPUTE_THRESHOLD       542
+
+#define FAC_DSC_THRESHOLD                   91
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 9.73% faster than 3 */
+#define HGCD_THRESHOLD                      55
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   562
+#define GCDEXT_DC_THRESHOLD                416
+#define JACOBI_BASE_METHOD                   4  /* 16.50% faster than 1 */
+
+/* Tuneup completed successfully, took 49179 seconds */
diff --git a/third_party/gmp/mpn/x86/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86/bdiv_dbm1c.asm
new file mode 100644
index 0000000..0288c47
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bdiv_dbm1c.asm
@@ -0,0 +1,129 @@
+dnl  x86 mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 5.1
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	13.67
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom
+C AMD K6
+C AMD K7			 3.5
+C AMD K8
+C AMD K10
+
+
+C TODO
+C  * Optimize for more x86 processors
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	mov	16(%esp), %ecx		C d
+	push	%esi
+	mov	12(%esp), %esi		C ap
+	push	%edi
+	mov	12(%esp), %edi		C qp
+	push	%ebp
+	mov	24(%esp), %ebp		C n
+	push	%ebx
+
+	mov	(%esi), %eax
+	mul	%ecx
+	mov	36(%esp), %ebx
+	sub	%eax, %ebx
+	mov	%ebx, (%edi)
+	sbb	%edx, %ebx
+
+	mov	%ebp, %eax
+	and	$3, %eax
+	jz	L(b0)
+	cmp	$2, %eax
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	lea	-8(%esi), %esi
+	lea	8(%edi), %edi
+	add	$-3, %ebp
+	jmp	L(3)
+
+L(b0):	mov	4(%esi), %eax
+	lea	-4(%esi), %esi
+	lea	12(%edi), %edi
+	add	$-4, %ebp
+	jmp	L(0)
+
+L(b2):	mov	4(%esi), %eax
+	lea	4(%esi), %esi
+	lea	4(%edi), %edi
+	add	$-2, %ebp
+	jmp	L(2)
+
+	ALIGN(8)
+L(top):	mov	4(%esi), %eax
+	mul	%ecx
+	lea	16(%edi), %edi
+	sub	%eax, %ebx
+	mov	8(%esi), %eax
+	mov	%ebx, -12(%edi)
+	sbb	%edx, %ebx
+L(0):	mul	%ecx
+	sub	%eax, %ebx
+	mov	%ebx, -8(%edi)
+	sbb	%edx, %ebx
+L(3):	mov	12(%esi), %eax
+	mul	%ecx
+	sub	%eax, %ebx
+	mov	%ebx, -4(%edi)
+	mov	16(%esi), %eax
+	lea	16(%esi), %esi
+	sbb	%edx, %ebx
+L(2):	mul	%ecx
+	sub	%eax, %ebx
+	mov	%ebx, 0(%edi)
+	sbb	%edx, %ebx
+L(b1):	add	$-4, %ebp
+	jns	L(top)
+
+	mov	%ebx, %eax
+	pop	%ebx
+	pop	%ebp
+	pop	%edi
+	pop	%esi
+	ret
+EPILOGUE()
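
Editor's note: in C terms the loop above keeps one running accumulator limb h
(held in %ebx, loaded from the last stack parameter): each product ap[i]*bd is
subtracted from it low half first, that value is stored to qp[i], and the high
half is subtracted next with the borrow folded in by sbb. A hedged model of
exactly that dataflow; bdiv_dbm1c_ref and limb_t are illustrative names:

    #include <stdint.h>

    typedef uint32_t limb_t;

    static limb_t bdiv_dbm1c_ref(limb_t *qp, const limb_t *ap,
                                 int n, limb_t bd, limb_t h)
    {
        for (int i = 0; i < n; i++) {
            uint64_t p = (uint64_t)ap[i] * bd;   /* the mul %ecx */
            limb_t lo = (limb_t)p, hi = (limb_t)(p >> 32);
            limb_t t = h - lo;                   /* sub %eax, %ebx */
            qp[i] = t;                           /* mov %ebx, (%edi) */
            h = t - hi - (h < lo);               /* sbb %edx, %ebx */
        }
        return h;
    }
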
diff --git a/third_party/gmp/mpn/x86/bdiv_q_1.asm b/third_party/gmp/mpn/x86/bdiv_q_1.asm
new file mode 100644
index 0000000..132de06
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bdiv_q_1.asm
@@ -0,0 +1,208 @@
+dnl  x86 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54    30.0
+C P55    29.0
+C P6     13.0 odd divisor, 12.0 even (strangely)
+C K6     14.0
+C K7     12.0
+C P4     42.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SHIFT, %ecx
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_INVERSE, %eax
+	movl	PARAM_SIZE, %ebp
+	pushl	%ebx	FRAME_pushl()
+L(common):
+	pushl	%edi	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	xorl	%ebx, %ebx
+	xorl	%edx, %edx
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	VAR_INVERSE, %edx
+	jmp	L(entry)
+
+
+	ALIGN(8)
+	nop	C k6 code alignment
+	nop
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or -1
+	C ecx	shift
+	C edx	carry limb
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	movl	-4(%esi,%ebp,4), %eax
+	subl	%ebx, %edx		C accumulate carry bit
+
+	movl	(%esi,%ebp,4), %ebx
+
+	shrdl(	%cl, %ebx, %eax)
+
+	subl	%edx, %eax		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebx, %ebx
+
+L(entry):
+	imull	%edx, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	movl	PARAM_DIVISOR, %edx
+
+	mull	%edx
+
+	incl	%ebp
+	jnz	L(top)
+
+
+	movl	-4(%esi), %eax		C src high limb
+L(one):
+	shrl	%cl, %eax
+	popl	%esi	FRAME_popl()
+
+	addl	%ebx, %eax		C apply carry bit
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
+
+	ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebp	FRAME_pushl()
+
+	movl	$-1, %ecx		C shift count
+	movl	PARAM_SIZE, %ebp
+
+	pushl	%ebx	FRAME_pushl()
+
+L(strip_twos):
+	incl	%ecx
+
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+	imull	%eax, %eax		C inv*inv
+	imull	%ebx, %eax		C inv*inv*d
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+
+	leal	(%edx,%edx), %eax	C 2*inv
+	imull	%edx, %edx		C inv*inv
+	imull	%ebx, %edx		C inv*inv*d
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+ASM_END()
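
Editor's note: the commented identity inv = 2*inv - inv*inv*d above is a
Newton step for the inverse of d modulo a power of two. Each application
doubles the number of correct low bits, which is why the asm needs exactly two
steps after the 8-bit binvert_limb_table lookup (8 -> 16 -> 32). A
self-contained sketch that seeds with the (3*d)^2 trick (5 correct bits)
instead of the table, so it needs three steps; binvert32 is an illustrative
name:

    #include <stdint.h>

    static uint32_t binvert32(uint32_t d)   /* d must be odd */
    {
        uint32_t inv = (3 * d) ^ 2;         /* 5 correct low bits */
        inv = 2 * inv - inv * inv * d;      /* 10 bits */
        inv = 2 * inv - inv * inv * d;      /* 20 bits */
        inv = 2 * inv - inv * inv * d;      /* 40 >= 32 bits */
        return inv;   /* d * inv == 1 (mod 2^32), as the ASSERT checks */
    }

The exact-division loop then multiplies each (borrow-adjusted) source limb by
inv modulo 2^32 to get a quotient limb, and subtracts the high half of
quotient*d from the next limb, which is what the mull/imull pair in L(top)
implements.
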
diff --git a/third_party/gmp/mpn/x86/bt1/gmp-mparam.h b/third_party/gmp/mpn/x86/bt1/gmp-mparam.h
new file mode 100644
index 0000000..302dbc6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bt1/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than
+   the value in mpn/x86/k7/gmp-mparam.h.  The latter is used as a hard limit in
+   k7/sqr_basecase.asm.  */
+
+/* 1600 MHz AMD Bobcat Zacate E-350 */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        16
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     21
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 57.16% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           36
+
+#define DIV_1_VS_MUL_1_PERCENT             199
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     102
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     177
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     169
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     143
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                 89
+#define SQR_TOOM4_THRESHOLD                248
+#define SQR_TOOM6_THRESHOLD                342
+#define SQR_TOOM8_THRESHOLD                470
+
+#define MULMID_TOOM42_THRESHOLD             72
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               21
+
+#define MUL_FFT_MODF_THRESHOLD             630  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    630, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     27, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     55, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 6}, \
+    {    767, 7}, {    399, 6}, {    799, 7}, {    415, 8}, \
+    {    235, 7}, {    479, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 159
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             500  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    500, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    127, 6}, {   1087, 7}, {    575, 8}, {    303, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415, 9}, {    831,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             9
+#define MULLO_DC_THRESHOLD                  48
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             7
+#define SQRLO_DC_THRESHOLD                 146
+#define SQRLO_SQR_THRESHOLD              11278
+
+#define DC_DIV_QR_THRESHOLD                 77
+#define DC_DIVAPPR_Q_THRESHOLD             240
+#define DC_BDIV_QR_THRESHOLD                83
+#define DC_BDIV_Q_THRESHOLD                182
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 252
+
+#define BINV_NEWTON_THRESHOLD              252
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1713
+
+#define POWM_SEC_TABLE  1,16,96,563,1317,1867
+
+#define GET_STR_DC_THRESHOLD                19
+#define GET_STR_PRECOMPUTE_THRESHOLD        32
+#define SET_STR_DC_THRESHOLD               254
+#define SET_STR_PRECOMPUTE_THRESHOLD       907
+
+#define FAC_DSC_THRESHOLD                  224
+#define FAC_ODD_THRESHOLD                   55
+
+#define MATRIX22_STRASSEN_THRESHOLD         23
+#define HGCD2_DIV1_METHOD                    3  /* 3.59% faster than 5 */
+#define HGCD_THRESHOLD                      85
+#define HGCD_APPR_THRESHOLD                152
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   531
+#define GCDEXT_DC_THRESHOLD                386
+#define JACOBI_BASE_METHOD                   3  /* 0.92% faster than 1 */
+
+/* Tuneup completed successfully, took 159946 seconds */
diff --git a/third_party/gmp/mpn/x86/bt2/gmp-mparam.h b/third_party/gmp/mpn/x86/bt2/gmp-mparam.h
new file mode 100644
index 0000000..f936cb7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/bt2/gmp-mparam.h
@@ -0,0 +1,214 @@
+/* x86/bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be greater than
+   the value in mpn/x86/k7/gmp-mparam.h.  The latter is used as a hard limit in
+   k7/sqr_basecase.asm.  */
+
+/* 2050 MHz AMD Jaguar/Kabini */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               6
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 47.53% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             243
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD                90
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     103
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                126
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                502
+
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               25
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     28, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    991,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 153
+#define MUL_FFT_THRESHOLD                 5760
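+
+/* Schematically -- an illustrative sketch, not the library's actual
+   dispatch code -- the multiplication thresholds above select an
+   algorithm by operand size n, in limbs:
+
+     if      (n < MUL_TOOM22_THRESHOLD)  mpn_mul_basecase (schoolbook)
+     else if (n < MUL_TOOM33_THRESHOLD)  Toom-22 (Karatsuba)
+     else if (n < MUL_TOOM44_THRESHOLD)  Toom-33
+     else if (n < MUL_TOOM6H_THRESHOLD)  Toom-44
+     else if (n < MUL_TOOM8H_THRESHOLD)  Toom-6.5
+     else if (n < MUL_FFT_THRESHOLD)     Toom-8.5
+     else                                FFT multiplication  */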
+
+#define SQR_FFT_MODF_THRESHOLD             530  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    530, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     95,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    991,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2495,13}, \
+    {   1407,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 151
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             8
+#define MULLO_DC_THRESHOLD                  44
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD            13
+#define SQRLO_DC_THRESHOLD                  62
+#define SQRLO_SQR_THRESHOLD               8907
+
+#define DC_DIV_QR_THRESHOLD                 79
+#define DC_DIVAPPR_Q_THRESHOLD             228
+#define DC_BDIV_QR_THRESHOLD                75
+#define DC_BDIV_Q_THRESHOLD                136
+
+#define INV_MULMOD_BNM1_THRESHOLD           90
+#define INV_NEWTON_THRESHOLD               260
+#define INV_APPR_THRESHOLD                 236
+
+#define BINV_NEWTON_THRESHOLD              294
+#define REDC_1_TO_REDC_N_THRESHOLD          80
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              118
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1652
+
+#define POWM_SEC_TABLE  1,16,96,615,865,1442
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               252
+#define SET_STR_PRECOMPUTE_THRESHOLD       638
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   39
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 13.65% faster than 3 */
+#define HGCD_THRESHOLD                      81
+#define HGCD_APPR_THRESHOLD                 66
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   531
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   1  /* 0.84% faster than 4 */
+
+/* Tuneup completed successfully, took 103818 seconds */
diff --git a/third_party/gmp/mpn/x86/cnd_aors_n.asm b/third_party/gmp/mpn/x86/cnd_aors_n.asm
new file mode 100644
index 0000000..74f4917
--- /dev/null
+++ b/third_party/gmp/mpn/x86/cnd_aors_n.asm
@@ -0,0 +1,124 @@
+dnl  X86 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 5.4
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	14.5
+C P4 model 3-4 (Prescott)	21
+C Intel atom			11
+C AMD K6			 ?
+C AMD K7			 3.4
+C AMD K8			 ?
+
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+define(`n',   `%ecx')
+define(`cnd', `20(%esp)')
+define(`cy',  `%edx')
+
+ifdef(`OPERATION_cnd_add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	C make cnd into a full mask
+	mov	cnd, %eax
+	neg	%eax
+	sbb	%eax, %eax
+	mov	%eax, cnd
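+	C (neg sets the carry flag iff cnd was nonzero; sbb of a
+	C register with itself then gives 0 or 0xffffffff, so the
+	C "and cnd" below selects either a vp limb or zero, with no
+	C data-dependent branch)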
+
+	C load parameters into registers
+	mov	24(%esp), rp
+	mov	28(%esp), up
+	mov	32(%esp), vp
+	mov	36(%esp), n
+
+	mov	(vp), %eax
+	mov	(up), %ebx
+
+	C put operand pointers just beyond their last limb
+	lea	(vp,n,4), vp
+	lea	(up,n,4), up
+	lea	-4(rp,n,4), rp
+	neg	n
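+	C n is now -size and is incremented towards zero, so (up,n,4)
+	C and (vp,n,4) step through the remaining limbs from low
+	C addresses to high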
+
+	and	cnd, %eax
+	ADDSUB	%eax, %ebx
+	sbb	cy, cy
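+	C the carry/borrow is kept as 0 or -1 in cy; "add cy, cy" at
+	C the top of the loop moves it back into the carry flag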
+	inc	n
+	je	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp,n,4), %eax
+	and	cnd, %eax
+	mov	%ebx, (rp,n,4)
+	mov	(up,n,4), %ebx
+	add	cy, cy
+	ADCSBB	%eax, %ebx
+	sbb	cy, cy
+	inc	n
+	jne	L(top)
+
+L(end):	mov	%ebx, (rp)
+	xor	%eax, %eax
+	sub	cy, %eax
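+	C return the final carry/borrow as 0 or 1 (cy is 0 or -1)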
+
+	mov	(%esp), %ebp
+	mov	4(%esp), %ebx
+	mov	8(%esp), %esi
+	mov	12(%esp), %edi
+	add	$16, %esp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/copyd.asm b/third_party/gmp/mpn/x86/copyd.asm
new file mode 100644
index 0000000..51fa195
--- /dev/null
+++ b/third_party/gmp/mpn/x86/copyd.asm
@@ -0,0 +1,91 @@
+dnl  x86 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb  startup (approx)
+C P5	  1.0	      40
+C P6	  2.4	      70
+C K6	  1.0	      55
+C K7	  1.3	      75
+C P4	  2.6	     175
+C
+C (Startup time includes some function call overheads.)
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from high to low addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_copyd)
+	C eax	saved esi
+	C ebx
+	C ecx	counter
+	C edx	saved edi
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	movl	PARAM_DST, %edi
+	leal	-4(%esi,%ecx,4), %esi
+
+	leal	-4(%edi,%ecx,4), %edi
+
+	std
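+	C with the direction flag set, rep movsl copies downwards,
+	C decrementing %esi and %edi by 4 per limb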
+
+	rep
+	movsl
+
+	cld
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/copyi.asm b/third_party/gmp/mpn/x86/copyi.asm
new file mode 100644
index 0000000..f6b0354
--- /dev/null
+++ b/third_party/gmp/mpn/x86/copyi.asm
@@ -0,0 +1,99 @@
+dnl  x86 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb  startup (approx)
+C P5	  1.0	      35
+C P6	  0.75	      45
+C K6	  1.0	      30
+C K7	  1.3	      65
+C P4	  1.0	     120
+C
+C (Startup time includes some function call overheads.)
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size, working from low to high addresses.
+C
+C The code here is very generic and can be expected to be reasonable on all
+C the x86 family.
+C
+C P6 -  An MMX based copy was tried, but was found to be slower than a rep
+C       movs in all cases.  The fastest MMX found was 0.8 cycles/limb (when
+C       fully aligned).  A rep movs seems to have a startup time of about 15
+C       cycles, but doing something special for small sizes could lead to a
+C       branch misprediction that would destroy any saving.  For now a plain
+C       rep movs seems ok.
+C
+C K62 - We used to have a big chunk of code doing an MMX copy at 0.56 c/l if
+C       aligned or a 1.0 rep movs if not.  But that seemed excessive since
+C       it only got an advantage half the time, and even then only showed it
+C       above 50 limbs or so.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+
+	C eax	saved esi
+	C ebx
+	C ecx	counter
+	C edx	saved edi
+	C esi	src
+	C edi	dst
+	C ebp
+
+PROLOGUE(mpn_copyi)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, %eax
+
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README
+
+	rep
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/core2/gmp-mparam.h b/third_party/gmp/mpn/x86/core2/gmp-mparam.h
new file mode 100644
index 0000000..8a44ad1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/core2/gmp-mparam.h
@@ -0,0 +1,210 @@
+/* x86/core2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3000 MHz Penryn */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD             MP_SIZE_T_MAX  /* never */
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      3
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 2  /* 22.20% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               9
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           18
+
+#define DIV_1_VS_MUL_1_PERCENT             277
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               136
+#define MUL_TOOM6H_THRESHOLD               300
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      91
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      93
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                184
+#define SQR_TOOM6_THRESHOLD                262
+#define SQR_TOOM8_THRESHOLD                597
+
+#define MULMID_TOOM42_THRESHOLD             70
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               25
+
+#define MUL_FFT_MODF_THRESHOLD             505  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    505, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63, 9}, {    255,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    335, 9}, \
+    {    671,10}, {    351,11}, {    191,10}, {    399, 9}, \
+    {    799,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,12}, \
+    {   7935,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 147
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             464  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    464, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127,10}, {     79, 9}, {    159,10}, \
+    {     95,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 5}, {   4863, 6}, {   2495, 7}, \
+    {   1343, 8}, {    703, 9}, {    367,12}, {     63,11}, \
+    {    127,10}, {    303,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351, 9}, \
+    {    703,10}, {    367,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    863,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2431,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  36
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 140
+#define SQRLO_SQR_THRESHOLD              10393
+
+#define DC_DIV_QR_THRESHOLD                 32
+#define DC_DIVAPPR_Q_THRESHOLD             116
+#define DC_BDIV_QR_THRESHOLD                76
+#define DC_BDIV_Q_THRESHOLD                180
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               138
+#define INV_APPR_THRESHOLD                 123
+
+#define BINV_NEWTON_THRESHOLD              306
+#define REDC_1_TO_REDC_N_THRESHOLD          82
+
+#define MU_DIV_QR_THRESHOLD               1499
+#define MU_DIVAPPR_Q_THRESHOLD            1442
+#define MUPI_DIV_QR_THRESHOLD               63
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define POWM_SEC_TABLE  1,22,66,428,1035
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD               732
+#define SET_STR_PRECOMPUTE_THRESHOLD      1118
+
+#define FAC_DSC_THRESHOLD                  115
+#define FAC_ODD_THRESHOLD                   50
+
+#define MATRIX22_STRASSEN_THRESHOLD         25
+#define HGCD2_DIV1_METHOD                    1  /* 5.78% faster than 3 */
+#define HGCD_THRESHOLD                     121
+#define HGCD_APPR_THRESHOLD                151
+#define HGCD_REDUCE_THRESHOLD             3259
+#define GCD_DC_THRESHOLD                   368
+#define GCDEXT_DC_THRESHOLD                306
+#define JACOBI_BASE_METHOD                   4  /* 14.19% faster than 1 */
+
+/* Tuneup completed successfully, took 67142 seconds */
diff --git a/third_party/gmp/mpn/x86/coreibwl/gmp-mparam.h b/third_party/gmp/mpn/x86/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..7b58cad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreibwl/gmp-mparam.h
@@ -0,0 +1,216 @@
+/* x86/coreibwl gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                15
+#define MOD_1_UNNORM_THRESHOLD              16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 21.34% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             14
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              29
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           19
+
+#define DIV_1_VS_MUL_1_PERCENT             295
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                97
+#define MUL_TOOM44_THRESHOLD               220
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     169
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 44
+#define SQR_TOOM3_THRESHOLD                134
+#define SQR_TOOM4_THRESHOLD                242
+#define SQR_TOOM6_THRESHOLD                342
+#define SQR_TOOM8_THRESHOLD                502
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             540  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    540, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     17, 5}, {     36, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     43, 9}, {     23, 8}, {     55,10}, \
+    {     15, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     83, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95, 7}, {   1599, 8}, {    831, 9}, {    431, 8}, \
+    {    863, 9}, {    447,10}, {    239, 9}, {    479,10}, \
+    {    255, 9}, {    511,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1023,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1119,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1119,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3839,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2559,13}, {   5247,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 172
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             472  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    472, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     37, 7}, {     19, 6}, {     40, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     83, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    271, 9}, {    543, 6}, {   4479, 7}, {   2431, 8}, \
+    {   1247, 7}, {   2495, 8}, {   1279,10}, {    351,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    639,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    511,10}, \
+    {   1023,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    927,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1663,12}, \
+    {    895,11}, {   1855,14}, {    255,13}, {    511,12}, \
+    {   1023,11}, {   2047,12}, {   1087,11}, {   2239,12}, \
+    {   1215,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1663,13}, {    895,12}, {   1983,14}, {    511,13}, \
+    {   1023,12}, {   2239,13}, {   1151,12}, {   2495,13}, \
+    {   1279,12}, {   2623,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3135,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3839,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3327,13}, {   6783,14}, \
+    {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 157
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD            16
+#define MULLO_DC_THRESHOLD                  37
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 137
+#define SQRLO_SQR_THRESHOLD              10821
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             146
+#define DC_BDIV_QR_THRESHOLD                98
+#define DC_BDIV_Q_THRESHOLD                218
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               173
+#define INV_APPR_THRESHOLD                 165
+
+#define BINV_NEWTON_THRESHOLD              278
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1787
+#define MU_DIVAPPR_Q_THRESHOLD            1787
+#define MUPI_DIV_QR_THRESHOLD               78
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1830
+
+#define POWM_SEC_TABLE  1,16,126,416,932
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        17
+#define SET_STR_DC_THRESHOLD               306
+#define SET_STR_PRECOMPUTE_THRESHOLD       894
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    3  /* 5.97% faster than 1 */
+#define HGCD_THRESHOLD                      73
+#define HGCD_APPR_THRESHOLD                123
+#define HGCD_REDUCE_THRESHOLD             3664
+#define GCD_DC_THRESHOLD                   562
+#define GCDEXT_DC_THRESHOLD                465
+#define JACOBI_BASE_METHOD                   1  /* 31.16% faster than 3 */
+
+/* Tuneup completed successfully, took 35114 seconds */
diff --git a/third_party/gmp/mpn/x86/coreihwl/gmp-mparam.h b/third_party/gmp/mpn/x86/coreihwl/gmp-mparam.h
new file mode 100644
index 0000000..ea4ac11
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreihwl/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/coreihwl gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                17
+#define MOD_1_UNNORM_THRESHOLD              17
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      5
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 11.44% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             13
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define DIV_1_VS_MUL_1_PERCENT             296
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD               108
+#define MUL_TOOM44_THRESHOLD               232
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     183
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     113
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 44
+#define SQR_TOOM3_THRESHOLD                141
+#define SQR_TOOM4_THRESHOLD                384
+#define SQR_TOOM6_THRESHOLD                517
+#define SQR_TOOM8_THRESHOLD                698
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               20
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 6}, {     43, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 7}, {     55, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     43, 9}, {     23, 8}, {     55, 9}, \
+    {     31, 8}, {     71, 9}, {     39, 8}, {     83, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    135,10}, {     79, 9}, {    159,10}, \
+    {     95, 9}, {    191,10}, {    111,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191, 6}, {   3199, 7}, {   1727, 9}, {    447,10}, \
+    {    239, 9}, {    479,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1023,10}, {    527,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2431,13}, {   1279,12}, \
+    {   2623,13}, {   1407,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,14}, {   2559,13}, \
+    {   5375,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 165
+#define MUL_FFT_THRESHOLD                 7808
+
+#define SQR_FFT_MODF_THRESHOLD             560  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    560, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     16, 5}, {     33, 6}, {     17, 5}, {     36, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     40, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    143, 9}, {    287,10}, {    159,11}, \
+    {     95,12}, {     63,11}, {    127, 9}, {    511, 5}, \
+    {   8959, 7}, {   2431, 8}, {   1247, 7}, {   2495, 8}, \
+    {   1279, 9}, {    671,10}, {    367,11}, {    191,10}, \
+    {    399, 9}, {    799,10}, {    415,12}, {    127,11}, \
+    {    255,10}, {    527,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    831,13}, {    127,11}, {    543,10}, {   1119,11}, \
+    {    607,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    863,12}, {    447,11}, \
+    {    991,12}, {    511,11}, {   1119,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,11}, {   1983,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2559,13}, \
+    {   5119,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3327,13}, {   6911,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 159
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD            17
+#define MULLO_DC_THRESHOLD                  40
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 141
+#define SQRLO_SQR_THRESHOLD              10821
+
+#define DC_DIV_QR_THRESHOLD                 30
+#define DC_DIVAPPR_Q_THRESHOLD             190
+#define DC_BDIV_QR_THRESHOLD                67
+#define DC_BDIV_Q_THRESHOLD                254
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               157
+#define INV_APPR_THRESHOLD                 163
+
+#define BINV_NEWTON_THRESHOLD              236
+#define REDC_1_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1895
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD               54
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1898
+
+#define POWM_SEC_TABLE  1,16,95,480,1442
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               372
+#define SET_STR_PRECOMPUTE_THRESHOLD      1037
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    3  /* 6.26% faster than 1 */
+#define HGCD_THRESHOLD                      70
+#define HGCD_APPR_THRESHOLD                129
+#define HGCD_REDUCE_THRESHOLD             3664
+#define GCD_DC_THRESHOLD                   573
+#define GCDEXT_DC_THRESHOLD                483
+#define JACOBI_BASE_METHOD                   1  /* 27.01% faster than 3 */
+
+/* Tuneup completed successfully, took 35232 seconds */
diff --git a/third_party/gmp/mpn/x86/coreinhm/gmp-mparam.h b/third_party/gmp/mpn/x86/coreinhm/gmp-mparam.h
new file mode 100644
index 0000000..4428b4b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreinhm/gmp-mparam.h
@@ -0,0 +1,223 @@
+/* x86/coreinhm gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2933-3200 MHz Intel Xeon X3470 Nehalem */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                36
+#define MOD_1_UNNORM_THRESHOLD              40
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      3
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 42.59% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               9
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           17
+
+#define DIV_1_VS_MUL_1_PERCENT             288
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                93
+#define MUL_TOOM44_THRESHOLD               214
+#define MUL_TOOM6H_THRESHOLD               306
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      93
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     134
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      94
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     118
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                133
+#define SQR_TOOM4_THRESHOLD                212
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                620
+
+#define MULMID_TOOM42_THRESHOLD             68
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             595  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    595, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     17, 5}, {     35, 6}, {     28, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     99, 9}, {     55,10}, \
+    {     31, 9}, {     63, 8}, {    127, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,11}, \
+    {    159,10}, {    335,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671,12}, \
+    {    191,11}, {    383,10}, {    767,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1119,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,10}, {   1727,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1119,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1471,10}, \
+    {   2943,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,10}, {   3455,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2239,10}, \
+    {   4479,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,11}, {   2943,13}, {    767,12}, {   1727,11}, \
+    {   3455,13}, {    895,12}, {   1983,14}, {    511,13}, \
+    {   1023,12}, {   2239,11}, {   4479,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2559,13}, {   1407,12}, \
+    {   2943,11}, {   5887,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   6015,15}, {   1535,14}, \
+    {   3839,13}, {   7679,16} }
+#define MUL_FFT_TABLE3_SIZE 170
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     39, 9}, \
+    {     23, 8}, {     55, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159, 6}, {   2687, 7}, \
+    {   1407, 9}, {    367, 8}, {    735, 9}, {    383,10}, \
+    {    207, 9}, {    415,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    991,10}, \
+    {   1983,13}, {    255,12}, {    511,11}, {   1119,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    895,11}, {   1791,12}, {    959,11}, \
+    {   1983,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1983,11}, {   3967,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2495,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3967,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3327,13}, \
+    {   6655,14}, {   3839,13}, {   7935,16} }
+#define SQR_FFT_TABLE3_SIZE 187
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  43
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                  42
+#define SQRLO_SQR_THRESHOLD              10323
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             132
+#define DC_BDIV_QR_THRESHOLD                83
+#define DC_BDIV_Q_THRESHOLD                130
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               189
+#define INV_APPR_THRESHOLD                 167
+
+#define BINV_NEWTON_THRESHOLD              372
+#define REDC_1_TO_REDC_N_THRESHOLD          83
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD               97
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1718
+
+#define POWM_SEC_TABLE  1,28,96,473,803
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               145
+#define SET_STR_PRECOMPUTE_THRESHOLD       419
+
+#define FAC_DSC_THRESHOLD                  114
+#define FAC_ODD_THRESHOLD                   57
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    1  /* 1.03% faster than 3 */
+#define HGCD_THRESHOLD                     117
+#define HGCD_APPR_THRESHOLD                137
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   389
+#define GCDEXT_DC_THRESHOLD                318
+#define JACOBI_BASE_METHOD                   4  /* 6.10% faster than 1 */
+
+/* Tuneup completed successfully, took 67994 seconds */
diff --git a/third_party/gmp/mpn/x86/coreisbr/gmp-mparam.h b/third_party/gmp/mpn/x86/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..23d708a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/coreisbr/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/coreisbr gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-24, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                28
+#define MOD_1_UNNORM_THRESHOLD              26
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      4
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 2  /* 88.29% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD             21
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              14
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define DIV_1_VS_MUL_1_PERCENT             297
+
+#define MUL_TOOM22_THRESHOLD                32
+#define MUL_TOOM33_THRESHOLD               105
+#define MUL_TOOM44_THRESHOLD               190
+#define MUL_TOOM6H_THRESHOLD               294
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     109
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     144
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     116
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     160
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 48
+#define SQR_TOOM3_THRESHOLD                163
+#define SQR_TOOM4_THRESHOLD                250
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                502
+
+#define MULMID_TOOM42_THRESHOLD             98
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             666  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    666, 5}, {     28, 6}, {     15, 5}, {     31, 6}, \
+    {     28, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     49, 8}, \
+    {     27, 7}, {     55, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     71, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    159, 7}, {   1343, 8}, \
+    {    703, 9}, {    367, 8}, {    735, 9}, {    383,10}, \
+    {    207,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,12}, {    191,11}, \
+    {    383,10}, {    799,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2239,12}, \
+    {   1215,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1535,12}, \
+    {   3071,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define MUL_FFT_TABLE3_SIZE 163
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    570, 5}, {     28, 6}, {     15, 5}, {     32, 6}, \
+    {     17, 5}, {     35, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     40, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63, 8}, {   1023, 9}, \
+    {    543,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    991,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1983,14}, {    511,13}, \
+    {   1023,12}, {   2239,13}, {   1151,12}, {   2495,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3967,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2559,13}, {   5119,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 163
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD            16
+#define MULLO_DC_THRESHOLD                  46
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 159
+#define SQRLO_SQR_THRESHOLD              11317
+
+#define DC_DIV_QR_THRESHOLD                 47
+#define DC_DIVAPPR_Q_THRESHOLD             191
+#define DC_BDIV_QR_THRESHOLD               107
+#define DC_BDIV_Q_THRESHOLD                232
+
+#define INV_MULMOD_BNM1_THRESHOLD           62
+#define INV_NEWTON_THRESHOLD               181
+#define INV_APPR_THRESHOLD                 182
+
+#define BINV_NEWTON_THRESHOLD              378
+#define REDC_1_TO_REDC_N_THRESHOLD          91
+
+#define MU_DIV_QR_THRESHOLD               1858
+#define MU_DIVAPPR_Q_THRESHOLD            1858
+#define MUPI_DIV_QR_THRESHOLD               77
+#define MU_BDIV_QR_THRESHOLD              1830
+#define MU_BDIV_Q_THRESHOLD               2166
+
+#define POWM_SEC_TABLE  1,16,126,428,1442
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               418
+#define SET_STR_PRECOMPUTE_THRESHOLD      1104
+
+#define FAC_DSC_THRESHOLD                  149
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    1  /* 5.54% faster than 4 */
+#define HGCD_THRESHOLD                      66
+#define HGCD_APPR_THRESHOLD                135
+#define HGCD_REDUCE_THRESHOLD             4284
+#define GCD_DC_THRESHOLD                   642
+#define GCDEXT_DC_THRESHOLD                465
+#define JACOBI_BASE_METHOD                   3  /* 14.76% faster than 4 */
+
+/* Tuneup completed successfully, took 44241 seconds */
diff --git a/third_party/gmp/mpn/x86/darwin.m4 b/third_party/gmp/mpn/x86/darwin.m4
new file mode 100644
index 0000000..c449216
--- /dev/null
+++ b/third_party/gmp/mpn/x86/darwin.m4
@@ -0,0 +1,102 @@
+divert(-1)
+dnl  Copyright 2007, 2011, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+
+dnl  Usage LEA(symbol,reg)
+dnl  Usage LEAL(symbol_local_to_file,reg)
+dnl
+dnl  We maintain lists of stuff to append in load_eip and darwin_bd.  The
+dnl  `index' stuff is needed to suppress repeated definitions.  To avoid
+dnl  getting fooled by "var" and "var1", we add 'bol ' (the end of
+dnl  'indirect_symbol') at the beginning and a newline at the end.  This
+dnl  might be a bit fragile.
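+dnl
+dnl  For illustration, under PIC a use like LEA(foo,%ebx) expands to
+dnl  roughly (sketch; `foo' is a placeholder symbol)
+dnl
+dnl	call	L(movl_eip_ebx)
+dnl	movl	L(foo$non_lazy_ptr)-.(%ebx), %ebx
+dnl
+dnl  with the L(movl_eip_ebx) helper and the non_lazy_symbol_pointers
+dnl  entry each emitted just once, at ASM_END.  Without PIC it is simply
+dnl  "movl $foo, %ebx".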
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')
+ifelse(index(defn(`darwin_bd'), `bol $1
+'),-1,
+`m4append(`darwin_bd',
+`	.section __IMPORT,__pointers,non_lazy_symbol_pointers
+L($1`'$non_lazy_ptr):
+	.indirect_symbol $1
+	.long	 0
+')')
+	call	L(movl_eip_`'substr($2,1))
+	movl	L($1`'$non_lazy_ptr)-.($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')
+	call	L(movl_eip_`'substr($2,1))
+	leal	$1-.($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip`'darwin_bd')
+
+define(`load_eip', `')		dnl updated in LEA
+define(`darwin_bd', `')		dnl updated in LEA
+
+
+dnl  Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`call	GSYM_PREFIX`'$1')
+
+undefine(`PIC_WITH_EBX')
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/x86/dive_1.asm b/third_party/gmp/mpn/x86/dive_1.asm
new file mode 100644
index 0000000..5bb0f45
--- /dev/null
+++ b/third_party/gmp/mpn/x86/dive_1.asm
@@ -0,0 +1,190 @@
+dnl  x86 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54    30.0
+C P55    29.0
+C P6     13.0 odd divisor, 12.0 even (strangely)
+C K6     14.0
+C K7     12.0
+C P4     42.0
+
+
+C mp_limb_t mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
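+C The divisor's trailing zero bits are stripped, then the inverse of the
+C remaining odd value modulo 2^32 is built up by Newton's iteration, each
+C step of which doubles the number of correct low bits.  A hedged C
+C sketch of such an iteration (illustrative only; the code below starts
+C instead from the 8-bit binvert_limb_table lookup, so two steps suffice):
+C
+C	uint32_t inv = (3 * d) ^ 2;	/* correct to 5 bits, d odd */
+C	inv = 2 * inv - inv * inv * d;	/* 10 bits */
+C	inv = 2 * inv - inv * inv * d;	/* 20 bits */
+C	inv = 2 * inv - inv * inv * d;	/* 32 bits */
+C
+C At the end d*inv == 1 mod 2^32, which is what the ASSERT below checks.
+C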
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebp
+	pushl	%edi	FRAME_pushl()
+
+	pushl	%ebx	FRAME_pushl()
+	movl	$-1, %ecx		C shift count
+
+	pushl	%esi	FRAME_pushl()
+
+L(strip_twos):
+	incl	%ecx
+
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	xorl	%ebx, %ebx
+	xorl	%edx, %edx
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	VAR_INVERSE, %edx
+	jmp	L(entry)
+
+
+	ALIGN(8)
+	nop	C k6 code alignment
+	nop
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or -1
+	C ecx	shift
+	C edx	carry limb
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	movl	-4(%esi,%ebp,4), %eax
+	subl	%ebx, %edx		C accumulate carry bit
+
+	movl	(%esi,%ebp,4), %ebx
+
+	shrdl(	%cl, %ebx, %eax)
+
+	subl	%edx, %eax		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebx, %ebx
+
+L(entry):
+	imull	%edx, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	movl	PARAM_DIVISOR, %edx
+
+	mull	%edx
+
+	incl	%ebp
+	jnz	L(top)
+
+
+	movl	-4(%esi), %eax		C src high limb
+L(one):
+	shrl	%cl, %eax
+	popl	%esi	FRAME_popl()
+
+	addl	%ebx, %eax		C apply carry bit
+	popl	%ebx	FRAME_popl()
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/divrem_1.asm b/third_party/gmp/mpn/x86/divrem_1.asm
new file mode 100644
index 0000000..255d493
--- /dev/null
+++ b/third_party/gmp/mpn/x86/divrem_1.asm
@@ -0,0 +1,233 @@
+dnl  x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
+
+dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       cycles/limb
+C 486   approx 43 maybe
+C P5        44
+C P6        39
+C P6MMX     39
+C K6        22
+C K7        42
+C P4        58
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                          mp_limb_t carry);
+C
+C Divide src,size by divisor and store the quotient in dst+xsize,size.
+C Extend the division to fractional quotient limbs in dst,xsize.  Return the
+C remainder.  Either or both xsize and size can be 0.
+C
+C mpn_divrem_1c takes a carry parameter which is an initial high limb,
+C effectively one extra limb at the top of src,size.  Must have
+C carry<divisor.
+C
+C
+C Essentially the code is the same as the division based part of
+C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
+C instruction even when gcc is not being used (when longlong.h only has the
+C rather slow generic C udiv_qrnnd()).
+C
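+C As a rough C sketch of that division part (using longlong.h's
+C udiv_qrnnd, with the remainder r starting at 0, or at the carry for
+C mpn_divrem_1c):
+C
+C	for (i = size-1; i >= 0; i--)    /* integer limbs */
+C	  udiv_qrnnd (dst[xsize+i], r, r, src[i], divisor);
+C	for (i = xsize-1; i >= 0; i--)   /* fraction limbs */
+C	  udiv_qrnnd (dst[i], r, r, 0, divisor);
+C	return r;
+C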
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done.  A div is between 20 and 40 cycles on the various
+C x86s, so assuming high<divisor about half the time, then this test saves
+C half that amount.  The branch misprediction penalty on each chip is less
+C than half a div.
+C
+C
+C Notes for P5:
+C
+C It might be thought that moving the load down to pair with the store would
+C save 1 cycle, but that doesn't seem to happen in practice, and in any case
+C would be a mere 2.2% saving, so it's hardly worth bothering about.
+C
+C A mul-by-inverse might be a possibility for P5, as done in
+C mpn/x86/pentium/mod_1.asm.  The number of auxiliary instructions required
+C is a hindrance, but there could be a 10-15% speedup available.
+C
+C
+C Notes for K6:
+C
+C K6 has its own version of this code, using loop and paying attention to
+C cache line boundary crossings.  The target 20 c/l can be had with the
+C decl+jnz of the present code by pairing up the load and store in the
+C loops.  But it's considered easier not to introduce complexity just for
+C that, but instead let k6 have its own code.
+C
+
+defframe(PARAM_CARRY,  24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	orl	%ecx, %ecx
+
+	movl	PARAM_CARRY, %edx
+	jz	L(fraction)
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jmp	L(integer_top)
+
+EPILOGUE()
+
+
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	orl	%ecx,%ecx
+
+	jz	L(size_zero)
+	pushl	%ebx		FRAME_pushl()
+
+	movl	-4(%edi,%ecx,4), %eax	C src high limb
+	xorl	%edx, %edx
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	cmpl	%esi, %eax
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jae	L(integer_entry)
+
+
+	C high<divisor, so high of dst is zero, and avoid one div
+
+	movl	%edx, (%ebx,%ecx,4)
+	decl	%ecx
+
+	movl	%eax, %edx
+	jz	L(fraction)
+
+
+L(integer_top):
+	C eax	scratch (quotient)
+	C ebx	dst+4*xsize-4
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi	src
+	C ebp	xsize
+
+	movl	-4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+	divl	%esi
+
+	movl	%eax, (%ebx,%ecx,4)
+	decl	%ecx
+	jnz	L(integer_top)
+
+
+L(fraction):
+	orl	%ebp, %ecx
+	jz	L(done)
+
+	movl	PARAM_DST, %ebx
+
+
+L(fraction_top):
+	C eax	scratch (quotient)
+	C ebx	dst
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi
+	C ebp
+
+	xorl	%eax, %eax
+
+	divl	%esi
+
+	movl	%eax, -4(%ebx,%ecx,4)
+	decl	%ecx
+	jnz	L(fraction_top)
+
+
+L(done):
+	popl	%ebp
+	movl	%edx, %eax
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+	movl	PARAM_XSIZE, %ecx
+	xorl	%eax, %eax
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README
+
+	rep
+	stosl
+
+	popl	%esi
+	popl	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/divrem_2.asm b/third_party/gmp/mpn/x86/divrem_2.asm
new file mode 100644
index 0000000..4c38ad0
--- /dev/null
+++ b/third_party/gmp/mpn/x86/divrem_2.asm
@@ -0,0 +1,199 @@
+dnl  x86 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		norm	frac
+C 486
+C P5
+C P6-13		29.2
+C P6-15		*26
+C K6
+C K7		22
+C K8		*19
+C P4-f1
+C P4-f2		*65
+C P4-f3
+C P4-f4		*72
+
+C A star means numbers not updated for the latest version of the code.
+
+
+C TODO
+C  * Perhaps keep ecx or esi in stack slot, freeing up a reg for q0.
+C  * The loop has not been carefully tuned.  We should at the very least do
+C    some local insn swapping.
+C  * The code outside the main loop is what gcc generated.  Clean up!
+C  * Clean up stack slot usage.
+
+C INPUT PARAMETERS
+C qp
+C fn
+C up_param
+C un_param
+C dp
+
+
+C eax ebx ecx edx esi edi ebp
+C         cnt         qp
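+
+C The loop below is the usual "3/2" division step with an invariant
+C single-limb inverse of the two-limb divisor, seemingly
+C di = floor((B^3-1)/(d1*B+d0)) - B with B=2^32 (cf. the Moller-Granlund
+C divide-by-invariant-integers technique); each quotient guess is then
+C corrected by the masked q-- and, rarely, the L(fix) path.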
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_divrem_2)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	sub	$36, %esp
+	mov	68(%esp), %ecx		C un
+	mov	72(%esp), %esi		C dp
+	movl	$0, 32(%esp)
+	lea	0(,%ecx,4), %edi
+	add	64(%esp), %edi		C up
+	mov	(%esi), %ebx
+	mov	4(%esi), %eax
+	mov	%ebx, 20(%esp)
+	sub	$12, %edi
+	mov	%eax, 24(%esp)
+	mov	%edi, 12(%esp)
+	mov	8(%edi), %ebx
+	mov	4(%edi), %ebp
+	cmp	%eax, %ebx
+	jb	L(8)
+	seta	%dl
+	cmp	20(%esp), %ebp
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	jne	L(35)
+L(8):
+	mov	60(%esp), %esi		C fn
+	lea	-3(%esi,%ecx), %edi
+	test	%edi, %edi
+	js	L(9)
+	mov	24(%esp), %edx
+	mov	$-1, %esi
+	mov	%esi, %eax
+	mov	%esi, %ecx
+	not	%edx
+	divl	24(%esp)
+	mov	%eax, %esi
+	imul	24(%esp), %eax
+	mov	%eax, (%esp)
+	mov	%esi, %eax
+	mull	20(%esp)
+	mov	(%esp), %eax
+	add	20(%esp), %eax
+	adc	$0, %ecx
+	add	%eax, %edx
+	adc	$0, %ecx
+	mov	%ecx, %eax
+	js	L(32)
+L(36):	dec	%esi
+	sub	24(%esp), %edx
+	sbb	$0, %eax
+	jns	L(36)
+L(32):
+	mov	%esi, 16(%esp)		C di
+	mov	%edi, %ecx		C un
+	mov	12(%esp), %esi		C up
+	mov	24(%esp), %eax
+	neg	%eax
+	mov	%eax, 4(%esp)		C -d1
+	ALIGN(16)
+	nop
+
+C eax ebx ecx edx esi edi ebp  0    4   8   12  16  20  24  28  32   56  60
+C     n2  un      up      n1   q0  -d1          di  d0  d1      msl  qp  fn
+
+L(loop):
+	mov	16(%esp), %eax		C di
+	mul	%ebx
+	add	%ebp, %eax
+	mov	%eax, (%esp)		C q0
+	adc	%ebx, %edx
+	mov	%edx, %edi		C q
+	imul	4(%esp), %edx
+	mov	20(%esp), %eax
+	lea	(%edx, %ebp), %ebx	C n1 -= ...
+	mul	%edi
+	xor	%ebp, %ebp
+	cmp	60(%esp), %ecx
+	jl	L(19)
+	mov	(%esi), %ebp
+	sub	$4, %esi
+L(19):	sub	20(%esp), %ebp
+	sbb	24(%esp), %ebx
+	sub	%eax, %ebp
+	sbb	%edx, %ebx
+	mov	20(%esp), %eax		C d1
+	inc	%edi
+	xor	%edx, %edx
+	cmp	(%esp), %ebx
+	adc	$-1, %edx		C mask
+	add	%edx, %edi		C q--
+	and	%edx, %eax		C d0 or 0
+	and	24(%esp), %edx		C d1 or 0
+	add	%eax, %ebp
+	adc	%edx, %ebx
+	cmp	24(%esp), %ebx
+	jae	L(fix)
+L(bck):	mov	56(%esp), %edx
+	mov	%edi, (%edx, %ecx, 4)
+	dec	%ecx
+	jns	L(loop)
+
+L(9):	mov	64(%esp), %esi		C up
+	mov	%ebp, (%esi)
+	mov	%ebx, 4(%esi)
+	mov	32(%esp), %eax
+	add	$36, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+
+L(fix):	seta	%dl
+	cmp	20(%esp), %ebp
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck)
+	inc	%edi
+	sub	20(%esp), %ebp
+	sbb	24(%esp), %ebx
+	jmp	L(bck)
+
+L(35):	sub	20(%esp), %ebp
+	sbb	24(%esp), %ebx
+	movl	$1, 32(%esp)
+	jmp	L(8)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/fat/com.c b/third_party/gmp/mpn/x86/fat/com.c
new file mode 100644
index 0000000..d359d4c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/com.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_com.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/com.c"
diff --git a/third_party/gmp/mpn/x86/fat/fat.c b/third_party/gmp/mpn/x86/fat/fat.c
new file mode 100644
index 0000000..18be05a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/fat.c
@@ -0,0 +1,499 @@
+/* x86 fat binary initializers.
+
+   THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY.
+   THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
+   COMPLETELY IN FUTURE GNU MP RELEASES.
+
+Copyright 2003, 2004, 2011-2013, 2015, 2017, 2018 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>    /* for printf */
+#include <stdlib.h>   /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+int  __gmpn_cpuid_available (void);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+   variable.  Anything can be used, but for now it's the canonical cpu types
+   as per config.guess/config.sub.  */
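+
+/* For example, a copy of the library compiled with WANT_FAKE_CPUID
+   defined can be steered from the shell (illustrative invocation):
+
+       GMP_CPU_TYPE=pentium3 ./my_gmp_program
+
+   making fake_cpuid() report a GenuineIntel family 6 model 7 part, so
+   the pentium3 code paths get exercised on any x86.  */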
+
+#define __gmpn_cpuid            fake_cpuid
+#define __gmpn_cpuid_available  fake_cpuid_available
+
+#define MAKE_FMS(family, model)						\
+  ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20)			\
+   + (((model) & 0xf) << 4) + (((model)  &  0xf0) << 12))
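+
+/* For example, MAKE_FMS (6, 0x2a) (sandybridge) comes out as 0x206a0:
+   base family 6 in bits 8-11, base model 0xa in bits 4-7, extended model
+   0x2 in bits 16-19, mirroring the layout of cpuid leaf 1 EAX.  The
+   family/model extraction in __gmpn_cpuvec_init below inverts this.  */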
+
+static struct {
+  const char  *name;
+  const char  *vendor;
+  unsigned    fms;
+} fake_cpuid_table[] = {
+  { "i386",       "" },
+  { "i486",       "GenuineIntel", MAKE_FMS (4, 0) },
+  { "pentium",    "GenuineIntel", MAKE_FMS (5, 0) },
+  { "pentiummmx", "GenuineIntel", MAKE_FMS (5, 4) },
+  { "pentiumpro", "GenuineIntel", MAKE_FMS (6, 0) },
+  { "pentium2",   "GenuineIntel", MAKE_FMS (6, 2) },
+  { "pentium3",   "GenuineIntel", MAKE_FMS (6, 7) },
+  { "pentium4",   "GenuineIntel", MAKE_FMS (15, 2) },
+  { "prescott",   "GenuineIntel", MAKE_FMS (15, 3) },
+  { "nocona",     "GenuineIntel", MAKE_FMS (15, 4) },
+  { "core2",      "GenuineIntel", MAKE_FMS (6, 0xf) },
+  { "nehalem",    "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "nhm",        "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "atom",       "GenuineIntel", MAKE_FMS (6, 0x1c) },
+  { "westmere",   "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "wsm",        "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "sbr",        "GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "slm",        "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "haswell",    "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "hwl",        "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "broadwell",  "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "bwl",        "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "skylake",    "GenuineIntel", MAKE_FMS (6, 0x5e) },
+  { "sky",        "GenuineIntel", MAKE_FMS (6, 0x5e) },
+
+  { "k5",         "AuthenticAMD", MAKE_FMS (5, 0) },
+  { "k6",         "AuthenticAMD", MAKE_FMS (5, 3) },
+  { "k62",        "AuthenticAMD", MAKE_FMS (5, 8) },
+  { "k63",        "AuthenticAMD", MAKE_FMS (5, 9) },
+  { "athlon",     "AuthenticAMD", MAKE_FMS (6, 0) },
+  { "k8",         "AuthenticAMD", MAKE_FMS (15, 0) },
+  { "k10",        "AuthenticAMD", MAKE_FMS (16, 0) },
+  { "bobcat",     "AuthenticAMD", MAKE_FMS (20, 1) },
+  { "bulldozer",  "AuthenticAMD", MAKE_FMS (21, 1) },
+  { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
+  { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
+  { "excavator",  "AuthenticAMD", MAKE_FMS (21, 0x60) },
+  { "jaguar",     "AuthenticAMD", MAKE_FMS (22, 1) },
+
+  { "viac3",      "CentaurHauls", MAKE_FMS (6, 0) },
+  { "viac32",     "CentaurHauls", MAKE_FMS (6, 9) },
+  { "nano",       "CentaurHauls", MAKE_FMS (6, 15) },
+};
+
+static int
+fake_cpuid_lookup (void)
+{
+  char  *s;
+  int   i;
+
+  s = getenv ("GMP_CPU_TYPE");
+  if (s == NULL)
+    {
+      printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
+      abort ();
+    }
+
+  for (i = 0; i < numberof (fake_cpuid_table); i++)
+    if (strcmp (s, fake_cpuid_table[i].name) == 0)
+      return i;
+
+  printf ("GMP_CPU_TYPE=%s unknown\n", s);
+  abort ();
+}
+
+static int
+fake_cpuid_available (void)
+{
+  return fake_cpuid_table[fake_cpuid_lookup()].vendor[0] != '\0';
+}
+
+static long
+fake_cpuid (char dst[12], int id)
+{
+  int  i = fake_cpuid_lookup();
+
+  switch (id) {
+  case 0:
+    memcpy (dst, fake_cpuid_table[i].vendor, 12);
+    return 0;
+  case 1:
+    return fake_cpuid_table[i].fms;
+  default:
+    printf ("fake_cpuid(): oops, unknown id %d\n", id);
+    abort ();
+  }
+}
+#endif
+
+
+typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
+typedef DECL_preinv_mod_1    ((*preinv_mod_1_t));
+
+struct cpuvec_t __gmpn_cpuvec = {
+  __MPN(add_n_init),
+  0,
+  0,
+  __MPN(addmul_1_init),
+  0,
+  __MPN(bdiv_dbm1c_init),
+  __MPN(cnd_add_n_init),
+  __MPN(cnd_sub_n_init),
+  __MPN(com_init),
+  __MPN(copyd_init),
+  __MPN(copyi_init),
+  __MPN(divexact_1_init),
+  __MPN(divrem_1_init),
+  __MPN(gcd_11_init),
+  __MPN(lshift_init),
+  __MPN(lshiftc_init),
+  __MPN(mod_1_init),
+  __MPN(mod_1_1p_init),
+  __MPN(mod_1_1p_cps_init),
+  __MPN(mod_1s_2p_init),
+  __MPN(mod_1s_2p_cps_init),
+  __MPN(mod_1s_4p_init),
+  __MPN(mod_1s_4p_cps_init),
+  __MPN(mod_34lsub1_init),
+  __MPN(modexact_1c_odd_init),
+  __MPN(mul_1_init),
+  __MPN(mul_basecase_init),
+  __MPN(mullo_basecase_init),
+  __MPN(preinv_divrem_1_init),
+  __MPN(preinv_mod_1_init),
+  __MPN(redc_1_init),
+  __MPN(redc_2_init),
+  __MPN(rshift_init),
+  __MPN(sqr_basecase_init),
+  __MPN(sub_n_init),
+  0,
+  __MPN(submul_1_init),
+  0
+};
+
+int __gmpn_cpuvec_initialized = 0;
+
+/* The following setups start with generic x86, then overwrite with
+   specifics for a chip, and higher versions of that chip.
+
+   The arrangement of the setups here will normally be the same as the $path
+   selections in configure.in for the respective chips.
+
+   This code is reentrant and thread safe.  We always calculate the same
+   decided_cpuvec, so if two copies of the code are running it doesn't
+   matter which completes first; both write the same to __gmpn_cpuvec.
+
+   We need to go via decided_cpuvec because if one thread has completed
+   __gmpn_cpuvec then it may be making use of the threshold values in that
+   vector.  If another thread is still running __gmpn_cpuvec_init then we
+   don't want it to write different values to those fields since some of the
+   asm routines only operate correctly up to their own defined threshold,
+   not an arbitrary value.  */
+
+void
+__gmpn_cpuvec_init (void)
+{
+  struct cpuvec_t  decided_cpuvec;
+
+  TRACE (printf ("__gmpn_cpuvec_init:\n"));
+
+  memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
+
+  CPUVEC_SETUP_x86;
+  CPUVEC_SETUP_fat;
+
+  if (! __gmpn_cpuid_available ())
+    {
+      TRACE (printf ("  80386, or early 80486 without cpuid\n"));
+    }
+  else
+    {
+      char vendor_string[13];
+      char dummy_string[12];
+      long fms;
+      int family, model;
+
+      __gmpn_cpuid (vendor_string, 0);
+      vendor_string[12] = 0;
+
+      fms = __gmpn_cpuid (dummy_string, 1);
+      family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
+      model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
+
+      if (strcmp (vendor_string, "GenuineIntel") == 0)
+        {
+          switch (family)
+            {
+            case 4:
+              TRACE (printf ("  80486 with cpuid\n"));
+              break;
+
+            case 5:
+              TRACE (printf ("  pentium\n"));
+              CPUVEC_SETUP_pentium;
+              if (model == 4 || model == 8)
+                {
+                  TRACE (printf ("  pentiummmx\n"));
+                  CPUVEC_SETUP_pentium_mmx;
+                }
+              break;
+
+            case 6:
+              TRACE (printf ("  p6\n"));
+              CPUVEC_SETUP_p6;
+	      switch (model)
+		{
+		case 0x00:
+		case 0x01:
+		  TRACE (printf ("  pentiumpro\n"));
+		  break;
+
+		case 0x02:
+		case 0x03:
+		case 0x04:
+		case 0x05:
+		case 0x06:
+		  TRACE (printf ("  pentium2\n"));
+                  CPUVEC_SETUP_p6_mmx;
+		  break;
+
+		case 0x07:
+		case 0x08:
+		case 0x0a:
+		case 0x0b:
+		case 0x0c:
+		  TRACE (printf ("  pentium3\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+		  break;
+
+		case 0x09:		/* Banias */
+		case 0x0d:		/* Dothan */
+		case 0x0e:		/* Yonah */
+		  TRACE (printf ("  Banias/Dothan/Yonah\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  break;
+
+		case 0x0f:		/* Conroe Merom Kentsfield Allendale */
+		case 0x10:
+		case 0x11:
+		case 0x12:
+		case 0x13:
+		case 0x14:
+		case 0x15:
+		case 0x16:
+		case 0x17:		/* PNR Wolfdale Yorkfield */
+		case 0x18:
+		case 0x19:
+		case 0x1d:		/* PNR Dunnington */
+		  TRACE (printf ("  Conroe\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  CPUVEC_SETUP_core2;
+		  break;
+
+		case 0x1c:		/* Atom Silverthorne */
+		case 0x26:		/* Atom Lincroft */
+		case 0x27:		/* Atom Saltwell */
+		case 0x36:		/* Atom Cedarview/Saltwell */
+		  TRACE (printf ("  atom\n"));
+		  CPUVEC_SETUP_atom;
+		  CPUVEC_SETUP_atom_mmx;
+		  CPUVEC_SETUP_atom_sse2;
+		  break;
+
+		case 0x1a:		/* NHM Gainestown */
+		case 0x1b:
+		case 0x1e:		/* NHM Lynnfield/Jasper */
+		case 0x1f:
+		case 0x20:
+		case 0x21:
+		case 0x22:
+		case 0x23:
+		case 0x24:
+		case 0x25:		/* WSM Clarkdale/Arrandale */
+		case 0x28:
+		case 0x29:
+		case 0x2b:
+		case 0x2c:		/* WSM Gulftown */
+		case 0x2e:		/* NHM Beckton */
+		case 0x2f:		/* WSM Eagleton */
+		  TRACE (printf ("  nehalem/westmere\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  CPUVEC_SETUP_core2;
+		  CPUVEC_SETUP_coreinhm;
+		  break;
+
+		case 0x2a:		/* SBR */
+		case 0x2d:		/* SBR-EP */
+		case 0x3a:		/* IBR */
+		case 0x3c:		/* Haswell client */
+		case 0x3f:		/* Haswell server */
+		case 0x45:		/* Haswell ULT */
+		case 0x46:		/* Crystal Well */
+		case 0x3d:		/* Broadwell */
+		case 0x47:		/* Broadwell */
+		case 0x4f:		/* Broadwell server */
+		case 0x56:		/* Broadwell microserver */
+		case 0x4e:		/* Skylake client */
+		case 0x55:		/* Skylake server */
+		case 0x5e:		/* Skylake */
+		case 0x8e:		/* Kabylake */
+		case 0x9e:		/* Kabylake */
+		  TRACE (printf ("  sandybridge\n"));
+                  CPUVEC_SETUP_p6_mmx;
+                  CPUVEC_SETUP_p6_p3mmx;
+                  CPUVEC_SETUP_p6_sse2;
+		  CPUVEC_SETUP_core2;
+		  CPUVEC_SETUP_coreinhm;
+		  CPUVEC_SETUP_coreisbr;
+		  break;
+		}
+              break;
+
+            case 15:
+              TRACE (printf ("  pentium4\n"));
+              CPUVEC_SETUP_pentium4;
+              CPUVEC_SETUP_pentium4_mmx;
+              CPUVEC_SETUP_pentium4_sse2;
+              break;
+            }
+        }
+      else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+        {
+          switch (family)
+            {
+            case 5:
+              if (model <= 3)
+                {
+                  TRACE (printf ("  k5\n"));
+                }
+              else
+                {
+                  TRACE (printf ("  k6\n"));
+                  CPUVEC_SETUP_k6;
+                  CPUVEC_SETUP_k6_mmx;
+                  if (model >= 8)
+                    {
+                      TRACE (printf ("  k62\n"));
+                      CPUVEC_SETUP_k6_k62mmx;
+                    }
+                  if (model >= 9)
+                    {
+                      TRACE (printf ("  k63\n"));
+                    }
+                }
+              break;
+            case 6:
+              TRACE (printf ("  athlon\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+              break;
+
+            case 0x0f:		/* k8 */
+            case 0x11:		/* "fam 11h", mix of k8 and k10 */
+            case 0x13:		/* unknown, conservatively assume k8  */
+            case 0x16:		/* unknown, conservatively assume k8  */
+            case 0x17:		/* unknown, conservatively assume k8  */
+              TRACE (printf ("  k8\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+              CPUVEC_SETUP_k8;
+	      break;
+
+            case 0x10:		/* k10 */
+            case 0x12:		/* k10 (llano) */
+              TRACE (printf ("  k10\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+	      break;
+
+            case 0x14:		/* bobcat */
+              TRACE (printf ("  bobcat\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+              CPUVEC_SETUP_bt1;
+	      break;
+
+            case 0x15:		/* bulldozer */
+              TRACE (printf ("  bulldozer\n"));
+              CPUVEC_SETUP_k7;
+              CPUVEC_SETUP_k7_mmx;
+	      break;
+            }
+        }
+      else if (strcmp (vendor_string, "CentaurHauls") == 0)
+        {
+          switch (family)
+            {
+            case 6:
+              TRACE (printf ("  viac3\n"));
+              if (model >= 9)
+                {
+                  TRACE (printf ("  viac32\n"));
+                }
+	      if (model >= 15)
+		{
+                  TRACE (printf ("  nano\n"));
+		  CPUVEC_SETUP_nano;
+		}
+              break;
+            }
+        }
+      else if (strcmp (vendor_string, "CyrixInstead") == 0)
+        {
+          /* Should recognize Cyrix' processors too.  */
+          TRACE (printf ("  cyrix something\n"));
+        }
+    }
+
+  /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1.
+     Instead default to the plain versions from whichever CPU we detected.
+     The function arguments are compatible, so no glue code is needed.  */
+  if (decided_cpuvec.preinv_divrem_1 == NULL)
+    decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1;
+  if (decided_cpuvec.preinv_mod_1 == NULL)
+    decided_cpuvec.preinv_mod_1    =(preinv_mod_1_t)   decided_cpuvec.mod_1;
+
+  ASSERT_CPUVEC (decided_cpuvec);
+  CPUVEC_INSTALL (decided_cpuvec);
+
+  /* Set this once the threshold fields are ready.
+     Use volatile to prevent it getting moved.  */
+  *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
+}
diff --git a/third_party/gmp/mpn/x86/fat/fat_entry.asm b/third_party/gmp/mpn/x86/fat/fat_entry.asm
new file mode 100644
index 0000000..25655cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/fat_entry.asm
@@ -0,0 +1,243 @@
+dnl  x86 fat binary entrypoints.
+
+dnl  Copyright 2003, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl  Forcibly disable profiling.
+dnl
+dnl  The entrypoints and inits are small enough not to worry about; the real
+dnl  routines arrived at will have any profiling.  Also, the way the code
+dnl  here ends with a jump means we won't work properly with the
+dnl  "instrument" profiling scheme anyway.
+
+define(`WANT_PROFILING',no)
+
+
+	TEXT
+
+
+dnl  Usage: FAT_ENTRY(name, offset)
+dnl
+dnl  Emit a fat binary entrypoint function of the given name.  This is the
+dnl  normal entry for applications, eg. __gmpn_add_n.
+dnl
+dnl  The code simply jumps through the function pointer in __gmpn_cpuvec at
+dnl  the given "offset" (in bytes).
+dnl
+dnl  For non-PIC, the jumps are 5 bytes each; aligning them to 8 should be
+dnl  fine for all x86s.
+dnl
+dnl  For PIC, the jumps are 20 bytes each, and are best aligned to 16 to
+dnl  ensure at least the first two instructions don't cross a cache line
+dnl  boundary.
+dnl
+dnl  Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
+dnl  grepping in configure, stopping that code from trying to eval
+dnl  something with $1 in it.
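+dnl
+dnl  In C terms each entrypoint amounts to a tail call through the struct
+dnl  of pointers, e.g. (sketch, one representative function)
+dnl
+dnl	mp_limb_t
+dnl	mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+dnl	{ return __gmpn_cpuvec.add_n (rp, up, vp, n); }
+dnl
+dnl  except that the jmp leaves the caller's arguments untouched, so one
+dnl  stub shape serves every signature.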
+
+define(FAT_ENTRY,
+m4_assert_numargs(2)
+`	ALIGN(ifdef(`PIC',16,8))
+`'PROLOGUE($1)dnl
+ifdef(`PIC',`dnl
+ifdef(`DARWIN',`
+	call	L(movl_eip_edx)
+	movl	L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx
+	jmp	*m4_empty_if_zero($2)(%edx)
+',`dnl
+	call	L(movl_eip_edx)
+L(entry_here$2):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(entry_here$2)], %edx
+	movl	GSYM_PREFIX`'__gmpn_cpuvec@GOT(%edx), %edx
+	jmp	*m4_empty_if_zero($2)(%edx)
+')
+',`dnl non-PIC
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec+$2
+')
+EPILOGUE()
+')
+
+
+dnl  FAT_ENTRY for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_ENTRY(MPN(i),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 4))',
+CPUVEC_FUNCS_LIST)
+
+ifdef(`PIC',`
+	ALIGN(8)
+L(movl_eip_edx):
+	movl	(%esp), %edx
+	ret_internal
+ifdef(`DARWIN',`
+	.section	__IMPORT,__pointers,non_lazy_symbol_pointers
+L(___gmpn_cpuvec)$non_lazy_ptr:
+	.indirect_symbol	___gmpn_cpuvec
+	.long	0
+	TEXT
+')
+')
+
+
+dnl  Usage: FAT_INIT(name, offset)
+dnl
+dnl  Emit a fat binary initializer function of the given name.  These
+dnl  functions are the initial values for the pointers in __gmpn_cpuvec.
+dnl
+dnl  The code simply calls __gmpn_cpuvec_init, and then jumps back through
+dnl  the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
+dnl  __gmpn_cpuvec_init will have stored the address of the selected
+dnl  implementation there.
+dnl
+dnl  Only one of these routines will be executed, and only once, since after
+dnl  that all the __gmpn_cpuvec pointers go to real routines.  So there's no
+dnl  need for anything special here, just something small and simple.  To
+dnl  keep code size down, "fat_init" is a shared bit of code, arrived at
+dnl  with the offset in %al.  %al is used since the movb instruction is 2
+dnl  bytes where %eax would be 4.
+dnl
+dnl  Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the
+dnl  HAVE_NATIVE grepping in configure, preventing that code from trying
+dnl  to eval something with $1 in it.
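+dnl
+dnl  In C terms an init stub is roughly (sketch only)
+dnl
+dnl	mpn_add_n_init (args...)
+dnl	{ __gmpn_cpuvec_init ();  return __gmpn_cpuvec.add_n (args...); }
+dnl
+dnl  except that the asm jumps rather than calls, so the caller's
+dnl  arguments are passed along untouched.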
+
+define(FAT_INIT,
+m4_assert_numargs(2)
+`PROLOGUE($1)dnl
+	movb	$`'$2, %al
+	jmp	L(fat_init)
+EPILOGUE()
+')
+
+L(fat_init):
+	C al	__gmpn_cpuvec byte offset
+
+	movzbl	%al, %eax
+	pushl	%eax
+
+ifdef(`PIC',`dnl
+ifdef(`DARWIN',`
+	sub	$8, %esp
+	CALL(	__gmpn_cpuvec_init)
+	add	$8, %esp
+	call	L(movl_eip_edx)
+	movl	L(___gmpn_cpuvec)$non_lazy_ptr-.(%edx), %edx
+',`dnl
+	pushl	%ebx
+	call	L(movl_eip_ebx)
+L(init_here):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(init_here)], %ebx
+	CALL(	__gmpn_cpuvec_init)
+	movl	GSYM_PREFIX`'__gmpn_cpuvec@GOT(%ebx), %edx
+	popl	%ebx
+')
+	popl	%eax
+	jmp	*(%edx,%eax)
+
+L(movl_eip_ebx):
+	movl	(%esp), %ebx
+	ret_internal
+',`dnl non-PIC
+	sub	$8, %esp		C needed on Darwin, harmless elsewhere
+	CALL(	__gmpn_cpuvec_init)
+	add	$8, %esp		C needed on Darwin, harmless elsewhere
+	popl	%eax
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec(%eax)
+')
+
+dnl  FAT_INIT for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_INIT(MPN(i`'_init),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 4))',
+CPUVEC_FUNCS_LIST)
+
+
+
+C long __gmpn_cpuid (char dst[12], int id);
+C
+C This is called only once, so just something simple and compact is fine.
+
+defframe(PARAM_ID,  8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+PROLOGUE(__gmpn_cpuid)
+	pushl	%esi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+	movl	PARAM_ID, %eax
+	cpuid
+	movl	PARAM_DST, %esi
+	movl	%ebx, (%esi)
+	movl	%edx, 4(%esi)
+	movl	%ecx, 8(%esi)
+	popl	%ebx
+	popl	%esi
+	ret
+EPILOGUE()
+
+
+C int __gmpn_cpuid_available (void);
+C
+C Return non-zero if the cpuid instruction is available, which means late
+C model 80486 and higher.  80386 and early 80486 don't have cpuid.
+C
+C The test follows Intel AP-485 application note, namely that if bit 21 is
+C modifiable then cpuid is supported.  This test is reentrant and thread
+C safe, since of course any interrupt or context switch will preserve the
+C flags while we're tinkering with them.
+C
+C This is called only once, so just something simple and compact is fine.
+
+PROLOGUE(__gmpn_cpuid_available)
+	pushf
+	popl	%ecx		C old flags
+
+	movl	%ecx, %edx
+	xorl	$0x200000, %edx
+	pushl	%edx
+	popf
+	pushf
+	popl	%edx		C tweaked flags
+
+	movl	$1, %eax
+	cmpl	%ecx, %edx
+	jne	L(available)
+	xorl	%eax, %eax	C not changed, so cpuid not available
+
+L(available):
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/fat/gmp-mparam.h b/third_party/gmp/mpn/x86/fat/gmp-mparam.h
new file mode 100644
index 0000000..3641a6b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/gmp-mparam.h
@@ -0,0 +1,71 @@
+/* Fat binary x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2003, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes.  The only time
+   this might not be true currently is for actual 80386 and 80486 chips,
+   where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but
+   that's not worth worrying about.  */
+#define DIVEXACT_1_THRESHOLD  0
+
+/* Only some of the x86s have an mpn_preinv_divrem_1, but we set
+   USE_PREINV_DIVREM_1 so that all callers use it, and then let the
+   __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual
+   preinv.  */
+#define USE_PREINV_DIVREM_1   1
+
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
+   for mpn_sqr to call the latter.  */
+#define SQR_BASECASE_THRESHOLD 0
+
+/* Sensible fallbacks for these, when not taken from a cpu-specific
+   gmp-mparam.h.  */
+#define MUL_TOOM22_THRESHOLD      20
+#define MUL_TOOM33_THRESHOLD     130
+#define SQR_TOOM2_THRESHOLD       30
+#define SQR_TOOM3_THRESHOLD      200
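+
+/* (These cut over in the usual way: for instance mpn_mul_n uses the
+   schoolbook basecase below MUL_TOOM22_THRESHOLD limbs, Toom-22 from
+   there up to MUL_TOOM33_THRESHOLD, and so on upward; squaring follows
+   the SQR_* thresholds analogously.)  */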
+
+/* These are values more or less in the middle of what the typical x86 chips
+   come out as.  For a fat binary it's necessary to have values for these,
+   since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out
+   as non-constant array initializers.  FIXME: Perhaps these should be done
+   in the cpuvec structure like other thresholds.  */
+#define MUL_FFT_TABLE  { 464, 928, 1920, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          400
+#define MUL_FFT_THRESHOLD              2000
+
+#define SQR_FFT_TABLE  { 528, 1184, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          500
+#define SQR_FFT_THRESHOLD              3000
diff --git a/third_party/gmp/mpn/x86/fat/lshiftc.c b/third_party/gmp/mpn/x86/fat/lshiftc.c
new file mode 100644
index 0000000..9ecf489
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/lshiftc.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_lshiftc.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/lshiftc.c"
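
This one-line wrapper is the whole fat-binary fallback pattern: the generic C routine gets compiled so the runtime cpu vector always has a valid entry, and CPU detection then overwrites the slots for which tuned assembly exists. A self-contained sketch of that dispatch follows; all names are illustrative, loosely standing in for __gmpn_cpuvec and the initialisation that fat.c performs.

#include <stdio.h>

/* One function pointer per hot routine, in the style of __gmpn_cpuvec. */
struct cpuvec_sketch
{
  void (*mul_1) (void);
  void (*lshiftc) (void);
};

static void mul_1_generic (void)   { puts ("generic mul_1");   }
static void mul_1_k7 (void)        { puts ("K7 asm mul_1");    }
static void lshiftc_generic (void) { puts ("generic lshiftc"); }

static struct cpuvec_sketch cpuvec;

static void
cpuvec_init_sketch (int cpu_is_k7)
{
  /* Every slot first gets the generic fallback... */
  cpuvec.mul_1   = mul_1_generic;
  cpuvec.lshiftc = lshiftc_generic;

  /* ...then tuned entries overwrite where assembly exists.  There is
     no K7 lshiftc in this sketch, so that slot keeps the fallback. */
  if (cpu_is_k7)
    cpuvec.mul_1 = mul_1_k7;
}

int main (void)
{
  cpuvec_init_sketch (1);
  cpuvec.mul_1 ();     /* "K7 asm mul_1"    */
  cpuvec.lshiftc ();   /* "generic lshiftc" */
  return 0;
}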
diff --git a/third_party/gmp/mpn/x86/fat/mod_1.c b/third_party/gmp/mpn/x86/fat/mod_1.c
new file mode 100644
index 0000000..4f149cc
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mod_1.
+
+Copyright 2003, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mod_1.c"
diff --git a/third_party/gmp/mpn/x86/fat/mod_1_1.c b/third_party/gmp/mpn/x86/fat/mod_1_1.c
new file mode 100644
index 0000000..92eaa7a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1_1.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1_1p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+PROLOGUE(mpn_mod_1_1p_cps)
+*/
+
+#define OPERATION_mod_1_1_cps 1
+#include "mpn/generic/mod_1_1.c"
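
The OPERATION_mod_1_1_cps define selects which entry point the shared generic source emits; each thin wrapper defines one OPERATION_* macro and includes the same file. A toy sketch of the convention, with the shared source inlined for brevity and illustrative names throughout:

#include <stdio.h>

#define OPERATION_mod_1_1_cps 1   /* what the wrapper above defines */

/* ---- imagine this is the shared generic source ---- */
#if defined (OPERATION_mod_1_1)
static void entry_sketch (void) { puts ("mod_1_1p variant emitted"); }
#elif defined (OPERATION_mod_1_1_cps)
static void entry_sketch (void) { puts ("mod_1_1p_cps variant emitted"); }
#endif
/* ---- end of the shared generic source ---- */

int main (void)
{
  entry_sketch ();   /* prints "mod_1_1p_cps variant emitted" */
  return 0;
}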
diff --git a/third_party/gmp/mpn/x86/fat/mod_1_2.c b/third_party/gmp/mpn/x86/fat/mod_1_2.c
new file mode 100644
index 0000000..9095a61
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1_2.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1s_2p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+PROLOGUE(mpn_mod_1s_2p_cps)
+*/
+
+#define OPERATION_mod_1_2_cps 1
+#include "mpn/generic/mod_1_2.c"
diff --git a/third_party/gmp/mpn/x86/fat/mod_1_4.c b/third_party/gmp/mpn/x86/fat/mod_1_4.c
new file mode 100644
index 0000000..51c0def
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mod_1_4.c
@@ -0,0 +1,36 @@
+/* Fat binary fallback mpn_mod_1s_4p.
+
+Copyright 2003, 2009, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+/*
+PROLOGUE(mpn_mod_1s_4p_cps)
+*/
+
+#define OPERATION_mod_1_4_cps 1
+#include "mpn/generic/mod_1_4.c"
diff --git a/third_party/gmp/mpn/x86/fat/mode1o.c b/third_party/gmp/mpn/x86/fat/mode1o.c
new file mode 100644
index 0000000..870ddb8
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mode1o.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_modexact_1c_odd.
+
+Copyright 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mode1o.c"
diff --git a/third_party/gmp/mpn/x86/fat/mullo_basecase.c b/third_party/gmp/mpn/x86/fat/mullo_basecase.c
new file mode 100644
index 0000000..7f86be6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/mullo_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mullo_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mullo_basecase.c"
diff --git a/third_party/gmp/mpn/x86/fat/redc_1.c b/third_party/gmp/mpn/x86/fat/redc_1.c
new file mode 100644
index 0000000..0025403
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/redc_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_1.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/redc_1.c"
diff --git a/third_party/gmp/mpn/x86/fat/redc_2.c b/third_party/gmp/mpn/x86/fat/redc_2.c
new file mode 100644
index 0000000..1932d58
--- /dev/null
+++ b/third_party/gmp/mpn/x86/fat/redc_2.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_2.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/redc_2.c"
diff --git a/third_party/gmp/mpn/x86/gcd_11.asm b/third_party/gmp/mpn/x86/gcd_11.asm
new file mode 100644
index 0000000..af69135
--- /dev/null
+++ b/third_party/gmp/mpn/x86/gcd_11.asm
@@ -0,0 +1,126 @@
+dnl  x86 mpn_gcd_11 optimised for processors with slow BSF.
+
+dnl  Based on C version.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl  Rudimentary code for x86-32, i.e. for CPUs without cmov.  Also, the bsf
+dnl  instruction is assumed to be so slow it is useless.  Instead a table is
+dnl  used.
+dnl
+dnl  The loop benefits from OoO; in-order CPUs might want a different loop.
+dnl  The ebx and ecx registers could be combined if the assignment of ecx were
+dnl  postponed until ebx died, but that would at least hurt in-order CPUs.
+
+C	     cycles/bit (approx)
+C AMD K7	 ?
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 ?
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4-2	 ?
+C Intel P4-3/4	 ?
+C Intel P6/13	 ?
+C Intel CNR	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), u0
+	mov	20(%esp), v0
+	LEAL(	ctz_table, %esi)
+	sub	v0, u0			C u = u - v		0
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	sbb	%ebx, %ebx		C mask			1
+	mov	u0, %edi		C			1
+	mov	u0, %ecx		C			1
+	and	%ebx, %edi		C			2
+	xor	%ebx, u0		C			2
+	add	%edi, v0		C v = min(u,v)		3
+	sub	%ebx, u0		C u = |u - v|		3
+L(mid):	and	$MASK, %ecx		C			2
+	movzbl	(%esi,%ecx), %ecx	C			3
+	jz	L(shift_alot)
+	shr	%cl, u0			C			4
+	sub	v0, u0			C u = u - v		0,5
+	jnz	L(top)
+
+L(end):	mov	v0, %eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+L(shift_alot):
+	shr	$MAXSHIFT, u0
+	mov	u0, %ecx
+	jmp	L(mid)
+EPILOGUE()
+ASM_END()
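
The file notes it is based on a C version; a plain C rendering of the same table-driven binary GCD follows, assuming (as mpn_gcd_11 requires) that both inputs are odd and nonzero. It is written for clarity rather than speed, and uses a branch where the assembly uses the sbb mask trick.

#include <stdio.h>

#define MAXSHIFT 6
#define MASK ((1 << MAXSHIFT) - 1)

static unsigned char ctz_table[1 << MAXSHIFT];

static void
ctz_table_init (void)
{
  int i;
  ctz_table[0] = MAXSHIFT;   /* low bits all zero: shift a lot, retry */
  for (i = 1; i <= MASK; i++)
    {
      unsigned c = 0, x = (unsigned) i;
      while ((x & 1) == 0) { x >>= 1; c++; }
      ctz_table[i] = (unsigned char) c;
    }
}

static unsigned long
gcd_11_sketch (unsigned long u, unsigned long v)
{
  while (u != v)
    {
      if (u < v) { unsigned long t = u; u = v; v = t; }  /* asm: mask trick */
      u -= v;                        /* even and nonzero, since u != v */
      while ((u & 1) == 0)
        u >>= ctz_table[u & MASK];   /* table-driven ctz, as above */
    }
  return u;
}

int main (void)
{
  ctz_table_init ();
  printf ("%lu\n", gcd_11_sketch (693, 147));   /* both odd; prints 21 */
  return 0;
}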
diff --git a/third_party/gmp/mpn/x86/geode/gmp-mparam.h b/third_party/gmp/mpn/x86/geode/gmp-mparam.h
new file mode 100644
index 0000000..cc9c9f1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/geode/gmp-mparam.h
@@ -0,0 +1,141 @@
+/* Geode gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2002, 2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2011-01-30, gcc 3.4 */
+
+#define MOD_1_NORM_THRESHOLD                 6
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD  MP_SIZE_T_MAX  /* never */
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           42
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                66
+#define MUL_TOOM44_THRESHOLD               105
+#define MUL_TOOM6H_THRESHOLD               141
+#define MUL_TOOM8H_THRESHOLD               212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      62
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      67
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 33
+#define SQR_TOOM3_THRESHOLD                 60
+#define SQR_TOOM4_THRESHOLD                136
+#define SQR_TOOM6_THRESHOLD                196
+#define SQR_TOOM8_THRESHOLD                292
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             468  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    468, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 61
+#define MUL_FFT_THRESHOLD                 5504
+
+#define SQR_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    396, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    159, 8}, \
+    {    319,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 61
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  37
+#define MULLO_MUL_N_THRESHOLD            10950
+
+#define DC_DIV_QR_THRESHOLD                 59
+#define DC_DIVAPPR_Q_THRESHOLD             189
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                136
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               183
+#define INV_APPR_THRESHOLD                 181
+
+#define BINV_NEWTON_THRESHOLD              204
+#define REDC_1_TO_REDC_N_THRESHOLD          54
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               81
+#define MU_BDIV_QR_THRESHOLD               889
+#define MU_BDIV_Q_THRESHOLD                998
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                     133
+#define GCD_DC_THRESHOLD                   451
+#define GCDEXT_DC_THRESHOLD                318
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        30
+#define SET_STR_DC_THRESHOLD               547
+#define SET_STR_PRECOMPUTE_THRESHOLD      1049
diff --git a/third_party/gmp/mpn/x86/gmp-mparam.h b/third_party/gmp/mpn/x86/gmp-mparam.h
new file mode 100644
index 0000000..2cb1984
--- /dev/null
+++ b/third_party/gmp/mpn/x86/gmp-mparam.h
@@ -0,0 +1,38 @@
+/* Generic x86 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2002 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* Generic x86 mpn_divexact_1 is faster than generic x86 mpn_divrem_1 on all
+   of p5, p6, k6 and k7, so use it always.  It's probably slower on 386 and
+   486, but that's too bad.  */
+#define DIVEXACT_1_THRESHOLD  0
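
What makes a dedicated exact-division routine worthwhile: when n is known to be an exact multiple of an odd divisor d, the quotient is simply n times d's inverse modulo 2^32, so no hardware divide is needed. A single-limb sketch of the trick follows; GMP's mpn_divexact_1 applies it limb by limb with borrow propagation, which this sketch omits.

#include <stdio.h>
#include <stdint.h>

/* Inverse of odd d modulo 2^32 via Newton iteration x -> x*(2 - d*x);
   the number of correct low bits doubles each step: 3, 6, 12, 24, 48. */
static uint32_t
binv32 (uint32_t d)
{
  uint32_t x = d;        /* correct to 3 bits for any odd d */
  x *= 2 - d * x;
  x *= 2 - d * x;
  x *= 2 - d * x;
  x *= 2 - d * x;
  return x;
}

int main (void)
{
  uint32_t d = 11;
  uint32_t n = d * 123456789u;     /* an exact multiple of d */
  uint32_t q = n * binv32 (d);     /* q = n/d, computed mod 2^32 */
  printf ("%u\n", q);              /* prints 123456789 */
  return 0;
}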
diff --git a/third_party/gmp/mpn/x86/goldmont/gmp-mparam.h b/third_party/gmp/mpn/x86/goldmont/gmp-mparam.h
new file mode 100644
index 0000000..3d37fa3
--- /dev/null
+++ b/third_party/gmp/mpn/x86/goldmont/gmp-mparam.h
@@ -0,0 +1,219 @@
+/* Intel Goldmont/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 7
+#define MOD_1_UNNORM_THRESHOLD              12
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 32.79% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             32
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           23
+
+#define DIV_1_VS_MUL_1_PERCENT             228
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               193
+#define MUL_TOOM6H_THRESHOLD               286
+#define MUL_TOOM8H_THRESHOLD               399
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     125
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     137
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     185
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                113
+#define SQR_TOOM4_THRESHOLD                280
+#define SQR_TOOM6_THRESHOLD                399
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             60
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    368, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287, 8}, {    575, 9}, {    303,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,10}, {    303, 9}, \
+    {    607,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    703,10}, \
+    {   1407,11}, {    735,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,10}, {   2431,12}, \
+    {    639,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    831,11}, {   1663,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,12}, {   7679,15}, \
+    {   1023,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2559,13}, {   5119,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define MUL_FFT_TABLE3_SIZE 171
+#define MUL_FFT_THRESHOLD                 3712
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     79, 9}, {    159, 8}, {    319,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303, 8}, {    607, 9}, {    319,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415, 9}, {    831,11}, \
+    {    223,10}, {    479,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,12}, {    383,11}, {    831,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    831,11}, \
+    {   1663,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1215,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3839,12}, {   7679,15}, {   1023,14}, \
+    {   2047,13}, {   4095,14}, {   2303,13}, {   4991,12}, \
+    {   9983,14}, {   2815,13}, {   5887,15}, {   1535,14}, \
+    {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 170
+#define SQR_FFT_THRESHOLD                 3520
+
+#define MULLO_BASECASE_THRESHOLD             5
+#define MULLO_DC_THRESHOLD                  50
+#define MULLO_MUL_N_THRESHOLD             6633
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  95
+#define SQRLO_SQR_THRESHOLD               6633
+
+#define DC_DIV_QR_THRESHOLD                 68
+#define DC_DIVAPPR_Q_THRESHOLD             204
+#define DC_BDIV_QR_THRESHOLD                64
+#define DC_BDIV_Q_THRESHOLD                108
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               276
+#define INV_APPR_THRESHOLD                 226
+
+#define BINV_NEWTON_THRESHOLD              298
+#define REDC_1_TO_REDC_N_THRESHOLD          65
+
+#define MU_DIV_QR_THRESHOLD               1528
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD              140
+#define MU_BDIV_QR_THRESHOLD              1334
+#define MU_BDIV_Q_THRESHOLD               1499
+
+#define POWM_SEC_TABLE  3,16,96,428,1317
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD               704
+#define SET_STR_PRECOMPUTE_THRESHOLD      1358
+
+#define FAC_DSC_THRESHOLD                   95
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD2_DIV1_METHOD                    1  /* 5.53% faster than 3 */
+#define HGCD_THRESHOLD                     172
+#define HGCD_APPR_THRESHOLD                204
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   610
+#define GCDEXT_DC_THRESHOLD                443
+#define JACOBI_BASE_METHOD                   4  /* 6.53% faster than 3 */
+
+/* Tuneup completed successfully, took 101563 seconds */
diff --git a/third_party/gmp/mpn/x86/i486/gmp-mparam.h b/third_party/gmp/mpn/x86/i486/gmp-mparam.h
new file mode 100644
index 0000000..aa7dbad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/i486/gmp-mparam.h
@@ -0,0 +1,69 @@
+/* 80486 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2001-2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 100MHz DX4 */
+
+/* Generated by tuneup.c, 2003-02-13, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD             18
+#define MUL_TOOM33_THRESHOLD            228
+
+#define SQR_BASECASE_THRESHOLD           13
+#define SQR_TOOM2_THRESHOLD              49
+#define SQR_TOOM3_THRESHOLD             238
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 72
+#define POWM_THRESHOLD                   38
+
+#define GCD_ACCEL_THRESHOLD               3
+#define JACOBI_BASE_METHOD                2
+
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  0
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD         17
+
+#define GET_STR_DC_THRESHOLD             32
+#define GET_STR_PRECOMPUTE_THRESHOLD     82
+#define SET_STR_THRESHOLD              3524
+
+#define MUL_FFT_TABLE  { 464, 928, 1920, 4608, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          392
+#define MUL_FFT_THRESHOLD              2816
+
+#define SQR_FFT_TABLE  { 432, 928, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          392
+#define SQR_FFT_THRESHOLD              2816
diff --git a/third_party/gmp/mpn/x86/k10/gmp-mparam.h b/third_party/gmp/mpn/x86/k10/gmp-mparam.h
new file mode 100644
index 0000000..eceaaae
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k10/gmp-mparam.h
@@ -0,0 +1,217 @@
+/* x86/k10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014-2015 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3200-3600 MHz K10 Thuban */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         14
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 29.33% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           35
+
+#define DIV_1_VS_MUL_1_PERCENT             258
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               124
+#define MUL_TOOM6H_THRESHOLD               274
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      85
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     113
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 26
+#define SQR_TOOM3_THRESHOLD                105
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                238
+#define SQR_TOOM8_THRESHOLD                309
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     21, 6}, {     11, 5}, {     25, 6}, \
+    {     13, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,11}, \
+    {    223,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,12}, {    191,11}, {    383,10}, {    799,11}, \
+    {    415,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471, 9}, \
+    {   2943,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3967,15}, {   1023,14}, \
+    {   2047,13}, {   4479,14}, {   2303,13}, {   4991,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 168
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127,10}, {     79, 9}, \
+    {    159,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351,11}, {    191,10}, {    383, 9}, {    767,10}, \
+    {    399, 9}, {    799,10}, {    415,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671, 9}, {   1343,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    799, 9}, \
+    {   1599,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,10}, \
+    {   1471,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    959,10}, \
+    {   1919,11}, {    991,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    959,11}, {   1919,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2495,13}, {   1279,12}, {   2623,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4351,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4351,14}, \
+    {   2303,13}, {   4991,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 166
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             6
+#define MULLO_DC_THRESHOLD                  40
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                 113
+#define SQRLO_SQR_THRESHOLD              10323
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             248
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                158
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               254
+#define INV_APPR_THRESHOLD                 252
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1558
+#define MUPI_DIV_QR_THRESHOLD              114
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1524
+
+#define POWM_SEC_TABLE  1,16,102,416,1378
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD      1105
+
+#define FAC_DSC_THRESHOLD                  159
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 0.70% faster than 4 */
+#define HGCD_THRESHOLD                     130
+#define HGCD_APPR_THRESHOLD                163
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   573
+#define GCDEXT_DC_THRESHOLD                393
+#define JACOBI_BASE_METHOD                   4  /* 9.13% faster than 1 */
+
+/* Tuneup completed successfully, took 52901 seconds */
diff --git a/third_party/gmp/mpn/x86/k6/README b/third_party/gmp/mpn/x86/k6/README
new file mode 100644
index 0000000..1d65af3
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/README
@@ -0,0 +1,251 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+			AMD K6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for AMD K6 CPUs, meaning K6, K6-2 and
+K6-3.
+
+The mmx subdirectory has MMX code suiting plain K6; the k62mmx subdirectory
+has MMX code suiting K6-2 and K6-3.  All chips in the K6 family have MMX;
+the separate directories are just so that ./configure can omit them if the
+assembler doesn't support MMX.
+
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+
+                                 cycles/limb
+
+	mpn_add_n/sub_n            3.25 normal, 2.75 in-place
+
+	mpn_mul_1                  6.25
+	mpn_add/submul_1           7.65-8.4  (varying with data values)
+
+	mpn_mul_basecase           9.25 cycles/crossproduct (approx)
+	mpn_sqr_basecase           4.7  cycles/crossproduct (approx)
+                                   or 9.2 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               3.0
+
+	mpn_divrem_1              20.0
+	mpn_mod_1                 20.0
+	mpn_divexact_by3          11.0
+
+	mpn_copyi                  1.0
+	mpn_copyd                  1.0
+
+
+K6-2 and K6-3 have dual-issue MMX and get the following improvements.
+
+	mpn_l/rshift               1.75
+
+
+Prefetching of sources hasn't yet given any joy.  With the 3DNow "prefetch"
+instruction, code seems to run slower, and with just "mov" loads it doesn't
+seem faster.  Results so far are inconsistent.  The K6 does a hardware
+prefetch of the second cache line in a sector, so the penalty for not
+prefetching in software is reduced.
+
+
+
+
+NOTES
+
+All K6 family chips have MMX, but only K6-2 and K6-3 have 3DNow.
+
+Plain K6 executes MMX instructions only in the X pipe, but K6-2 and K6-3 can
+execute them in both X and Y (and in both together).
+
+Branch misprediction penalty is 1 to 4 cycles (Optimization Manual
+chapter 6 table 12).
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+Store queue is 7 entries of 64 bits each.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead.  The unrolling is
+configurable up to 32 limbs/loop for most routines, up to 64 for some.
+
+Sometimes computed jumps into the unrolling are used to handle sizes not a
+multiple of the unrolling.  An attractive feature of this is that times
+increase smoothly with operand size, but an indirect jump costs about 6
+cycles and the setup about another 6, so whether a computed jump is
+worthwhile depends on how much faster the unrolled code is than a simple
+loop.
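
The same idea can be written in C as a switch that jumps into an unrolled loop (Duff's device); the assembly uses an indirect jmp instead. An illustrative 4-way unrolled copy, a sketch rather than any actual mpn routine:

#include <stddef.h>

static void
copy_sketch (long *dst, const long *src, size_t n)
{
  if (n == 0)
    return;

  size_t iters = (n + 3) / 4;     /* ceil(n/4) passes through the loop */
  switch (n % 4)                  /* computed jump into the unrolling  */
    {
    case 0: do { *dst++ = *src++;
    case 3:      *dst++ = *src++;
    case 2:      *dst++ = *src++;
    case 1:      *dst++ = *src++;
            } while (--iters > 0);
    }
}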
+
+Position independent code is implemented using a call to get eip for
+computed jumps, and the matching ret is always executed rather than an
+addl $4,%esp or a popl, so the CPU's return address branch prediction
+stack stays synchronised with the actual stack in memory.  Such a call
+still costs 4 to 7 cycles, however.
+
+Branch prediction, in absence of any history, will guess forward jumps are
+not taken and backward jumps are taken.  Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
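
At the C level the same placement intent is usually expressed with a branch hint, so the compiler lays out the rare case as a forward branch; the assembly here simply arranges the code by hand. A GCC-style sketch:

/* GCC/Clang builtin; the hint only influences code layout, not semantics. */
#define unlikely(x)  __builtin_expect (!!(x), 0)

int
checked_inc_sketch (const int *p)
{
  if (unlikely (p == 0))   /* rare case: forward branch, guessed not taken */
    return -1;
  return *p + 1;           /* common case falls straight through */
}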
+
+
+
+MMX
+
+Putting emms or femms as late as possible in a routine seems to be fastest.
+Perhaps an emms or femms stalls until all outstanding MMX instructions have
+completed, so putting it later gives them a chance to complete on their own,
+in parallel with other operations (like register popping).
+
+The Optimization Manual chapter 5 recommends using a femms on K6-2 and K6-3
+at the start of a routine, in case it's been preceded by x87 floating point
+operations.  This isn't done because in gmp programs it's expected that x87
+floating point won't be much used and that chances are an mpn routine won't
+have been preceded by any x87 code.
+
+
+
+CODING
+
+Instructions in general code are shown paired if they can decode and execute
+together, meaning two short decode instructions with the second not
+depending on the first, only the first using the shifter, no more than one
+load, and no more than one store.
+
+K6 does some out of order execution, so the pairings aren't essential; they
+just show what slots might be available.  When decoding is the limiting
+factor things can be scheduled that might not execute until later.
+
+
+
+NOTES
+
+Code alignment
+
+- if an opcode/modrm or 0Fh/opcode/modrm crosses a cache line boundary,
+  short decode is inhibited.  The cross.pl script detects this.
+
+- loops and branch targets should be aligned to 16 bytes, or ensure at least
+  2 instructions before a 32 byte boundary.  This makes use of the 16 byte
+  cache in the BTB.
+
+Addressing modes
+
+- (%esi) degrades decoding from short to vector.  0(%esi) doesn't have this
+  problem and can be used as an equivalent, or it may be easier just to use
+  a different register, like %ebx.
+
+- K6 and pre-CXT core K6-2 have the following problem.  (K6-2 CXT and K6-3
+  have it fixed, these being cpuid function 1 signatures 0x588 to 0x58F).
+
+  If more than 3 bytes are needed to determine instruction length then
+  decoding degrades from direct to long, or from long to vector.  This
+  happens with forms like "0F opcode mod/rm" with mod/rm=00-xxx-100 since
+  with mod=00 the sib determines whether there's a displacement.
+
+  This affects all MMX and 3DNow instructions, and others with an 0F prefix,
+  like movzbl.  The modes affected are anything with an index and no
+  displacement, or an index but no base, and this includes (%esp) which is
+  really (,%esp,1).
+
+  The cross.pl script detects problem cases.  The workaround is to always
+  use a displacement, and to do this with Zdisp if it's zero so the
+  assembler doesn't discard it.
+
+  See Optimization Manual rev D page 67 and 3DNow Porting Guide rev B pages
+  13-14 and 36-37.
+
+Calls
+
+- indirect jumps and calls are not branch predicted, they measure about 6
+  cycles.
+
+Various
+
+- adcl      2 cycles of decode, maybe 2 cycles executing in the X pipe
+- bsf       12-27 cycles
+- emms      5 cycles
+- femms     3 cycles
+- jecxz     2 cycles taken, 13 not taken (optimization manual says 7 not taken)
+- divl      20 cycles back-to-back
+- imull     2 decode, 3 execute
+- mull      2 decode, 3 execute (optimization manual decoding sample)
+- prefetch  2 cycles
+- rcll/rcrl implicit by one bit: 2 cycles
+            immediate or %cl count: 11 + 2 per bit for dword
+                                    13 + 4 per bit for byte
+- setCC	    2 cycles
+- xchgl	%eax,reg  1.5 cycles, back-to-back (strange)
+        reg,reg   2 cycles, back-to-back
+
+
+
+
+REFERENCES
+
+"AMD-K6 Processor Code Optimization Application Note", AMD publication
+number 21924, revision D amendment 0, January 2000.  This describes K6-2 and
+K6-3.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21924.pdf
+
+"AMD-K6 MMX Enhanced Processor x86 Code Optimization Application Note", AMD
+publication number 21828, revision A amendment 0, August 1997.  This is an
+older edition of the above document, describing plain K6.  Available
+on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21828.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions, but nothing else from
+3DNow has been used.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999.  This has some notes on general K6 optimizations as well as
+3DNow.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/k6/aors_n.asm b/third_party/gmp/mpn/x86/k6/aors_n.asm
new file mode 100644
index 0000000..168f9b4
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/aors_n.asm
@@ -0,0 +1,337 @@
+dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
+
+
+ifdef(`OPERATION_add_n', `
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+	define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+	define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                      mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size.  The return value is the carry bit from the top of the result
+C (1 or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation.  Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
+C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
+C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
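+C
+C For reference, here is a minimal C sketch of the add case (an
+C illustration only, not the code below; names are hypothetical and 32-bit
+C limbs are assumed).  Subtract is the same with borrows in place of
+C carries.
+C
+C	mp_limb_t ref_add_nc (mp_limb_t *dst, const mp_limb_t *s1,
+C	                      const mp_limb_t *s2, mp_size_t n, mp_limb_t cy)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t a = s1[i], sum = a + s2[i] + cy;
+C	      cy = cy ? (sum <= a) : (sum < a);	/* carry out */
+C	      dst[i] = sum;
+C	    }
+C	  return cy;
+C	}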
+
+define(PARAM_CARRY, `FRAME+20(%esp)')
+define(PARAM_SIZE,  `FRAME+16(%esp)')
+define(PARAM_SRC2,  `FRAME+12(%esp)')
+define(PARAM_SRC1,  `FRAME+8(%esp)')
+define(PARAM_DST,   `FRAME+4(%esp)')
+deflit(`FRAME',0)
+
+dnl  minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function_nc)
+	movl	PARAM_CARRY, %eax
+	jmp	L(start)
+EPILOGUE()
+
+
+PROLOGUE(M4_function_n)
+	xorl	%eax, %eax
+L(start):
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebx
+FRAME_pushl()
+
+	movl	PARAM_SRC1, %ebx
+	pushl	%edi
+FRAME_pushl()
+
+	movl	PARAM_SRC2, %edx
+	cmpl	$UNROLL_THRESHOLD, %ecx
+
+	movl	PARAM_DST, %edi
+	jae	L(unroll)
+
+
+	shrl	%eax		C initial carry flag
+
+	C offset 0x21 here, close enough to aligned
+L(simple):
+	C eax	scratch
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C The store to (%edi) could be done with a stosl; it'd be smaller
+	C code, but there's no speed gain and a cld would have to be added
+	C (per mpn/x86/README).
+
+	movl	(%ebx), %eax
+	leal	4(%ebx), %ebx
+
+	M4_inst	(%edx), %eax
+
+	movl	%eax, (%edi)
+	leal	4(%edi), %edi
+
+	leal	4(%edx), %edx
+	loop	L(simple)
+
+
+	movl	$0, %eax
+	popl	%edi
+
+	setc	%al
+
+	popl	%ebx
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(unroll):
+	C eax	carry
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+
+	cmpl	%edi, %ebx
+	pushl	%esi
+
+	je	L(inplace)
+
+ifdef(`OPERATION_add_n',`
+	cmpl	%edi, %edx
+
+	je	L(inplace_reverse)
+')
+
+	movl	%ecx, %esi
+
+	andl	$-4, %ecx
+	andl	$3, %esi
+
+	leal	(%ebx,%ecx,4), %ebx
+	leal	(%edx,%ecx,4), %edx
+	leal	(%edi,%ecx,4), %edi
+
+	negl	%ecx
+	shrl	%eax
+
+	ALIGN(32)
+L(normal_top):
+	C eax	scratch
+	C ebx	src1
+	C ecx	counter, limbs, negative
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	leal	5(%ecx), %ecx
+	M4_inst	-20(%edx,%ecx,4), %eax
+	movl	%eax, -20(%edi,%ecx,4)
+
+	movl	4-20(%ebx,%ecx,4), %eax
+	M4_inst	4-20(%edx,%ecx,4), %eax
+	movl	%eax, 4-20(%edi,%ecx,4)
+
+	movl	8-20(%ebx,%ecx,4), %eax
+	M4_inst	8-20(%edx,%ecx,4), %eax
+	movl	%eax, 8-20(%edi,%ecx,4)
+
+	movl	12-20(%ebx,%ecx,4), %eax
+	M4_inst	12-20(%edx,%ecx,4), %eax
+	movl	%eax, 12-20(%edi,%ecx,4)
+
+	loop	L(normal_top)
+
+
+	decl	%esi
+	jz	L(normal_finish_one)
+	js	L(normal_done)
+
+	C two or three more limbs
+
+	movl	(%ebx), %eax
+	M4_inst	(%edx), %eax
+	movl	%eax, (%edi)
+
+	movl	4(%ebx), %eax
+	M4_inst	4(%edx), %eax
+	decl	%esi
+	movl	%eax, 4(%edi)
+
+	jz	L(normal_done)
+	movl	$2, %ecx
+
+L(normal_finish_one):
+	movl	(%ebx,%ecx,4), %eax
+	M4_inst	(%edx,%ecx,4), %eax
+	movl	%eax, (%edi,%ecx,4)
+
+L(normal_done):
+	popl	%esi
+	popl	%edi
+
+	movl	$0, %eax
+	popl	%ebx
+
+	setc	%al
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+
+ifdef(`OPERATION_add_n',`
+L(inplace_reverse):
+	C dst==src2
+
+	movl	%ebx, %edx
+')
+
+L(inplace):
+	C eax	initial carry
+	C ebx
+	C ecx	size
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	leal	-1(%ecx), %esi
+	decl	%ecx
+
+	andl	$-4, %ecx
+	andl	$3, %esi
+
+	movl	(%edx), %ebx		C src low limb
+	leal	(%edx,%ecx,4), %edx
+
+	leal	(%edi,%ecx,4), %edi
+	negl	%ecx
+
+	shrl	%eax
+
+
+	ALIGN(32)
+L(inplace_top):
+	C eax	scratch
+	C ebx	next src limb
+	C ecx	counter, limbs, negative
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	M4_inst	%ebx, (%edi,%ecx,4)
+
+	movl	4(%edx,%ecx,4), %eax
+	leal	5(%ecx), %ecx
+
+	M4_inst	%eax, 4-20(%edi,%ecx,4)
+
+	movl	8-20(%edx,%ecx,4), %eax
+	movl	12-20(%edx,%ecx,4), %ebx
+
+	M4_inst	%eax, 8-20(%edi,%ecx,4)
+	M4_inst	%ebx, 12-20(%edi,%ecx,4)
+
+	movl	16-20(%edx,%ecx,4), %ebx
+	loop	L(inplace_top)
+
+
+	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
+
+	M4_inst	%ebx, (%edi)
+
+	decl	%esi
+	jz	L(inplace_finish_one)
+	js	L(inplace_done)
+
+	C two or three more limbs
+
+	movl	4(%edx), %eax
+	movl	8(%edx), %ebx
+	M4_inst	%eax, 4(%edi)
+	M4_inst	%ebx, 8(%edi)
+
+	decl	%esi
+	movl	$2, %ecx
+
+	jz	L(normal_done)
+
+L(inplace_finish_one):
+	movl	4(%edx,%ecx,4), %eax
+	M4_inst	%eax, 4(%edi,%ecx,4)
+
+L(inplace_done):
+	popl	%esi
+	popl	%edi
+
+	movl	$0, %eax
+	popl	%ebx
+
+	setc	%al
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/aorsmul_1.asm b/third_party/gmp/mpn/x86/k6/aorsmul_1.asm
new file mode 100644
index 0000000..eaa92eb
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/aorsmul_1.asm
@@ -0,0 +1,391 @@
+dnl  AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1999-2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12		 5.94
+C P6 model 9  (Banias)		 5.51
+C P6 model 13 (Dothan)		 5.57
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6			7.65-8.5 (data dependent)
+C AMD K7
+C AMD K8
+
+
+dnl  K6:           large multipliers  small multipliers
+dnl  UNROLL_COUNT    cycles/limb       cycles/limb
+dnl        4             9.5              7.78
+dnl        8             9.0              7.78
+dnl       16             8.4              7.65
+dnl       32             8.4              8.2
+dnl
+dnl  Maximum possible unrolling with the current code is 32.
+dnl
+dnl  Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
+dnl  byte block, which might explain the good speed at that unrolling.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+	define(M4_inst,        addl)
+	define(M4_function_1,  mpn_addmul_1)
+	define(M4_function_1c, mpn_addmul_1c)
+',`ifdef(`OPERATION_submul_1', `
+	define(M4_inst,        subl)
+	define(M4_function_1,  mpn_submul_1)
+	define(M4_function_1c, mpn_submul_1c)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
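+C For reference, a minimal C sketch of the addmul operation (an
+C illustration only, not this code; names are hypothetical, and 32-bit
+C limbs with a 64-bit intermediate are assumed; submul is the same with a
+C subtract in place of the add):
+C
+C	mp_limb_t ref_addmul_1c (mp_limb_t *dst, const mp_limb_t *src,
+C	                         mp_size_t n, mp_limb_t mult, mp_limb_t cy)
+C	{
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned long long p = (unsigned long long) src[i] * mult + cy;
+C	      mp_limb_t lo = (mp_limb_t) p, d = dst[i];
+C	      dst[i] = d + lo;
+C	      cy = (mp_limb_t) (p >> 32) + (dst[i] < d);  /* can't overflow */
+C	    }
+C	  return cy;	/* carry limb out the top */
+C	}
+C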
+C The jadcl0()s in the unrolled loop make the speed data dependent.  Small
+C multipliers (most significant few bits clear) result in few carry bits and
+C speeds down to 7.65 cycles/limb are attained.  Large multipliers (most
+C significant few bits set) make the carry bits 50/50 and lead to something
+C more like 8.4 c/l.  With adcl's both of these would be 9.3 c/l.
+C
+C It's important that the gains for jadcl0 on small multipliers don't come
+C at the cost of slowing down other data.  Tests on uniformly distributed
+C random data, designed to confound branch prediction, show about a 7%
+C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
+C overheads included).
+C
+C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
+C 11.0 cycles/limb), and hence isn't used.
+C
+C In the simple loop, note that running ecx from negative to zero and using
+C it as an index in the two movs wouldn't help.  It would save one
+C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
+C that would be collapsed by this.
+C
+C Attempts at a simpler main loop, with less unrolling, haven't yielded much
+C success, generally running over 9 c/l.
+C
+C
+C jadcl0
+C ------
+C
+C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
+C firstly the instruction decoding and secondly the fact that there's a
+C carry bit for the jadcl0 only on average about 1/4 of the time.
+C
+C The code in the unrolled loop decodes something like the following.
+C
+C                                         decode cycles
+C		mull	%ebp                    2
+C		M4_inst	%esi, disp(%edi)        1
+C		adcl	%eax, %ecx              2
+C		movl	%edx, %esi            \ 1
+C		jnc	1f                    /
+C		incl	%esi                  \ 1
+C	1:	movl	disp(%ebx), %eax      /
+C                                              ---
+C                                               7
+C
+C In a back-to-back style test this measures 7 with the jnc not taken, or 8
+C with it taken (both when correctly predicted).  This is opposite to the
+C measurements showing small multipliers running faster than large ones.
+C Don't really know why.
+C
+C It's not clear how much branch misprediction might be costing.  The K6
+C documentation says it will be 1 to 4 cycles, but presumably it's near the
+C low end of that range to get the measured results.
+C
+C
+C In the code the two carries are more or less the preceding mul product and
+C the calculation is roughly
+C
+C	x*y + u*b+v
+C
+C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
+C v are the two limbs it's added to (being the low of the next mul, and a
+C limb from the destination).
+C
+C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
+C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
+C x*y/b^2.  If x, y, u and v are random and uniformly distributed between 0
+C and b-1, then the total probability can be summed over x and y,
+C
+C	 1    b-1 b-1 x*y    1    b*(b-1)   b*(b-1)
+C	--- * sum sum --- = --- * ------- * ------- = 1/4
+C       b^2   x=0 y=1 b^2   b^4      2         2
+C
+C Actually it's a very tiny bit less than 1/4 of course.  If y is fixed,
+C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
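+C
+C That sum can be cross-checked exhaustively for a small base, say b=2^8.
+C A throwaway C check (an illustration, not part of the code):
+C
+C	#include <stdio.h>
+C	int main (void)
+C	{
+C	  unsigned long long b = 256, hits = 0, x, y;
+C	  for (x = 0; x < b; x++)
+C	    for (y = 0; y < b; y++)
+C	      hits += x*y;	/* # of (u,v) with u*b+v >= b^2-x*y */
+C	  printf ("%g\n", (double) hits / ((double) b*b*b*b));
+C	  return 0;
+C	}
+C
+C This prints 0.248051, which is ((b-1)/(2b))^2, a touch under 1/4 as
+C claimed.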
+
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+	pushl	%esi
+deflit(`FRAME',4)
+	movl	PARAM_CARRY, %esi
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+	push	%esi
+deflit(`FRAME',4)
+	xorl	%esi, %esi	C initial carry
+
+L(start_nc):
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebx
+deflit(`FRAME',8)
+
+	movl	PARAM_SRC, %ebx
+	pushl	%edi
+deflit(`FRAME',12)
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_DST, %edi
+
+	pushl	%ebp
+deflit(`FRAME',16)
+	jae	L(unroll)
+
+
+	C simple loop
+
+	movl	PARAM_MULTIPLIER, %ebp
+
+L(simple):
+	C eax	scratch
+	C ebx	src
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	dst
+	C ebp	multiplier
+
+	movl	(%ebx), %eax
+	addl	$4, %ebx
+
+	mull	%ebp
+
+	addl	$4, %edi
+	addl	%esi, %eax
+
+	adcl	$0, %edx
+
+	M4_inst	%eax, -4(%edi)
+
+	adcl	$0, %edx
+
+	movl	%edx, %esi
+	loop	L(simple)
+
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%ebx
+	movl	%esi, %eax
+
+	popl	%esi
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C The unrolled loop uses a "two carry limbs" scheme.  At the top of the loop
+C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
+C For the computed jump an odd size means they start one way around, an even
+C size the other.
+C
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers at the point of doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %esi is necessary only for the
+C mpn_addmul/submul_1c entry points.  Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
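+C
+C The computed jump into the unrolled loop is the assembler analog of
+C Duff's device in C.  As a sketch (an illustration only, doing a plain
+C copy rather than a multiply-add, and 4-way rather than 16-way):
+C
+C	void ref_copy_duffs (mp_limb_t *dst, const mp_limb_t *src,
+C	                     mp_size_t size)	/* requires size >= 1 */
+C	{
+C	  mp_size_t n = (size + 3) / 4;
+C	  switch (size % 4)
+C	    {
+C	    case 0: do { *dst++ = *src++;
+C	    case 3:      *dst++ = *src++;
+C	    case 2:      *dst++ = *src++;
+C	    case 1:      *dst++ = *src++;
+C	               } while (--n > 0);
+C	    }
+C	}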
+
+dnl  overlapping with parameters already fetched
+define(VAR_COUNTER, `PARAM_SIZE')
+define(VAR_JUMP,    `PARAM_DST')
+
+L(unroll):
+	C eax
+	C ebx	src
+	C ecx	size
+	C edx
+	C esi	initial carry
+	C edi	dst
+	C ebp
+
+	movl	%ecx, %edx
+	decl	%ecx
+
+	subl	$2, %edx
+	negl	%ecx
+
+	shrl	$UNROLL_LOG2, %edx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%edx, VAR_COUNTER
+	movl	%ecx, %edx
+
+	shll	$4, %edx
+	negl	%ecx
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%edx,%ecx,1), %edx
+')
+	movl	(%ebx), %eax		C src low limb
+
+	movl	PARAM_MULTIPLIER, %ebp
+	movl	%edx, VAR_JUMP
+
+	mull	%ebp
+
+	addl	%esi, %eax	C initial carry (from _1c)
+	jadcl0(	%edx)
+
+
+	leal	4(%ebx,%ecx,4), %ebx
+	movl	%edx, %esi	C high carry
+
+	movl	VAR_JUMP, %edx
+	leal	(%edi,%ecx,4), %edi
+
+	testl	$1, %ecx
+	movl	%eax, %ecx	C low carry
+
+	jz	L(noswap)
+	movl	%esi, %ecx	C high,low carry other way around
+
+	movl	%eax, %esi
+L(noswap):
+
+	jmp	*%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%edx,%ecx,1), %edx
+	addl	$L(entry)-L(here), %edx
+	addl	(%esp), %edx
+	ret_internal
+')
+
+
+C -----------------------------------------------------------
+	ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+	C eax	scratch
+	C ebx	src
+	C ecx	carry lo
+	C edx	scratch
+	C esi	carry hi
+	C edi	dst
+	C ebp	multiplier
+	C
+	C 15 code bytes per limb
+
+	leal	UNROLL_BYTES(%edi), %edi
+
+L(entry):
+forloop(`i', 0, UNROLL_COUNT/2-1, `
+	deflit(`disp0', eval(2*i*4))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%ebx), %eax)
+	mull	%ebp
+Zdisp(	M4_inst,%ecx, disp0,(%edi))
+	adcl	%eax, %esi
+	movl	%edx, %ecx
+	jadcl0(	%ecx)
+
+	movl	disp1(%ebx), %eax
+	mull	%ebp
+	M4_inst	%esi, disp1(%edi)
+	adcl	%eax, %ecx
+	movl	%edx, %esi
+	jadcl0(	%esi)
+')
+
+	decl	VAR_COUNTER
+
+	leal	UNROLL_BYTES(%ebx), %ebx
+	jns	L(top)
+
+
+	popl	%ebp
+	M4_inst	%ecx, UNROLL_BYTES(%edi)
+
+	popl	%edi
+	movl	%esi, %eax
+
+	popl	%ebx
+	jadcl0(	%eax)
+
+	popl	%esi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/cross.pl b/third_party/gmp/mpn/x86/k6/cross.pl
new file mode 100755
index 0000000..fc921a5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/cross.pl
@@ -0,0 +1,182 @@
+#! /usr/bin/perl
+
+# Copyright 2000, 2001 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cross.pl [filename.o]...
+#
+# Produce an annotated disassembly of the given object files, indicating
+# certain code alignment and addressing mode problems afflicting K6 chips.
+# "ZZ" is used on all annotations, so this can be searched for.
+#
+# With no arguments, all .o files corresponding to .asm files are processed.
+# This is good in the mpn object directory of a k6*-*-* build.
+#
+# Code alignments of 8 bytes or more are handled.  When 32 is used, cache
+# line boundaries will fall in at offsets 0x20,0x40,etc and problems are
+# flagged at those locations.  When 16 is used, the line boundaries can also
+# fall at offsets 0x10,0x30,0x50,etc, depending where the file is loaded, so
+# problems are identified there too.  Likewise when 8 byte alignment is used
+# problems are flagged additionally at 0x08,0x18,0x28,etc.
+#
+# Usually 32 byte alignment is used for k6 routines, but less is certainly
+# possible if, through good luck or a little tweaking, cache line crossing
+# problems can be avoided at the extra locations.
+#
+# Bugs:
+#
+# Instructions without mod/rm bytes or which are already vector decoded are
+# unaffected by cache line boundary crossing, but not all of these have yet
+# been put in as exceptions.  All that occur in practice in GMP are present
+# though.
+#
+# There are no messages for using the vector decoded addressing mode (%esi),
+# but that's easy to avoid when coding.
+#
+# Future:
+#
+# Warn about jump targets that are poorly aligned (less than 2 instructions
+# before a cache line boundary).
+
+use strict;
+
+sub disassemble {
+    my ($file) = @_;
+    my ($addr,$b1,$b2,$b3, $prefix,$opcode,$modrm);
+    my $align;
+
+    open (IN, "objdump -Srfh $file |")
+	|| die "Cannot open pipe from objdump\n";
+    while (<IN>) {
+	print;
+
+	if (/^[ \t]*[0-9]+[ \t]+\.text[ \t]/ && /2\*\*([0-9]+)$/) {
+	    $align = 1 << $1;
+	    if ($align < 8) {
+		print "ZZ cross.pl cannot handle alignment < 2**3\n";
+		$align = 8;
+	    }
+	}
+
+	if (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+	    ($addr,$b1,$b2,$b3) = ($1,$2,$3,$4);
+
+	} elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)[ \t]+([0-9a-f]+)/) {
+	    ($addr,$b1,$b2,$b3) = ($1,$2,$3,'');
+
+	} elsif (/^[ \t]*([0-9a-f]*):[ \t]*([0-9a-f]+)/) {
+	    ($addr,$b1,$b2,$b3) = ($1,$2,'','');
+
+	} else {
+	    next;
+	}
+
+	if ($b1 =~ /0f/) {
+	    $prefix = $b1;
+	    $opcode = $b2;
+	    $modrm = $b3;
+	} else {
+	    $prefix = '';
+	    $opcode = $b1;
+	    $modrm = $b2;
+	}
+
+	# modrm of the form 00-xxx-100 with an 0F prefix is the problem case
+	# for K6 and pre-CXT K6-2
+	if ($prefix =~ /0f/
+	    && $opcode !~ /^8/         # jcond disp32
+	    && $modrm =~ /^[0-3][4c]/) {
+	    print "ZZ ($file) >3 bytes to determine instruction length [K6]\n";
+	}
+
+	# with just an opcode, starting at 1f mod 20h
+	if (($align==32 && $addr =~ /[13579bdf]f$/
+	     || $align==16 && $addr =~ /f$/
+	     || $align==8 && $addr =~ /[7f]$/)
+	    && $prefix !~ /0f/
+	    && $opcode !~ /1[012345]/ # adc
+	    && $opcode !~ /1[89abcd]/ # sbb
+	    && $opcode !~ /^4/        # inc/dec reg
+	    && $opcode !~ /^5/        # push/pop reg
+	    && $opcode !~ /68/        # push $imm32
+	    && $opcode !~ /^7/        # jcond disp8
+	    && $opcode !~ /a[89]/     # test+imm
+	    && $opcode !~ /a[a-f]/    # stos/lods/scas
+	    && $opcode !~ /b8/        # movl $imm32,%eax
+	    && $opcode !~ /d[0123]/   # rcl
+	    && $opcode !~ /e[0123]/   # loop/loopz/loopnz/jcxz
+	    && $opcode !~ /e8/        # call disp32
+	    && $opcode !~ /e[9b]/     # jmp disp32/disp8
+	    && $opcode !~ /f[89abcd]/ # clc,stc,cli,sti,cld,std
+	    && !($opcode =~ /f[67]/          # grp 1
+		 && $modrm =~ /^[2367abef]/) # mul, imul, div, idiv
+	    && $modrm !~ /^$/) {
+	    print "ZZ ($file) opcode/modrm cross 32-byte boundary\n";
+	}
+
+	# with an 0F prefix, anything starting at 1f mod 20h
+	if (($align==32 && $addr =~ /[13579bdf][f]$/
+	     || $align==16 && $addr =~ /f$/
+	     || $align==8 && $addr =~ /[7f]$/)
+	    && $prefix =~ /0f/
+	    && $opcode !~ /af/        # imul
+	    && $opcode !~ /a[45]/     # shldl
+	    && $opcode !~ /a[cd]/     # shrdl
+	    ) {
+	    print "ZZ ($file) prefix/opcode cross 32-byte boundary\n";
+	}
+
+	# with an 0F prefix, anything with mod/rm starting at 1e mod 20h
+	if (($align==32 && $addr =~ /[13579bdf][e]$/
+	     || $align==16 && $addr =~ /[e]$/
+	     || $align==8 && $addr =~ /[6e]$/)
+	    && $prefix =~ /0f/
+	     && $opcode !~ /^8/        # jcond disp32
+	     && $opcode !~ /af/        # imull reg,reg
+	     && $opcode !~ /a[45]/     # shldl
+	     && $opcode !~ /a[cd]/     # shrdl
+	    && $modrm !~ /^$/) {
+	    print "ZZ ($file) prefix/opcode/modrm cross 32-byte boundary\n";
+	}
+    }
+    close IN || die "Error from objdump (or objdump not available)\n";
+}
+
+
+my @files;
+if ($#ARGV >= 0) {
+    @files = @ARGV;
+} else {
+    @files = glob "*.asm";
+    map {s/\.asm$/.o/} @files;
+}
+
+foreach (@files)  {
+    disassemble($_);
+}
diff --git a/third_party/gmp/mpn/x86/k6/divrem_1.asm b/third_party/gmp/mpn/x86/k6/divrem_1.asm
new file mode 100644
index 0000000..b4cea4f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/divrem_1.asm
@@ -0,0 +1,203 @@
+dnl  AMD K6 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999-2003, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 20 cycles/limb
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                          mp_limb_t carry);
+C
+C The code here is basically the same as mpn/x86/divrem_1.asm, but uses loop
+C instead of decl+jnz, since it comes out 2 cycles/limb faster.
+C
+C A test is done to see if the high limb is less than the divisor, and if so
+C one less div is done.  A div is 20 cycles, so assuming high<divisor about
+C half the time, this test saves half that amount on average.  The branch
+C misprediction penalty is less than that.
+C
+C Back-to-back div instructions run at 20 cycles, the same as the loop here,
+C so it seems there's nothing to gain by rearranging the loop.  Pairing the
+C mov and loop instructions was found to gain nothing.
+C
+C Enhancements:
+C
+C The low-latency K6 multiply might be thought to suit a mul-by-inverse, but
+C that algorithm has been found to suffer from the relatively poor carry
+C handling on K6 and too many auxiliary instructions.  The fractional part
+C however could be done at about 13 c/l, if it mattered enough.
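+C
+C For reference, what's computed is plain schoolbook division from the high
+C limb downwards, with xsize zero limbs appended for the fractional part.
+C A minimal C sketch (an illustration only, not this code; 32-bit limbs and
+C a 64-bit intermediate are assumed, and carry must be less than divisor):
+C
+C	mp_limb_t ref_divrem_1c (mp_limb_t *dst, mp_size_t xsize,
+C	                         const mp_limb_t *src, mp_size_t size,
+C	                         mp_limb_t divisor, mp_limb_t r)
+C	{
+C	  mp_size_t i;
+C	  for (i = size-1; i >= 0; i--)
+C	    {
+C	      unsigned long long n = ((unsigned long long) r << 32) | src[i];
+C	      dst[xsize+i] = (mp_limb_t) (n / divisor);
+C	      r = (mp_limb_t) (n % divisor);
+C	    }
+C	  for (i = xsize-1; i >= 0; i--)
+C	    {
+C	      unsigned long long n = (unsigned long long) r << 32;
+C	      dst[i] = (mp_limb_t) (n / divisor);
+C	      r = (mp_limb_t) (n % divisor);
+C	    }
+C	  return r;	/* remainder */
+C	}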
+
+defframe(PARAM_CARRY,  24)
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	orl	%ecx, %ecx		C size
+
+	movl	PARAM_CARRY, %edx
+	jz	L(fraction)		C if size==0
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jmp	L(integer_top)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %edi
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %esi
+	orl	%ecx,%ecx		C size
+
+	jz	L(size_zero)
+	pushl	%ebx		FRAME_pushl()
+
+	movl	-4(%edi,%ecx,4), %eax	C src high limb
+	xorl	%edx, %edx
+
+	movl	PARAM_DST, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_XSIZE, %ebp
+	cmpl	%esi, %eax
+
+	leal	-4(%ebx,%ebp,4), %ebx	C dst one limb below integer part
+	jae	L(integer_entry)
+
+
+	C high<divisor, so high of dst is zero, and avoid one div
+
+	movl	%edx, (%ebx,%ecx,4)
+	decl	%ecx
+
+	movl	%eax, %edx
+	jz	L(fraction)
+
+
+L(integer_top):
+	C eax	scratch (quotient)
+	C ebx	dst+4*xsize-4
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi	src
+	C ebp	xsize
+
+	movl	-4(%edi,%ecx,4), %eax
+L(integer_entry):
+
+	divl	%esi
+
+	movl	%eax, (%ebx,%ecx,4)
+	loop	L(integer_top)
+
+
+L(fraction):
+	orl	%ebp, %ecx
+	jz	L(done)
+
+	movl	PARAM_DST, %ebx
+
+
+L(fraction_top):
+	C eax	scratch (quotient)
+	C ebx	dst
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	divisor
+	C edi
+	C ebp
+
+	xorl	%eax, %eax
+
+	divl	%esi
+
+	movl	%eax, -4(%ebx,%ecx,4)
+	loop	L(fraction_top)
+
+
+L(done):
+	popl	%ebp
+	movl	%edx, %eax
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+L(size_zero):
+deflit(`FRAME',8)
+	movl	PARAM_XSIZE, %ecx
+	xorl	%eax, %eax
+
+	movl	PARAM_DST, %edi
+
+	cld	C better safe than sorry, see mpn/x86/README
+
+	rep
+	stosl
+
+	popl	%esi
+	popl	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/gmp-mparam.h b/third_party/gmp/mpn/x86/k6/gmp-mparam.h
new file mode 100644
index 0000000..f03f1b2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/gmp-mparam.h
@@ -0,0 +1,166 @@
+/* AMD K6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2004, 2009, 2010 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* 450MHz K6-2 */
+
+#define MOD_1_NORM_THRESHOLD                12
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         41
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         32
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         3
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD    128
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                69
+#define MUL_TOOM44_THRESHOLD               106
+#define MUL_TOOM6H_THRESHOLD               157
+#define MUL_TOOM8H_THRESHOLD               199
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      69
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      65
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      64
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                143
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                272
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     23, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     17, 6}, \
+    {     35, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    167,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191, 9}, {    383,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287,11}, {    159,10}, {    351,11}, {    191,10}, \
+    {    415, 9}, {    831,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    415,10}, \
+    {    831,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    575,12}, {    319,11}, {    703,12}, \
+    {    383,11}, {    831,12}, {    447,11}, {    895,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    703,13}, {    383,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1215,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 106
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             432  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    432, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     24, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 7}, {     93, 8}, {     47, 7}, \
+    {     95, 8}, {     51,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     71, 8}, \
+    {    143, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159, 9}, {    319,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    703,12}, {    383,11}, \
+    {    831,12}, {    447,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    703,13}, \
+    {    383,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1215,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 112
+#define SQR_FFT_THRESHOLD                 7040
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  60
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 78
+#define DC_DIVAPPR_Q_THRESHOLD             252
+#define DC_BDIV_QR_THRESHOLD                84
+#define DC_BDIV_Q_THRESHOLD                171
+
+#define INV_MULMOD_BNM1_THRESHOLD           55
+#define INV_NEWTON_THRESHOLD               234
+#define INV_APPR_THRESHOLD                 236
+
+#define BINV_NEWTON_THRESHOLD              268
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1308
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD              134
+#define MU_BDIV_QR_THRESHOLD              1164
+#define MU_BDIV_Q_THRESHOLD               1164
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     182
+#define GCD_DC_THRESHOLD                   591
+#define GCDEXT_DC_THRESHOLD                472
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                24
+#define GET_STR_PRECOMPUTE_THRESHOLD        40
+#define SET_STR_DC_THRESHOLD               834
+#define SET_STR_PRECOMPUTE_THRESHOLD      2042
diff --git a/third_party/gmp/mpn/x86/k6/k62mmx/copyd.asm b/third_party/gmp/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000..f80a5a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,118 @@
+dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.0 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
+C
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned.  This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved.  Considering how few
+C uses copyd finds perhaps that should be favoured.  The current code has
+C the attraction of being no slower than a basic rep movsl though.
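+C
+C Functionally this is just a high-to-low limb copy, which is safe when the
+C regions overlap with dst >= src.  In a C sketch (an illustration only):
+C
+C	void ref_copyd (mp_limb_t *dst, const mp_limb_t *src, mp_size_t size)
+C	{
+C	  for (mp_size_t i = size-1; i >= 0; i--)	/* mp_size_t is signed */
+C	    dst[i] = src[i];
+C	}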
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+
+	subl	$1, %ecx		C better code alignment than decl
+	jb	L(zero)
+
+	jz	L(one_more)
+	leal	4(%edx,%ecx,4), %ebx
+
+Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
+Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment
+
+	cmpl	$1, %ecx
+	je	L(one_more)
+
+	shrl	$2, %ebx
+	andl	$1, %ebx		C 1 if dst[size-2] unaligned
+
+	subl	%ebx, %ecx
+	nop				C code alignment
+
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter
+	C edx	dst
+
+	movq	-4(%eax,%ecx,4), %mm0
+	subl	$2, %ecx
+
+	movq	%mm0, 4(%edx,%ecx,4)
+	ja	L(top)
+
+
+L(one_more):
+	movd	(%eax), %mm0
+	movd	%mm0, (%edx)
+
+	movl	SAVE_EBX, %ebx
+	emms_or_femms
+L(zero):
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/k62mmx/lshift.asm b/third_party/gmp/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000..c86575f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,294 @@
+dnl  AMD K6-2 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
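+C The operation, in a minimal C sketch (an illustration only, not this
+C code; valid for 1 <= shift <= 31, 32-bit limbs, size >= 1):
+C
+C	mp_limb_t ref_lshift (mp_limb_t *dst, const mp_limb_t *src,
+C	                      mp_size_t size, unsigned shift)
+C	{
+C	  mp_limb_t ret = src[size-1] >> (32-shift);
+C	  for (mp_size_t i = size-1; i > 0; i--)
+C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
+C	  dst[0] = src[0] << shift;
+C	  return ret;	/* bits shifted out the top */
+C	}
+C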
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl  minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shldl(	%cl, %edx, %eax)	C return value
+
+	shll	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx,%eax,4), %edx	C src high limb
+	negl	%ecx
+
+	movd	PARAM_SHIFT, %mm6
+	addl	$32, %ecx		C 32-shift
+
+	shrl	%cl, %edx
+	cmpl	$UNROLL_THRESHOLD-1, %eax
+
+	movl	%edx, VAR_RETVAL
+	jae	L(unroll)
+
+
+	movd	%ecx, %mm7
+	movl	%eax, %ecx
+
+	movl	PARAM_DST, %eax
+
+L(simple):
+	C eax	dst
+	C ebx	src
+	C ecx	counter, size-1 to 1
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%ecx,4), %mm0
+
+	psrlq	%mm7, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%eax,%ecx,4))
+	loop	L(simple)
+
+
+	movd	(%ebx), %mm0
+	popl	%ebx
+
+	psllq	%mm6, %mm0
+
+	movd	%mm0, (%eax)
+	movl	%edx, %eax
+
+	femms
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval (but instead VAR_RETVAL is used)
+	C
+	C mm6	shift
+
+	addl	$32, %ecx
+	movl	PARAM_DST, %edx
+
+	movd	%ecx, %mm7
+	subl	$7, %eax			C size-8
+
+	leal	(%edx,%eax,4), %ecx		C alignment of dst
+
+	movq	32-8(%ebx,%eax,4), %mm2		C src high qword
+	testb	$4, %cl
+
+	jz	L(dst_aligned)
+	psllq	%mm6, %mm2
+
+	psrlq	$32, %mm2
+	decl	%eax
+
+	movd	%mm2, 32(%edx,%eax,4)		C dst high limb
+	movq	32-8(%ebx,%eax,4), %mm2		C new src high qword
+L(dst_aligned):
+
+	movq	32-16(%ebx,%eax,4), %mm0	C src second highest qword
+
+
+	C This loop is the important bit, the rest is just support for it.
+	C Four src limbs are held at the start, and four more will be read.
+	C Four dst limbs will be written.  This schedule seems necessary for
+	C full speed.
+	C
+	C The use of size-8 lets the loop stop when %eax goes negative and
+	C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+	C eax	counter, size-8 step by -4 until <0
+	C ebx	src
+	C ecx
+	C edx	dst
+	C
+	C mm0	src next qword
+	C mm1	scratch
+	C mm2	src prev qword
+	C mm6	shift
+	C mm7	64-shift
+
+	psllq	%mm6, %mm2
+	subl	$4, %eax
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	movq	24(%ebx,%eax,4), %mm0
+
+	psllq	%mm6, %mm1
+	movq	%mm2, 40(%edx,%eax,4)
+
+	movq	%mm0, %mm2
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	movq	16(%ebx,%eax,4), %mm0
+
+	movq	%mm1, 32(%edx,%eax,4)
+	jnc	L(top)
+
+
+	C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+	C
+	C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+	C %eax is between -4 and -1, representing respectively 0 to 3 extra
+	C limbs that must be read.
+
+
+	testl	$2, %eax	C testl to avoid bad cache line crossing
+	jz	L(finish_nottwo)
+
+	C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+	C new mm2 and a new mm0 is loaded.
+
+	psllq	%mm6, %mm2
+	movq	%mm0, %mm1
+
+	psrlq	%mm7, %mm0
+	subl	$2, %eax
+
+	por	%mm0, %mm2
+	movq	16(%ebx,%eax,4), %mm0
+
+	movq	%mm2, 32(%edx,%eax,4)
+	movq	%mm1, %mm2
+L(finish_nottwo):
+
+
+	C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+	testb	$1, %al
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	psllq	%mm6, %mm1
+
+	movq	%mm2, 24(%edx,%eax,4)
+	jz	L(finish_even)
+
+
+	C Size is odd, so mm1 and one extra limb to process.
+
+	movd	(%ebx), %mm0		C src[0]
+	popl	%ebx
+deflit(`FRAME',0)
+
+	movq	%mm0, %mm2
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	psllq	%mm6, %mm2
+	por	%mm0, %mm1
+
+	movq	%mm1, 4(%edx)		C dst[1,2]
+	movd	%mm2, (%edx)		C dst[0]
+
+	movl	VAR_RETVAL, %eax
+
+	femms
+	ret
+
+
+	nop	C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+	C Size is even, so only mm1 left to process.
+
+	movq	%mm1, (%edx)		C dst[0,1]
+	movl	VAR_RETVAL, %eax
+
+	popl	%ebx
+	femms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/k62mmx/rshift.asm b/third_party/gmp/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000..f604a7b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,293 @@
+dnl  AMD K6-2 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shrdl(	%cl, %edx, %eax)	C return value
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx), %edx	C src low limb
+	negl	%ecx
+
+	addl	$32, %ecx
+	movd	PARAM_SHIFT, %mm6
+
+	shll	%cl, %edx
+	cmpl	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+
+
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval
+	C
+	C mm6	shift
+
+	movl	PARAM_DST, %ecx
+	leal	(%ebx,%eax,4), %ebx
+
+	leal	-4(%ecx,%eax,4), %ecx
+	negl	%eax
+
+	C This loop runs at about 3 cycles/limb, which is the amount of
+	C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+	C eax	counter, -(size-1) to -1
+	C ebx	&src[size-1]
+	C ecx	&dst[size-1]
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+
+Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
+	jnz	L(simple)
+
+
+	movq	%mm0, (%ecx)
+	movl	%edx, %eax
+
+	popl	%ebx
+
+	femms
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	src
+	C ecx	32-shift
+	C edx	retval
+	C
+	C mm6	shift
+
+	addl	$32, %ecx
+	subl	$7, %eax		C size-8
+
+	movd	%ecx, %mm7
+	movl	PARAM_DST, %ecx
+
+	movq	(%ebx), %mm2		C src low qword
+	leal	(%ebx,%eax,4), %ebx	C src end - 32
+
+	testb	$4, %cl
+	leal	(%ecx,%eax,4), %ecx	C dst end - 32
+
+	notl	%eax			C -(size-7)
+	jz	L(dst_aligned)
+
+	psrlq	%mm6, %mm2
+	incl	%eax
+
+Zdisp(	movd,	%mm2, 0,(%ecx,%eax,4))	C dst low limb
+	movq	4(%ebx,%eax,4), %mm2	C new src low qword
+L(dst_aligned):
+
+	movq	12(%ebx,%eax,4), %mm0	C src second lowest qword
+	nop	C avoid bad cache line crossing
+
+
+	C This loop is the important bit, the rest is just support for it.
+	C Four src limbs are held at the start, and four more will be read.
+	C Four dst limbs will be written.  This schedule seems necessary for
+	C full speed.
+	C
+	C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
+	C leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+	C eax	counter, -(size-7) step by +4 until >=0
+	C ebx	src end - 32
+	C ecx	dst end - 32
+	C edx	retval
+	C
+	C mm0	src next qword
+	C mm1	scratch
+	C mm2	src prev qword
+	C mm6	shift
+	C mm7	64-shift
+
+	psrlq	%mm6, %mm2
+	addl	$4, %eax
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	movq	4(%ebx,%eax,4), %mm0
+
+	psrlq	%mm6, %mm1
+	movq	%mm2, -12(%ecx,%eax,4)
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	movq	12(%ebx,%eax,4), %mm0
+
+	movq	%mm1, -4(%ecx,%eax,4)
+	ja	L(top)		C jump if no carry and not zero
+
+
+
+	C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+	C to 3 representing respectively 3 to 0 further limbs.
+
+	testl	$2, %eax	C testl to avoid bad cache line crossings
+	jnz	L(finish_nottwo)
+
+	C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+	C becomes new mm2 and a new mm0 is loaded.
+
+	psrlq	%mm6, %mm2
+	movq	%mm0, %mm1
+
+	psllq	%mm7, %mm0
+	addl	$2, %eax
+
+	por	%mm0, %mm2
+	movq	12(%ebx,%eax,4), %mm0
+
+	movq	%mm2, -4(%ecx,%eax,4)
+	movq	%mm1, %mm2
+L(finish_nottwo):
+
+
+	testb	$1, %al
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm2
+	psrlq	%mm6, %mm1
+
+	movq	%mm2, 4(%ecx,%eax,4)
+	jnz	L(finish_even)
+
+
+	C one further extra limb to process
+
+	movd	32-4(%ebx), %mm0	C src[size-1], most significant limb
+	popl	%ebx
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm0, %mm1
+	psrlq	%mm6, %mm2
+
+	movq	%mm1, 32-12(%ecx)	C dst[size-3,size-2]
+	movd	%mm2, 32-4(%ecx)	C dst[size-1]
+
+	movl	%edx, %eax		C retval
+
+	femms
+	ret
+
+
+	nop	C avoid bad cache line crossing
+L(finish_even):
+	C no further extra limbs
+
+	movq	%mm1, 32-8(%ecx)	C dst[size-2,size-1]
+	movl	%edx, %eax		C retval
+
+	popl	%ebx
+
+	femms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/com.asm b/third_party/gmp/mpn/x86/k6/mmx/com.asm
new file mode 100644
index 0000000..b747454
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/com.asm
@@ -0,0 +1,103 @@
+dnl  AMD K6-2 mpn_com -- mpn bitwise one's complement.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K6-2  1.0   1.18  1.18  1.18  cycles/limb
+C K6    1.5   1.85  1.75  1.85
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
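+C
+C Equivalently, in a C sketch (an illustration only; GMP_NUMB_MASK is all
+C ones when nails are not in use):
+C
+C	void ref_com (mp_limb_t *dst, const mp_limb_t *src, mp_size_t size)
+C	{
+C	  for (mp_size_t i = 0; i < size; i++)
+C	    dst[i] = (~src[i]) & GMP_NUMB_MASK;
+C	}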
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	shrl	%ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+	notl_or_xorl_GMP_NUMB_MASK(	%eax)
+	movl	%eax, (%edx)
+	ret
+
+
+L(two_or_more):
+	pushl	%ebx	FRAME_pushl()
+	pcmpeqd	%mm7, %mm7		C all ones
+
+	movl	%ecx, %ebx
+ifelse(GMP_NAIL_BITS,0,,
+`	psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
+
+
+
+	ALIGN(8)
+L(top):
+	C eax	src
+	C ebx	floor(size/2)
+	C ecx	counter
+	C edx	dst
+	C
+	C mm0	scratch
+	C mm7	mask
+
+	movq	-8(%eax,%ecx,8), %mm0
+	pxor	%mm7, %mm0
+	movq	%mm0, -8(%edx,%ecx,8)
+	loop	L(top)
+
+
+	jnc	L(no_extra)
+	movl	(%eax,%ebx,8), %eax
+	notl_or_xorl_GMP_NUMB_MASK(	%eax)
+	movl	%eax, (%edx,%ebx,8)
+L(no_extra):
+
+	popl	%ebx
+	emms_or_femms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/dive_1.asm b/third_party/gmp/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 0000000..1bbad3a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,282 @@
+dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C K6:   10.0  12.0  cycles/limb
+C K6-2: 10.0  11.5
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C A simple divl is used for size==1.  This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment; speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
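+C
+C The method is the usual exact division by a modular inverse: twos are
+C stripped from the divisor, the inverse of the remaining odd part mod 2^32
+C is formed by Newton iterations, and each quotient limb is then a single
+C multiply by that inverse.  A C sketch of the inverse (an illustration
+C only; the code below instead seeds from binvert_limb_table and uses two
+C iterations):
+C
+C	mp_limb_t ref_binvert (mp_limb_t d)	/* d must be odd */
+C	{
+C	  mp_limb_t inv = d;	/* 3 bits correct, since d*d == 1 mod 8 */
+C	  inv *= 2 - d*inv;	/* 6 bits */
+C	  inv *= 2 - d*inv;	/* 12 bits */
+C	  inv *= 2 - d*inv;	/* 24 bits */
+C	  inv *= 2 - d*inv;	/* 48 -> 32 bits */
+C	  return inv;		/* d*inv == 1 mod 2^32 */
+C	}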
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	xorl	%edx, %edx
+
+	cmpl	$1, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	PARAM_DST, %ecx
+	movl	%eax, (%ecx)
+
+	ret
+
+
+L(two_or_more):
+	movl	PARAM_DIVISOR, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	pushl	%ebp		FRAME_pushl()
+
+L(strip_twos):
+	shrl	%eax
+	incl	%edx			C will get shift+1
+
+	jnc	L(strip_twos)
+	pushl	%esi		FRAME_pushl()
+
+	leal	1(%eax,%eax), %esi	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ebp)
+Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+	pushl	%edi		FRAME_pushl()
+
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_DST, %edi
+
+	imull	%esi, %eax		C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	movl	%esi, PARAM_DIVISOR	C d without twos
+	leal	(%ebx,%ecx,4), %ebx	C src end
+
+	imull	%esi, %ebp		C inv*inv*d
+
+	leal	(%edi,%ecx,4), %edi	C dst end
+	negl	%ecx			C -size
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	subl	$1, %edx		C shift amount, and clear carry
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	jnz	L(even)
+
+	movl	(%ebx,%ecx,4), %esi	C src low limb
+	jmp	L(odd_entry)
+
+
+	ALIGN(16)
+	nop	C code alignment
+L(odd_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	imull	%edx, %esi
+
+	movl	PARAM_DIVISOR, %eax
+	movl	%esi, -4(%edi,%ecx,4)
+
+	mull	%esi			C carry limb in edx
+
+	subl	%ebp, %edx		C apply carry bit
+	movl	(%ebx,%ecx,4), %esi
+
+L(odd_entry):
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(odd_top)
+
+
+	imull	%edx, %esi
+
+	movl	%esi, -4(%edi,%ecx,4)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+L(even):
+	C eax
+	C ebx	src end
+	C ecx	-size
+	C edx	twos
+	C esi
+	C edi	dst end
+	C ebp
+
+	xorl	%ebp, %ebp
+Zdisp(	movq,	0,(%ebx,%ecx,4), %mm0)	C src[0,1]
+
+	movd	%edx, %mm7
+	movl	VAR_INVERSE, %edx
+
+	addl	$2, %ecx
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	jz	L(even_two)		C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2.  In fact there are only 11 cycles of decode, but nothing running
+C at 11 has been found.  Maybe the fact that every second movq is unaligned
+C costs the extra 0.5.
+
+L(even_top):
+	C eax	scratch
+	C ebx	src end
+	C ecx	counter, limbs, negative
+	C edx	inverse
+	C esi	next limb, adjusted for carry
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+	C
+	C mm0	scratch, source limbs
+	C mm7	twos
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi,%ecx,4)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movq	-4(%ebx,%ecx,4), %mm0
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	subl	%edx, %esi		C apply carry limb
+	movl	VAR_INVERSE, %edx
+
+	sbbl	%ebp, %ebp		C 0 or -1
+
+	incl	%ecx
+	jnz	L(even_top)
+
+
+L(even_two):
+	movd	-4(%ebx), %mm0		C src high limb
+	psrlq	%mm7, %mm0
+
+	imull	%edx, %esi
+
+	movl	%esi, -8(%edi)
+	movl	PARAM_DIVISOR, %eax
+
+	mull	%esi			C carry limb in edx
+
+	movd	%mm0, %esi
+	subl	%ebp, %edx		C apply carry bit
+
+	movl	VAR_INVERSE, %eax
+	subl	%edx, %esi		C apply carry limb
+
+	imull	%eax, %esi
+
+	movl	%esi, -4(%edi)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	emms_or_femms
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/logops_n.asm b/third_party/gmp/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000..e17930b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,226 @@
+dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C         alignment dst/src1/src2, A=0mod8, N=4mod8
+C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
+C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
+C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
+C
+C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
+C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
+C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
+
+
+dnl  M4_p and M4_i are the MMX and integer instructions
+dnl  M4_*_neg_dst means whether to negate the final result before writing
+dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
+
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function',  `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p',         `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i',         `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl  style (the two are equivalent for xor).
+dnl
+dnl  pandn can't be used with nails.
+
+M4_choose_op( and_n,  pand,0,0,  andl,0,0)
+ifelse(GMP_NAIL_BITS,0,
+`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
+`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
+M4_choose_op( nand_n, pand,1,0,  andl,1,0)
+M4_choose_op( ior_n,  por,0,0,   orl,0,0)
+M4_choose_op( iorn_n, por,0,1,   orl,0,1)
+M4_choose_op( nior_n, por,1,0,   orl,1,0)
+M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                   mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones.  The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C           alignment dst/src1/src2, A=0mod8, N=4mod8
+C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
+C
+C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
+C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
+C K6                 2.0    2.25                2.35   2.28   nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
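
As a plain-C reference for one of the negated variants (a sketch;
mp_limb_t taken as uint32_t, ref_ name made up).  The M4_*_neg_src2 and
M4_*_neg_dst flags in the table above say whether the complement lands on
the second source or on the result:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    /* iorn: dst = src1 | ~src2, per M4_choose_op(iorn_n, por,0,1, ...).
       nior instead negates the result: dst = ~(src1 | src2). */
    static void ref_iorn_n(mp_limb_t *dst, const mp_limb_t *src1,
                           const mp_limb_t *src2, size_t size)
    {
        for (size_t i = 0; i < size; i++)
            dst[i] = src1[i] | ~src2[i];
    }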
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(M4_function)
+			movl	PARAM_SIZE, %ecx
+			pushl	%ebx		FRAME_pushl()
+
+			movl	PARAM_SRC1, %eax
+
+			movl	PARAM_SRC2, %ebx
+			cmpl	$1, %ecx
+
+			movl	PARAM_DST, %edx
+			ja	L(two_or_more)
+
+
+			movl	(%ebx), %ecx
+			popl	%ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			M4_i	(%eax), %ecx
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			movl	%ecx, (%edx)
+
+			ret
+
+
+L(two_or_more):
+			C eax	src1
+			C ebx	src2
+			C ecx	size
+			C edx	dst
+			C esi
+			C edi
+			C ebp
+
+			pushl	%esi		FRAME_pushl()
+			testl	$4, %eax
+			jz	L(alignment_ok)
+
+			movl	(%ebx), %esi
+			addl	$4, %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
+			M4_i	(%eax), %esi
+			addl	$4, %eax
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
+			movl	%esi, (%edx)
+			addl	$4, %edx
+			decl	%ecx
+
+L(alignment_ok):
+			movl	%ecx, %esi
+			shrl	%ecx
+			jnz	L(still_two_or_more)
+
+			movl	(%ebx), %ecx
+			popl	%esi
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			M4_i	(%eax), %ecx
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
+			popl	%ebx
+			movl	%ecx, (%edx)
+			ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+			pcmpeqd	%mm7, %mm7		C all ones
+ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
+')
+
+			ALIGN(16)
+L(top):
+			C eax	src1
+			C ebx	src2
+			C ecx	counter
+			C edx	dst
+			C esi
+			C edi
+			C ebp
+			C
+			C carry bit is low of size
+
+			movq	-8(%ebx,%ecx,8), %mm0
+ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
+			M4_p	-8(%eax,%ecx,8), %mm0
+ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
+			movq	%mm0, -8(%edx,%ecx,8)
+
+			loop	L(top)
+
+
+			jnc	L(no_extra)
+
+			movl	-4(%ebx,%esi,4), %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
+			M4_i	-4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
+			movl	%ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+			popl	%esi
+			popl	%ebx
+			emms_or_femms
+			ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/lshift.asm b/third_party/gmp/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000..45be582
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,130 @@
+dnl  AMD K6 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions.  This is despite every second fetch being unaligned.
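
The return value is the bits shifted out the top of the number.  A
behavioural sketch in C (mp_limb_t taken as uint32_t, shift in 1..31 as
the mpn interface requires, ref_ name made up):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    /* Shift src,size left by shift bits, working from the top limb down
       (as the asm does, so dst may equal src), returning the bits shifted
       out of the high end. */
    static mp_limb_t ref_lshift(mp_limb_t *dst, const mp_limb_t *src,
                                size_t size, unsigned shift)
    {
        mp_limb_t retval = src[size - 1] >> (32 - shift);
        for (size_t i = size - 1; i > 0; i--)
            dst[i] = (src[i] << shift) | (src[i - 1] >> (32 - shift));
        dst[0] = src[0] << shift;
        return retval;
    }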
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shldl(	%cl, %edx, %eax)	C return value
+
+	shll	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+	ALIGN(16)	C avoid offset 0x1f
+	nop		C avoid bad cache line crossing
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx,%eax,4), %edx	C src high limb
+	negl	%ecx
+
+	movd	PARAM_SHIFT, %mm6
+	addl	$32, %ecx		C 32-shift
+
+	shrl	%cl, %edx
+
+	movd	%ecx, %mm7
+	movl	PARAM_DST, %ecx
+
+L(top):
+	C eax	counter, size-1 to 1
+	C ebx	src
+	C ecx	dst
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, 4(%ecx,%eax,4)
+	jnz	L(top)
+
+
+	movd	(%ebx), %mm0
+	popl	%ebx
+
+	psllq	%mm6, %mm0
+	movl	%edx, %eax
+
+	movd	%mm0, (%ecx)
+
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/popham.asm b/third_party/gmp/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000..2b19d0b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,236 @@
+dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl  hamming distance.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C        popcount  hamdist
+C K6-2:    9.0       11.5   cycles/limb
+C K6:      12.5      13.0
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
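
The loop below is the classic parallel bit count, done 64 bits at a time
in MMX registers.  The same reduction on a single 32-bit word, in C (a
sketch; the three masks are the constants the code builds in mm7, mm6 and
mm5, and the final multiply stands in for the asm's mask-and-fold with
0x000000FF):

    #include <stdint.h>

    /* Count set bits by halving: 2-bit fields, 4-bit fields, bytes, then
       sum the four byte counts into the top byte. */
    static unsigned popcount32(uint32_t x)
    {
        x -= (x >> 1) & 0x55555555;      /* pairs; the asm equivalently
                                            masks with 0xAAAAAAAA first */
        x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  /* nibbles */
        x = (x + (x >> 4)) & 0x0F0F0F0F;                 /* bytes   */
        return (x * 0x01010101) >> 24;                   /* total   */
    }

mpn_hamdist is the same reduction applied to src1[i] ^ src2[i], which is
exactly what the HAM(...) pxor lines insert at the top of the loop.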
+
+
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC2,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+	dnl  non-PIC
+
+	RODATA
+	ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+
+L(rodata_3333333333333333):
+	.long	0x33333333
+	.long	0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+
+L(rodata_000000FF000000FF):
+	.long	0x000000FF
+	.long	0x000000FF
+')
+
+	TEXT
+	ALIGN(32)
+
+POP(`ifdef(`PIC', `
+	C avoid shrl crossing a 32-byte boundary
+	nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %eax
+	movl	$0x33333333, %edx
+
+	movd	%eax, %mm7
+	movd	%edx, %mm6
+
+	movl	$0x0F0F0F0F, %eax
+	movl	$0x000000FF, %edx
+
+	punpckldq %mm7, %mm7
+	punpckldq %mm6, %mm6
+
+	movd	%eax, %mm5
+	movd	%edx, %mm4
+
+	punpckldq %mm5, %mm5
+	punpckldq %mm4, %mm4
+',`
+
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+	movq	L(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+	movl	PARAM_SRC, %eax
+HAM(`	movl	PARAM_SRC2, %edx')
+
+	pxor	%mm2, %mm2	C total
+
+	shrl	%ecx
+	jnc	L(top)
+
+Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
+
+HAM(`
+Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
+	pxor	%mm0, %mm1
+')
+
+	incl	%ecx
+	jmp	L(loaded)
+
+
+	ALIGN(16)
+POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, qwords, decrementing
+	C edx	[hamdist] src2
+	C
+	C mm0	(scratch)
+	C mm1	(scratch)
+	C mm2	total (low dword)
+	C mm3
+	C mm4	\
+	C mm5	| special constants
+	C mm6	|
+	C mm7	/
+
+	movq	-8(%eax,%ecx,8), %mm1
+HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+	movq	%mm1, %mm0
+	pand	REG_AAAAAAAAAAAAAAAA, %mm1
+
+	psrlq	$1, %mm1
+HAM(`	nop			C code alignment')
+
+	psubd	%mm1, %mm0	C bit pairs
+HAM(`	nop			C code alignment')
+
+
+	movq	%mm0, %mm1
+	psrlq	$2, %mm0
+
+	pand	REG_3333333333333333, %mm0
+	pand	REG_3333333333333333, %mm1
+
+	paddd	%mm1, %mm0	C nibbles
+
+
+	movq	%mm0, %mm1
+	psrlq	$4, %mm0
+
+	pand	REG_0F0F0F0F0F0F0F0F, %mm0
+	pand	REG_0F0F0F0F0F0F0F0F, %mm1
+
+	paddd	%mm1, %mm0	C bytes
+
+	movq	%mm0, %mm1
+	psrlq	$8, %mm0
+
+
+	paddb	%mm1, %mm0	C words
+
+
+	movq	%mm0, %mm1
+	psrlq	$16, %mm0
+
+	paddd	%mm1, %mm0	C dwords
+
+	pand	REG_000000FF000000FF, %mm0
+
+	paddd	%mm0, %mm2	C low to total
+	psrlq	$32, %mm0
+
+	paddd	%mm0, %mm2	C high to total
+	loop	L(top)
+
+
+
+	movd	%mm2, %eax
+	emms_or_femms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mmx/rshift.asm b/third_party/gmp/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000..cd0382f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,130 @@
+dnl  AMD K6 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions.  This is despite every second fetch being unaligned.
+
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+	C The 1 limb case can be done without the push %ebx, but it's then
+	C still the same speed.  The push is left as a free helping hand for
+	C the two_or_more code.
+
+	movl	PARAM_SIZE, %eax
+	pushl	%ebx			FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	decl	%eax
+
+	movl	PARAM_SHIFT, %ecx
+	jnz	L(two_or_more)
+
+	movl	(%ebx), %edx		C src limb
+	movl	PARAM_DST, %ebx
+
+	shrdl(	%cl, %edx, %eax)	C return value
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%ebx)		C dst limb
+	popl	%ebx
+
+	ret
+
+
+	ALIGN(16)	C avoid offset 0x1f
+L(two_or_more):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx
+
+	movl	(%ebx), %edx	C src low limb
+	negl	%ecx
+
+	addl	$32, %ecx	C 32-shift
+	movd	PARAM_SHIFT, %mm6
+
+	shll	%cl, %edx	C retval
+	movl	PARAM_DST, %ecx
+
+	leal	(%ebx,%eax,4), %ebx
+
+	leal	-4(%ecx,%eax,4), %ecx
+	negl	%eax
+
+
+L(simple):
+	C eax	counter (negative)
+	C ebx	&src[size-1]
+	C ecx	&dst[size-1]
+	C edx	retval
+	C
+	C mm0	scratch
+	C mm6	shift
+
+Zdisp(	movq,	0,(%ebx,%eax,4), %mm0)
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+Zdisp(	movd,	%mm0, 0,(%ecx,%eax,4))
+	jnz	L(simple)
+
+
+	movq	%mm0, (%ecx)
+	movl	%edx, %eax
+
+	popl	%ebx
+
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mod_34lsub1.asm b/third_party/gmp/mpn/x86/k6/mod_34lsub1.asm
new file mode 100644
index 0000000..7e30503
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 2.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C An attempt was made to use a loop like
+C
+C L(top):
+C	adcl	(%edx), %eax
+C	adcl	4(%edx), %ebx
+C	adcl	8(%edx), %esi
+C	leal	12(%edx), %edx
+C	loop	L(top)
+C
+C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
+C The form used instead can save about 6 cycles by not dividing by 3.
+C
+C In the code used, putting the "leal"s at the top of the loop is necessary
+C for the claimed speed; anywhere else costs an extra cycle per loop.
+C Perhaps a tight loop like this needs short decode instructions at the
+C branch target, which would explain the leal/loop form above taking 8
+C cycles instead of 7 too.
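
As a sketch of what the three accumulators compute (C, with mp_limb_t
taken as uint32_t and 64-bit sums standing in for the asm's carry
wrap-around; ref_ name made up).  Since 2^24 == 1 (mod 2^24-1), a limb at
position i carries weight 2^(8*(i mod 3)):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    /* Return a value congruent to src,size modulo 2^24-1 (not fully
       reduced; assumes one fold is enough, true for moderate sizes). */
    static mp_limb_t ref_mod_34lsub1(const mp_limb_t *src, size_t size)
    {
        uint64_t c0 = 0, c1 = 0, c2 = 0;    /* acc 0mod3, 1mod3, 2mod3 */
        for (size_t i = 0; i < size; i++) {
            switch (i % 3) {
            case 0: c0 += src[i]; break;
            case 1: c1 += src[i]; break;
            case 2: c2 += src[i]; break;
            }
        }
        /* fold into 24-bit pieces, using 2^24 == 1 and 2^32 == 2^8 */
        uint64_t r = (c0 & 0xFFFFFF) + (c0 >> 24)
                   + ((c1 << 8) & 0xFFFFFF) + (c1 >> 16)
                   + ((c2 << 16) & 0xFFFFFF) + (c2 >> 8);
        return (mp_limb_t)r;
    }

The L(combine) code below is this same fold, done on the three 32-bit
accumulators with shifts and masks.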
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %eax
+	ja	L(three_or_more)
+
+Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
+	jne	L(one)
+
+	movl	%eax, %ecx
+	movl	4(%edx), %edx
+
+	shrl	$24, %eax		C src[0] high
+	andl	$0x00FFFFFF, %ecx	C src[0] low
+
+	addl	%ecx, %eax
+	movl	%edx, %ecx
+
+	shll	$8, %edx
+	andl	$0x00FFFF00, %edx	C src[1] low
+
+	shrl	$16, %ecx		C src[1] high
+	addl	%ecx, %eax
+
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax	size-2
+	C ebx
+	C ecx
+	C edx	src
+
+	movl	%ebx, SAVE_EBX
+	xorl	%ebx, %ebx
+
+	movl	%esi, SAVE_ESI
+	pushl	%edi	FRAME_pushl()
+
+	xorl	%esi, %esi
+	xorl	%edi, %edi		C and clear carry flag
+
+L(top):
+	C eax	counter, limbs
+	C ebx	acc 0mod3
+	C ecx
+	C edx	src, incrementing
+	C esi	acc 1mod3
+	C edi	acc 2mod3
+	C ebp
+
+	leal	-2(%eax), %eax
+	leal	12(%edx), %edx
+
+	adcl	-12(%edx), %ebx
+	adcl	-8(%edx), %esi
+	adcl	-4(%edx), %edi
+
+	decl	%eax
+	jg	L(top)
+
+
+	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	movb	$0, %cl
+	incl	%eax
+
+	js	L(combine)		C 0 more
+
+Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings
+
+	movb	$8, %cl
+	decl	%eax
+
+	js	L(combine)		C 1 more
+
+	adcl	4(%edx), %esi
+
+	movb	$16, %cl
+
+
+L(combine):
+	sbbl	%edx, %edx
+
+	shll	%cl, %edx		C carry
+	movl	%ebx, %eax		C 0mod3
+
+	shrl	$24, %eax		C 0mod3 high
+	andl	$0x00FFFFFF, %ebx	C 0mod3 low
+
+	subl	%edx, %eax		C apply carry
+	movl	%esi, %ecx		C 1mod3
+
+	shrl	$16, %esi		C 1mod3 high
+	addl	%ebx, %eax		C apply 0mod3 low
+
+	andl	$0x0000FFFF, %ecx
+	addl	%esi, %eax		C apply 1mod3 high
+
+	shll	$8, %ecx		C 1mod3 low
+	movl	%edi, %edx		C 2mod3
+
+	shrl	$8, %edx		C 2mod3 high
+	addl	%ecx, %eax		C apply 1mod3 low
+
+	addl	%edx, %eax		C apply 2mod3 high
+	andl	$0x000000FF, %edi
+
+	shll	$16, %edi		C 2mod3 low
+	movl	SAVE_EBX, %ebx
+
+	addl	%edi, %eax		C apply 2mod3 low
+	movl	SAVE_ESI, %esi
+
+	popl	%edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mode1o.asm b/third_party/gmp/mpn/x86/k6/mode1o.asm
new file mode 100644
index 0000000..4a338bd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mode1o.asm
@@ -0,0 +1,176 @@
+dnl  AMD K6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2003, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C A special case for high<divisor at the end measured only about 4 cycles
+C faster, and so isn't used.
+C
+C A special case for size==1 using a divl rather than the inverse measured
+C only about 5 cycles faster, and so isn't used.  When size==1 and
+C high<divisor it can skip a division and be a full 24 cycles faster, but
+C this isn't an important case.
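
The same inverse trick gives a remainder-style divisibility test: only the
running carry is kept, and the result is zero exactly when the divisor
divides src,size, which is all the divisibility-test callers need (the
nonzero results differ from the true remainder by a power of the limb
base).  A sketch in C (mp_limb_t as uint32_t; binvert() stands in for the
binvert_limb_table setup, and the ref_ names are made up):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    static mp_limb_t binvert(mp_limb_t d)   /* inverse of odd d mod 2^32 */
    {
        mp_limb_t inv = d;                  /* 3 bits correct */
        for (int i = 0; i < 4; i++)
            inv *= 2 - d * inv;             /* double the correct bits */
        return inv;
    }

    static mp_limb_t ref_modexact_1_odd(const mp_limb_t *src, size_t size,
                                        mp_limb_t d)
    {
        mp_limb_t inv = binvert(d), carry = 0;
        for (size_t i = 0; i < size; i++) {
            mp_limb_t s = src[i] - carry;
            mp_limb_t borrow = (s > src[i]);
            carry = (mp_limb_t)(((uint64_t)(s * inv) * d) >> 32) + borrow;
        }
        return carry;       /* the 1c entry point seeds carry instead */
    }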
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	movl	PARAM_CARRY, %edx
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	xorl	%edx, %edx
+L(start_1c):
+	pushl	%edi		FRAME_pushl()
+
+	shrl	%ecx			C d/2
+	movl	PARAM_DIVISOR, %esi
+
+	andl	$127, %ecx		C d/2, 7 bits
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edi)
+Zdisp(	movzbl,	0,(%ecx,%edi), %edi)		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%ecx), %edi	C inv 8 bits
+')
+	leal	(%edi,%edi), %ecx	C 2*inv
+
+	imull	%edi, %edi		C inv*inv
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ebp
+
+	imull	%esi, %edi		C inv*inv*d
+
+	pushl	%ebx		FRAME_pushl()
+	leal	(%eax,%ebp,4), %ebx	C src end
+
+	subl	%edi, %ecx		C inv = 2*inv - inv*inv*d
+	leal	(%ecx,%ecx), %edi	C 2*inv
+
+	imull	%ecx, %ecx		C inv*inv
+
+	movl	(%eax), %eax		C src low limb
+	negl	%ebp			C -size
+
+	imull	%esi, %ecx		C inv*inv*d
+
+	subl	%ecx, %edi		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax
+	movl	%esi, %eax
+	imull	%edi, %eax
+	cmpl	$1, %eax
+	popl	%eax')
+
+	jmp	L(entry)
+
+
+C Rotating the mul to the top of the loop saves 1 cycle, presumably by
+C hiding the loop control under the imul latency.
+C
+C The run time is 10 cycles, but decoding is only 9 (and the dependent chain
+C only 8).  It's not clear how to get down to 9 cycles.
+C
+C The xor and rcl to handle the carry bit could be an sbb instead, with the
+C carry bit add becoming a sub, but that doesn't save anything.
+
+L(top):
+	C eax	(low product)
+	C ebx	src end
+	C ecx	carry bit, 0 or 1
+	C edx	(high product, being carry limb)
+	C esi	divisor
+	C edi	inverse
+	C ebp	counter, limbs, negative
+
+	mull	%esi
+
+	movl	(%ebx,%ebp,4), %eax
+	addl	%ecx, %edx		C apply carry bit to carry limb
+
+L(entry):
+	xorl	%ecx, %ecx
+	subl	%edx, %eax		C apply carry limb
+
+	rcll	%ecx
+
+	imull	%edi, %eax
+
+	incl	%ebp
+	jnz	L(top)
+
+
+
+	popl	%ebx
+	popl	%ebp
+
+	mull	%esi
+
+	popl	%edi
+	popl	%esi
+
+	leal	(%ecx,%edx), %eax
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k6/mul_1.asm b/third_party/gmp/mpn/x86/k6/mul_1.asm
new file mode 100644
index 0000000..3ef7ec2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mul_1.asm
@@ -0,0 +1,292 @@
+dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
+
+dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12		 5.5
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 4.87
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6			 6.25
+C AMD K7
+C AMD K8
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       mp_limb_t multiplier, mp_limb_t carry);
+C
+C Multiply src,size by mult and store the result in dst,size.
+C Return the carry limb from the top of the result.
+C
+C mpn_mul_1c() accepts an initial carry for the calculation; it's added into
+C the low limb of the result.
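
A reference for the semantics (C sketch, mp_limb_t as uint32_t, ref_ name
made up); each iteration is exactly the mull/addl/adcl group repeated in
the unrolled loop below:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    /* dst,size = src,size * multiplier; return the high carry limb. */
    static mp_limb_t ref_mul_1(mp_limb_t *dst, const mp_limb_t *src,
                               size_t size, mp_limb_t multiplier)
    {
        mp_limb_t carry = 0;    /* mpn_mul_1c starts from its argument */
        for (size_t i = 0; i < size; i++) {
            uint64_t p = (uint64_t)src[i] * multiplier + carry;
            dst[i] = (mp_limb_t)p;
            carry = (mp_limb_t)(p >> 32);
        }
        return carry;
    }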
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+dnl  minimum 5 because the unrolled code can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_mul_1c)
+	pushl	%esi
+deflit(`FRAME',4)
+	movl	PARAM_CARRY, %esi
+	jmp	L(start_nc)
+EPILOGUE()
+
+
+PROLOGUE(mpn_mul_1)
+	push	%esi
+deflit(`FRAME',4)
+	xorl	%esi, %esi	C initial carry
+
+L(start_nc):
+	mov	PARAM_SIZE, %ecx
+	push	%ebx
+FRAME_pushl()
+
+	movl	PARAM_SRC, %ebx
+	push	%edi
+FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	pushl	%ebp
+FRAME_pushl()
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_MULTIPLIER, %ebp
+
+	jae	L(unroll)
+
+
+	C code offset 0x22 here, close enough to aligned
+L(simple):
+	C eax	scratch
+	C ebx	src
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	dst
+	C ebp	multiplier
+	C
+	C this loop 8 cycles/limb
+
+	movl	(%ebx), %eax
+	addl	$4, %ebx
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi)
+	addl	$4, %edi
+
+	loop	L(simple)
+
+
+	popl	%ebp
+
+	popl	%edi
+	popl	%ebx
+
+	movl	%esi, %eax
+	popl	%esi
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C The code for each limb is 6 cycles, with instruction decoding being the
+C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
+C cycles/limb in total.
+C
+C The secret ingredient to get 6.25 is to start the loop with the mul and
+C have the load/store pair at the end.  Rotating the load/store to the top
+C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
+C
+C The whole unrolled loop fits nicely in exactly 80 bytes.
+
+
+	ALIGN(16)	C already aligned to 16 here actually
+L(unroll):
+	movl	(%ebx), %eax
+	leal	-16(%ebx,%ecx,4), %ebx
+
+	leal	-16(%edi,%ecx,4), %edi
+	subl	$4, %ecx
+
+	negl	%ecx
+
+
+	ALIGN(16)	C one byte nop for this alignment
+L(top):
+	C eax	scratch
+	C ebx	&src[size-4]
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	&dst[size-4]
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	movl	4(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 4(%edi,%ecx,4)
+	movl	8(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	12(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 12(%edi,%ecx,4)
+	movl	16(%ebx,%ecx,4), %eax
+
+
+	addl	$4, %ecx
+	js	L(top)
+
+
+
+	C eax	next src limb
+	C ebx	&src[size-4]
+	C ecx	0 to 3 representing respectively 4 to 1 further limbs
+	C edx
+	C esi	carry
+	C edi	&dst[size-4]
+
+	testb	$2, %cl
+	jnz	L(finish_not_two)
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	movl	4(%ebx,%ecx,4), %eax
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 4(%edi,%ecx,4)
+	movl	8(%ebx,%ecx,4), %eax
+
+	addl	$2, %ecx
+L(finish_not_two):
+
+
+	testb	$1, %cl
+	jnz	L(finish_not_one)
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, 8(%edi)
+	movl	12(%ebx), %eax
+L(finish_not_one):
+
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	popl	%ebp
+
+	adcl	$0, %edx
+
+	movl	%eax, 12(%edi)
+	popl	%edi
+
+	popl	%ebx
+	movl	%edx, %eax
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/mul_basecase.asm b/third_party/gmp/mpn/x86/k6/mul_basecase.asm
new file mode 100644
index 0000000..7030001
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/mul_basecase.asm
@@ -0,0 +1,612 @@
+dnl  AMD K6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: approx 9.0 cycles per cross product on 30x30 limbs (with 16 limbs/loop
+C     unrolling).
+
+
+
+dnl  K6: UNROLL_COUNT cycles/product (approx)
+dnl           8           9.75
+dnl          16           9.3
+dnl          32           9.3
+dnl  Maximum possible with the current code is 32.
+dnl
+dnl  With 16 the inner unrolled loop fits exactly in a 256 byte block, which
+dnl  might explain its good performance.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() entry code only
+C once.  The saving is about 10-20% on typical sizes coming from the
+C Karatsuba multiply code.
+C
+C Enhancements:
+C
+C The mul_1 loop is about 8.5 c/l, which is slower than mpn_mul_1 at 6.25
+C c/l.  Could call mpn_mul_1 when ysize is big enough to make it worthwhile.
+C
+C The main unrolled addmul loop could be shared by mpn_addmul_1, using some
+C extra stack setups and maybe 2 or 3 wasted cycles at the end.  Code saving
+C would be 256 bytes.
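
The overall structure in portable C (a sketch, mp_limb_t as uint32_t,
ref_ names made up): one mul_1 pass writes wp[0..xsize] directly, which is
why wp needs no zeroing, then each further yp limb is an addmul_1 pass
shifted up one limb:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    /* dst,size += src,size * multiplier; return the high carry limb. */
    static mp_limb_t ref_addmul_1(mp_limb_t *dst, const mp_limb_t *src,
                                  size_t size, mp_limb_t multiplier)
    {
        mp_limb_t carry = 0;
        for (size_t i = 0; i < size; i++) {
            uint64_t p = (uint64_t)src[i] * multiplier + dst[i] + carry;
            dst[i] = (mp_limb_t)p;
            carry = (mp_limb_t)(p >> 32);
        }
        return carry;
    }

    static void ref_mul_basecase(mp_limb_t *wp,
                                 const mp_limb_t *xp, size_t xsize,
                                 const mp_limb_t *yp, size_t ysize)
    {
        mp_limb_t carry = 0;
        for (size_t i = 0; i < xsize; i++) {    /* mul_1 pass for yp[0] */
            uint64_t p = (uint64_t)xp[i] * yp[0] + carry;
            wp[i] = (mp_limb_t)p;
            carry = (mp_limb_t)(p >> 32);
        }
        wp[xsize] = carry;

        for (size_t j = 1; j < ysize; j++)      /* addmul_1 passes */
            wp[xsize + j] = ref_addmul_1(wp + j, xp, xsize, yp[j]);
    }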
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+	movl	(%eax), %eax	C yp low limb
+
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two_limbs)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	movl	(%edx), %edx	C xp low limb
+	movl	PARAM_WP, %ecx
+
+	mull	%edx
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+	decl	PARAM_YSIZE
+	pushl	%ebx
+deflit(`FRAME',4)
+
+	movl	PARAM_WP, %ebx
+	pushl	%esi
+deflit(`FRAME',8)
+
+	movl	%eax, %ecx	C yp low limb
+	movl	(%edx), %eax	C xp low limb
+
+	movl	%edx, %esi	C xp
+	jnz	L(two_by_two)
+
+
+	C two limbs by one limb
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	movl	%edx, %esi	C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+	movl	%esi, 4(%ebx)
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	popl	%esi
+
+	popl	%ebx
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp low limb
+	C ebx	wp
+	C ecx	yp low limb
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	push	%edi
+deflit(`FRAME',12)
+	movl	%eax, (%ebx)
+
+	movl	4(%esi), %eax
+	movl	%edx, %edi	C carry, for wp[1]
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+
+	movl	%edi, 4(%ebx)
+	movl	4(%ecx), %ecx	C yp[1]
+
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+
+	adcl	$0, %edx
+
+	movl	(%esi), %eax	C xp[0]
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	adcl	%edx, %edi
+	adcl	$0, %esi
+
+	movl	%edi, 8(%ebx)
+	popl	%edi
+
+	movl	%esi, 12(%ebx)
+	popl	%esi
+
+	popl	%ebx
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two_limbs):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline.  Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 times).  A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 10-20
+C limb operations the Karatsuba code calls here with.
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',0)
+
+	pushl	%edi		defframe_pushl(SAVE_EDI)
+	pushl	%ebp		defframe_pushl(SAVE_EBP)
+
+	movl	PARAM_WP, %edi
+	pushl	%esi		defframe_pushl(SAVE_ESI)
+
+	movl	%eax, %ebp
+	pushl	%ebx		defframe_pushl(SAVE_EBX)
+
+	leal	(%edx,%ecx,4), %ebx	C xp end
+	xorl	%esi, %esi
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	xp end
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	carry
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%ebx,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi,%ecx,4)
+	incl	%ecx
+
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	%esi, (%edi)		C final carry
+
+	movl	PARAM_XSIZE, %ecx
+	decl	%edx
+
+	jnz	L(ysize_more_than_one_limb)
+
+	popl	%ebx
+	popl	%esi
+	popl	%ebp
+	popl	%edi
+	ret
+
+
+L(ysize_more_than_one_limb):
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+C Simple addmul loop.
+C
+C Using ebx and edi pointing at the ends of their respective locations saves
+C a couple of instructions in the outer loop.  The inner loop is still 11
+C cycles, the same as the simple loop in aorsmul_1.asm.
+
+	C eax	yp
+	C ebx	xp end
+	C ecx	xsize
+	C edx	ysize-1
+	C esi
+	C edi	wp end of mul1
+	C ebp
+
+	movl	4(%eax), %ebp		C multiplier
+	negl	%ecx
+
+	movl	%ecx, PARAM_XSIZE	C -xsize
+	xorl	%esi, %esi		C initial carry
+
+	leal	4(%eax,%edx,4), %eax	C yp end
+	negl	%edx
+
+	movl	%eax, PARAM_YP
+	movl	%edx, PARAM_YSIZE
+
+	jmp	L(simple_outer_entry)
+
+
+	C aligning here saves a couple of cycles
+	ALIGN(16)
+L(simple_outer_top):
+	C edx	ysize counter, negative
+
+	movl	PARAM_YP, %eax		C yp end
+	xorl	%esi, %esi		C carry
+
+	movl	PARAM_XSIZE, %ecx	C -xsize
+	movl	%edx, PARAM_YSIZE
+
+	movl	(%eax,%edx,4), %ebp	C yp limb multiplier
+L(simple_outer_entry):
+	addl	$4, %edi
+
+
+L(simple_inner):
+	C eax	scratch
+	C ebx	xp end
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	carry
+	C edi	wp end of this addmul
+	C ebp	multiplier
+
+	movl	(%ebx,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	$0, %edx
+	addl	%eax, (%edi,%ecx,4)
+	adcl	%edx, %esi
+
+	incl	%ecx
+	jnz	L(simple_inner)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	%esi, (%edi)
+
+	incl	%edx
+	jnz	L(simple_outer_top)
+
+
+	popl	%ebx
+	popl	%esi
+	popl	%ebp
+	popl	%edi
+	ret
+
+
+C -----------------------------------------------------------------------------
+C Unrolled loop.
+C
+C The unrolled inner loop is the same as in aorsmul_1.asm, see that code for
+C some comments.
+C
+C VAR_COUNTER is for the inner loop, running from VAR_COUNTER_INIT down to
+C 0, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C PARAM_XP and PARAM_WP get offset appropriately for where the unrolled loop
+C is entered.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.  This can't just be fetched through the xp
+C pointer because of the offset applied to it.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C PARAM_WP is similarly offset so that the PARAM_YSIZE counter can be added
+C to give the starting point in the destination for each unrolled loop (this
+C point is one limb upwards for each limb of yp processed).
+C
+C Having PARAM_YSIZE count negative to zero means it's not necessary to
+C store new values of PARAM_YP and PARAM_WP on each loop.  Those values on
+C the stack remain constant and on each loop a leal adjusts them with the
+C PARAM_YSIZE counter value.
+
+
+defframe(VAR_COUNTER,      -20)
+defframe(VAR_COUNTER_INIT, -24)
+defframe(VAR_JMP,          -28)
+defframe(VAR_XP_LOW,       -32)
+deflit(VAR_STACK_SPACE, 16)
+
+dnl  For some strange reason using (%esp) instead of 0(%esp) is a touch
+dnl  slower in this code, hence the defframe empty-if-zero feature is
+dnl  disabled.
+dnl
+dnl  If VAR_COUNTER is at (%esp), the effect is worse.  In this case the
+dnl  unrolled loop is 255 instead of 256 bytes, but quite how this affects
+dnl  anything isn't clear.
+dnl
+define(`defframe_empty_if_zero_disabled',1)
+
+L(unroll):
+	C eax	yp (not used)
+	C ebx	xp end (not used)
+	C ecx	xsize
+	C edx	ysize-1
+	C esi
+	C edi	wp end of mul1 (not used)
+	C ebp
+deflit(`FRAME', 16)
+
+	leal	-2(%ecx), %ebp	C one limb processed at start,
+	decl	%ecx		C and ebp is one less
+
+	shrl	$UNROLL_LOG2, %ebp
+	negl	%ecx
+
+	subl	$VAR_STACK_SPACE, %esp
+deflit(`FRAME', 16+VAR_STACK_SPACE)
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ecx, %esi
+	shll	$4, %ecx
+
+	movl	%ebp, VAR_COUNTER_INIT
+	negl	%esi
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_entry) (%ecx,%esi,1), %ecx
+')
+
+	movl	PARAM_XP, %ebx
+	movl	%ebp, VAR_COUNTER
+
+	movl	PARAM_WP, %edi
+	movl	%ecx, VAR_JMP
+
+	movl	(%ebx), %eax
+	leal	4(%edi,%esi,4), %edi	C wp adjust for unrolling and mul1
+
+	leal	(%ebx,%esi,4), %ebx	C xp adjust for unrolling
+
+	movl	%eax, VAR_XP_LOW
+
+	movl	%ebx, PARAM_XP
+	movl	PARAM_YP, %ebx
+
+	leal	(%edi,%edx,4), %ecx	C wp adjust for ysize indexing
+	movl	4(%ebx), %ebp		C multiplier (yp second limb)
+
+	leal	4(%ebx,%edx,4), %ebx	C yp adjust for ysize indexing
+
+	movl	%ecx, PARAM_WP
+
+	leal	1(%esi), %ecx	C adjust parity for decl %ecx above
+
+	movl	%ebx, PARAM_YP
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%esi,1), %ecx
+	addl	$L(unroll_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	C Aligning here saves a couple of cycles per loop.  Using 32 doesn't
+	C cost any extra space, since the inner unrolled loop below is
+	C aligned to 32.
+	ALIGN(32)
+L(unroll_outer_top):
+	C edx	ysize
+
+	movl	PARAM_YP, %eax
+	movl	%edx, PARAM_YSIZE	C incremented ysize counter
+
+	movl	PARAM_WP, %edi
+
+	movl	VAR_COUNTER_INIT, %ebx
+	movl	(%eax,%edx,4), %ebp	C next multiplier
+
+	movl	PARAM_XSIZE, %ecx
+	leal	(%edi,%edx,4), %edi	C adjust wp for where we are in yp
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebx, VAR_COUNTER
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	C using testb is a tiny bit faster than testl
+	testb	$1, %cl
+
+	movl	%eax, %ecx	C low carry
+	movl	VAR_JMP, %eax
+
+	movl	%edx, %esi	C high carry
+	movl	PARAM_XP, %ebx
+
+	jnz	L(unroll_noswap)
+	movl	%ecx, %esi	C high,low carry other way around
+
+	movl	%edx, %ecx
+L(unroll_noswap):
+
+	jmp	*%eax
+
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_top):
+	C eax	scratch
+	C ebx	xp
+	C ecx	carry low
+	C edx	scratch
+	C esi	carry high
+	C edi	wp
+	C ebp	multiplier
+	C VAR_COUNTER  loop counter
+	C
+	C 15 code bytes each limb
+
+	leal	UNROLL_BYTES(%edi), %edi
+
+L(unroll_entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4))
+	deflit(`disp1', eval(disp0 + 4))
+	deflit(`disp2', eval(disp1 + 4))
+
+	movl	disp1(%ebx), %eax
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	adcl	%eax, %esi
+	movl	%edx, %ecx
+	jadcl0( %ecx)
+
+	movl	disp2(%ebx), %eax
+	mull	%ebp
+	addl	%esi, disp1(%edi)
+	adcl	%eax, %ecx
+	movl	%edx, %esi
+	jadcl0( %esi)
+')
+
+	decl	VAR_COUNTER
+	leal	UNROLL_BYTES(%ebx), %ebx
+
+	jns	L(unroll_top)
+
+
+	movl	PARAM_YSIZE, %edx
+	addl	%ecx, UNROLL_BYTES(%edi)
+
+	adcl	$0, %esi
+
+	incl	%edx
+	movl	%esi, UNROLL_BYTES+4(%edi)
+
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+
+	addl	$FRAME, %esp
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/pre_mod_1.asm b/third_party/gmp/mpn/x86/k6/pre_mod_1.asm
new file mode 100644
index 0000000..34db20d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/pre_mod_1.asm
@@ -0,0 +1,146 @@
+dnl  AMD K6 mpn_preinv_mod_1 -- mpn by 1 remainder, with pre-inverted divisor.
+
+dnl  Copyright 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 18.0 cycles/limb
+
+
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                             mp_limb_t inverse);
+C
+C This code is only 2 c/l faster than a simple divl, but that's 10% so it's
+C considered worthwhile (just).
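
Behaviourally the routine is just a remainder (a C sketch; mp_limb_t as
uint32_t, divisor normalized with its high bit set as the second ASSERT
below checks, ref_ name made up).  The point of the asm is to replace the
division in each step with multiplications by the precomputed inverse,
following the division-by-invariant-integers method of Granlund and
Montgomery, whose n2/n10/nadj/q1 notation the loop comments use:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint32_t mp_limb_t;

    static mp_limb_t ref_preinv_mod_1(const mp_limb_t *src, size_t size,
                                      mp_limb_t divisor)
    {
        uint64_t r = 0;     /* remainder so far, always < divisor */
        for (size_t i = size; i-- > 0; )
            r = ((r << 32) | src[i]) % divisor;
        return (mp_limb_t)r;
    }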
+
+defframe(PARAM_INVERSE,16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,    8)
+defframe(PARAM_SRC,     4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+
+	ASSERT(ae,`cmpl $1, PARAM_SIZE')
+	ASSERT(nz,`testl $0x80000000, PARAM_DIVISOR')
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SRC, %ebp
+	pushl	%edi	FRAME_pushl()
+
+	movl	PARAM_DIVISOR, %eax
+	pushl	%esi	FRAME_pushl()
+
+	movl	-4(%ebp,%ecx,4), %esi	C src high limb
+	pushl	%ebx	FRAME_pushl()
+
+	movl	%edx, %edi		C first n2 to cancel
+	subl	%eax, %esi		C first n1 = high-divisor
+
+	decl	%ecx
+	jz	L(done_sbbl)
+
+L(top):
+	C eax	scratch
+	C ebx	n10, nadj, q1
+	C ecx	counter, size to 1
+	C edx	scratch
+	C esi	n2
+	C edi	old high, for underflow test
+	C ebp	src
+
+	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
+
+L(entry):
+	andl	PARAM_DIVISOR, %edi
+L(q1_ff_top):
+	movl	-4(%ebp,%ecx,4), %ebx
+
+	addl	%esi, %edi	    C possible addback
+	movl	%ebx, %esi	    C n10
+
+	sarl	$31, %ebx	    C -n1 = 0 or -1
+	movl	%edi, %eax	    C n2
+
+	movl	PARAM_INVERSE, %edx
+	subl	%ebx, %eax	    C n2+n1
+
+	mull	%edx		    C m*(n2+n1)
+
+	andl	PARAM_DIVISOR, %ebx C -n1 & d
+	addl	%esi, %ebx	    C nadj = n10 + (-n1&d), ignoring overflow
+
+	addl	%ebx, %eax	    C low m*(n2+n1) + nadj, giving carry flag
+	leal	1(%edi), %ebx	    C n2+1
+
+	adcl	%ebx, %edx	    C 1+high(n2<<32+m*(n2+n1)+nadj) = q1+1
+
+	movl	PARAM_DIVISOR, %eax C d
+	jz	L(q1_ff)
+
+	mull	%edx		    C (q1+1)*d
+
+	subl	%eax, %esi	    C low  n-(q1+1)*d
+	loop	L(top)
+
+
+
+L(done_sbbl):
+	sbbl	%edx, %edi	    C high n-(q1+1)*d, 0 or -1
+
+	andl	PARAM_DIVISOR, %edi
+L(done_esi_edi):
+	popl	%ebx
+
+	leal	(%esi,%edi), %eax
+	popl	%esi
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d.  This is rarely
+C reached.
+
+L(q1_ff):
+	movl	PARAM_DIVISOR, %edi
+	loop	L(q1_ff_top)
+
+	jmp	L(done_esi_edi)
+
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k6/sqr_basecase.asm b/third_party/gmp/mpn/x86/k6/sqr_basecase.asm
new file mode 100644
index 0000000..b7ecb5c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k6/sqr_basecase.asm
@@ -0,0 +1,680 @@
+dnl  AMD K6 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
+C     product (measured on the speed difference between 17 and 33 limbs,
+C     which is roughly the Karatsuba recursing range).
+
+
+dnl  SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
+dnl  code supports.  This value is used only by the tune program to know
+dnl  what it can go up to.  (An attempt to compile with a bigger value will
+dnl  trigger some m4_assert()s in the code, making the build fail.)
+dnl
+dnl  The value is determined by requiring the displacements in the unrolled
+dnl  addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
+dnl  63, giving a maximum SQR_TOOM2_THRESHOLD of 66.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+
+dnl  Allow a value from the tune program to override config.m4.
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+
+dnl  UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
+dnl  number required is determined by SQR_TOOM2_THRESHOLD, since
+dnl  mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
+dnl
+dnl  The first addmul is the biggest, and this takes the second least
+dnl  significant limb and multiplies it by the third least significant and
+dnl  up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
+dnl  limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed
+C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 35x35 that do need all of it will
+C at least be getting value for money, because 35x35 spends something like
+C 5780 cycles here.
+C
+C Different values of UNROLL_COUNT give slightly different speeds, between
+C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
+C This isn't a big difference, but it's presumably some alignment effect
+C which, if understood, could give a simple speedup.
+
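+C As a rough C-level sketch of the scheme (an illustration only, in terms
+C of the usual mpn primitives; the code below inlines all of this):
+C
+C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	for (i = 1; i < size-1; i++)
+C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
+C	mpn_lshift (dst+1, dst+1, 2*size-2, 1);  /* double cross products */
+C	/* then add src[i]^2 into dst[2*i],dst[2*i+1], for i=0..size-1 */
+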
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %ecx
+	je	L(two_limbs)
+
+	movl	PARAM_DST, %edx
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+
+	movl	(%eax), %eax
+	movl	%edx, %ecx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+
+	pushl	%ebx
+	movl	%eax, %ebx	C src
+deflit(`FRAME',4)
+
+	movl	(%ebx), %eax
+	movl	PARAM_DST, %ecx
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)
+	movl	4(%ebx), %eax
+
+	movl	%edx, 4(%ecx)
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, 8(%ecx)
+	movl	(%ebx), %eax
+
+	movl	%edx, 12(%ecx)
+	movl	4(%ebx), %edx
+
+	mull	%edx		C src[0]*src[1]
+
+	addl	%eax, 4(%ecx)
+
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+
+	popl	%ebx
+	addl	%eax, 4(%ecx)
+
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+deflit(`FRAME',0)
+	cmpl	$4, %ecx
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+	C eax	src
+	C ecx	size
+	C edx	dst
+
+	pushl	%ebx
+	movl	%eax, %ebx	C src
+
+	movl	(%ebx), %eax
+	movl	%edx, %ecx	C dst
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	4(%ebx), %eax
+
+	movl	%edx, 4(%ecx)
+	pushl	%esi
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	8(%ebx), %eax
+
+	movl	%edx, 12(%ecx)
+	pushl	%edi
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	(%ebx), %eax
+
+	movl	%edx, 20(%ecx)
+	movl	4(%ebx), %edx
+
+	mull	%edx		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	(%ebx), %eax
+
+	movl	%edx, %edi
+	movl	8(%ebx), %edx
+
+	pushl	%ebp
+	xorl	%ebp, %ebp
+
+	mull	%edx		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	4(%ebx), %eax
+
+	adcl	%edx, %ebp
+
+	movl	8(%ebx), %edx
+
+	mull	%edx		C src[1] * src[2]
+
+	addl	%eax, %ebp
+
+	adcl	$0, %edx
+
+
+	C eax	will be dst[5]
+	C ebx
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	xorl	%eax, %eax
+	addl	%esi, %esi
+	adcl	%edi, %edi
+	adcl	%ebp, %ebp
+	adcl	%edx, %edx
+	adcl	$0, %eax
+
+	addl	%esi, 4(%ecx)
+	adcl	%edi, 8(%ecx)
+	adcl	%ebp, 12(%ecx)
+
+	popl	%ebp
+	popl	%edi
+
+	adcl	%edx, 16(%ecx)
+
+	popl	%esi
+	popl	%ebx
+
+	adcl	%eax, 20(%ecx)
+	ASSERT(nc)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+
+defframe(SAVE_EBX,   -4)
+defframe(SAVE_ESI,   -8)
+defframe(SAVE_EDI,   -12)
+defframe(SAVE_EBP,   -16)
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP,    -24)
+deflit(STACK_SPACE, 24)
+
+	ALIGN(16)
+L(four_or_more):
+
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C
+C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
+C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
+C a 5780 cycle operation, which is not surprising since the loop here is 8
+C c/l and mpn_mul_1 is 6.25 c/l.
+
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+
+	movl	%edi, SAVE_EDI
+	leal	4(%edx), %edi
+
+	movl	%ebx, SAVE_EBX
+	leal	4(%eax), %ebx
+
+	movl	%esi, SAVE_ESI
+	xorl	%esi, %esi
+
+	movl	%ebp, SAVE_EBP
+
+	C eax
+	C ebx	src+4
+	C ecx	size
+	C edx
+	C esi
+	C edi	dst+4
+	C ebp
+
+	movl	(%eax), %ebp	C multiplier
+	leal	-1(%ecx), %ecx	C size-1, and pad to a 16 byte boundary
+
+
+	ALIGN(16)
+L(mul_1):
+	C eax	scratch
+	C ebx	src ptr
+	C ecx	counter
+	C edx	scratch
+	C esi	carry
+	C edi	dst ptr
+	C ebp	multiplier
+
+	movl	(%ebx), %eax
+	addl	$4, %ebx
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	movl	$0, %esi
+
+	adcl	%edx, %esi
+
+	movl	%eax, (%edi)
+	addl	$4, %edi
+
+	loop	L(mul_1)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K6 doesn't do any branch prediction on indirect jumps, which is good
+C actually because it's a different target each time.  The unrolled addmul
+C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
+C the indirect jump is quickly recovered.
+
+
+dnl  This value is also implicitly encoded in a shift and add.
+dnl
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl  With the unmodified &src[size] and &dst[size] pointers, the
+dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl  values up to 31.  Above that an offset must be added to them.
+dnl
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+	C eax
+	C ebx	&src[size]
+	C ecx
+	C edx
+	C esi	carry
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%esi, (%edi)
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	movl	%ecx, %edx
+ifelse(OFFSET,0,,
+`	subl	$OFFSET, %ebx')
+
+	shll	$4, %ecx
+ifelse(OFFSET,0,,
+`	subl	$OFFSET, %edi')
+
+	negl	%ecx
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+	negl	%edx
+
+
+	C The calculated jump mustn't be before the start of the available
+	C code.  This is the limitation UNROLL_COUNT puts on the src operand
+	C size, but checked here using the jump address directly.
+	C
+	ASSERT(ae,`
+	movl_text_address( L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx
+	')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	&src[size], constant
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	high limb to store
+	C edi	dst ptr, high of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
+	movl	%edx, VAR_COUNTER
+
+	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand
+
+	mull	%ebp
+
+	testb	$1, %cl
+
+	movl	%edx, %esi	C high carry
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	leal	CODE_BYTES_PER_LIMB(%edx), %edx
+
+	movl	%edx, VAR_JMP
+	leal	4(%edi), %edi
+
+	C A branch-free version of this using some xors was found to be a
+	C touch slower than just a conditional jump, despite the jump
+	C switching between taken and not taken on every loop.
+
+ifelse(eval(UNROLL_COUNT%2),0,
+	jz,jnz)	L(unroll_noswap)
+	movl	%esi, %eax	C high,low carry other way around
+
+	movl	%ecx, %esi
+	movl	%eax, %ecx
+L(unroll_noswap):
+
+	jmp	*%edx
+
+
+	C Must be on an even address here so the low bit of the jump address
+	C will indicate which way around ecx/esi should start.
+	C
+	C An attempt was made at padding here to get the end of the unrolled
+	C code to come out on a good alignment, to save padding before
+	C L(corner).  This worked, but turned out to run slower than just an
+	C ALIGN(2).  The reason for this is not clear, it might be related
+	C to the different speeds on different UNROLL_COUNTs noted above.
+
+	ALIGN(2)
+
+L(unroll_inner_start):
+	C eax	scratch
+	C ebx	src
+	C ecx	carry low
+	C edx	scratch
+	C esi	carry high
+	C edi	dst
+	C ebp	multiplier
+	C
+	C 15 code bytes each limb
+	C ecx/esi swapped on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+	deflit(`disp_src', eval(-i*4 + OFFSET))
+	deflit(`disp_dst', eval(disp_src - 4))
+
+	m4_assert(`disp_src>=-128 && disp_src<128')
+	m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp(	movl,	disp_src,(%ebx), %eax)
+	mull	%ebp
+Zdisp(	addl,	%esi, disp_dst,(%edi))
+	adcl	%eax, %ecx
+	movl	%edx, %esi
+	jadcl0( %esi)
+',`
+	dnl  this one comes out last
+Zdisp(	movl,	disp_src,(%ebx), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp_dst,(%edi))
+	adcl	%eax, %esi
+	movl	%edx, %ecx
+	jadcl0( %ecx)
+')
+')
+L(unroll_inner_end):
+
+	addl	%esi, -4+OFFSET(%edi)
+
+	movl	VAR_COUNTER, %edx
+	jadcl0(	%ecx)
+
+	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)
+	movl	VAR_JMP, %ecx
+
+	incl	%edx
+	jnz	L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+	addl	$OFFSET, %ebx
+	addl	$OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(corner):
+	C ebx	&src[size]
+	C edi	&dst[2*size-5]
+
+	movl	-12(%ebx), %ebp
+
+	movl	-8(%ebx), %eax
+	movl	%eax, %ecx
+
+	mull	%ebp
+
+	addl	%eax, -4(%edi)
+	adcl	$0, %edx
+
+	movl	-4(%ebx), %eax
+	movl	%edx, %esi
+	movl	%eax, %ebx
+
+	mull	%ebp
+
+	addl	%esi, %eax
+	adcl	$0, %edx
+
+	addl	%eax, (%edi)
+	adcl	$0, %edx
+
+	movl	%edx, %esi
+	movl	%ebx, %eax
+
+	mull	%ecx
+
+	addl	%esi, %eax
+	movl	%eax, 4(%edi)
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+C The loop measures about 6 cycles/iteration, though it looks like it should
+C decode in 5.
+
+L(lshift_start):
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_DST, %edi
+	subl	$1, %ecx		C size-1 and clear carry
+
+	movl	PARAM_SRC, %ebx
+	movl	%ecx, %edx
+
+	xorl	%eax, %eax		C ready for adcl
+
+
+	ALIGN(16)
+L(lshift):
+	C eax
+	C ebx	src (for later use)
+	C ecx	counter, decrementing
+	C edx	size-1 (for later use)
+	C esi
+	C edi	dst, incrementing
+	C ebp
+
+	rcll	4(%edi)
+	rcll	8(%edi)
+	leal	8(%edi), %edi
+	loop	L(lshift)
+
+
+	adcl	%eax, %eax
+
+	movl	%eax, 4(%edi)		C dst most significant limb
+	movl	(%ebx), %eax		C src[0]
+
+	leal	4(%ebx,%edx,4), %ebx	C &src[size]
+	subl	%edx, %ecx		C -(size-1)
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+	mull	%eax
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+
+
+	ALIGN(16)
+L(diag):
+	C eax	scratch
+	C ebx	&src[size]
+	C ecx	counter, negative
+	C edx	carry
+	C esi	scratch
+	C edi	dst[2*size-2]
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	movl	%edx, %esi
+
+	mull	%eax
+
+	addl	%esi, 4(%edi,%ecx,8)
+	adcl	%eax, 8(%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_ESI, %esi
+
+	addl	%edx, 4(%edi)		C dst most significant limb
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	addl	(%esp), %ecx
+	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/README b/third_party/gmp/mpn/x86/k7/README
new file mode 100644
index 0000000..5711b61
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/README
@@ -0,0 +1,174 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                      AMD K7 MPN SUBROUTINES
+
+
+This directory contains code optimized for the AMD Athlon CPU.
+
+The mmx subdirectory has routines using MMX instructions.  All Athlons have
+MMX; the separate directory is just so that configure can omit it if the
+assembler doesn't support MMX.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache.
+
+                               cycles/limb
+	mpn_add/sub_n             1.6
+
+	mpn_copyi                 0.75 or 1.0   \ varying with data alignment
+	mpn_copyd                 0.75 or 1.0   /
+
+	mpn_divrem_1             17.0 integer part, 15.0 fractional part
+	mpn_mod_1                17.0
+	mpn_divexact_by3          8.0
+
+	mpn_l/rshift              1.2
+
+	mpn_mul_1                 3.4
+	mpn_addmul/submul_1       3.9
+
+	mpn_mul_basecase          4.42 cycles/crossproduct (approx)
+	mpn_sqr_basecase          2.3 cycles/crossproduct (approx)
+				  or 4.55 cycles/triangleproduct (approx)
+
+Prefetching of sources hasn't yet been tried.
+
+
+
+NOTES
+
+cmov, MMX, 3DNow and some extensions to MMX and 3DNow are available.
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Floating point multiplications can be done in parallel with integer
+multiplications, but there doesn't seem to be any way to make use of this.
+
+Unsigned "mul"s can be issued every 3 cycles.  This suggests 3 is a limit on
+the speed of the multiplication routines.  The documentation shows mul
+executing in IEU0 (or maybe in IEU0 and IEU1 together), so it might be that,
+to get near 3 cycles, code has to be arranged so that nothing else is issued
+to IEU0.  A busy IEU0 could explain why some code takes 4 cycles and other
+apparently equivalent code takes 5.
+
+
+
+OPTIMIZATIONS
+
+Unrolled loops are used to reduce looping overhead.  The unrolling is
+configurable up to 32 limbs/loop for most routines and up to 64 for some.
+The K7 has a 64k L1 code cache, so quite big unrolling is allowable.
+
+Computed jumps into the unrolling are used to handle sizes not a multiple of
+the unrolling.  An attractive feature of this is that times increase
+smoothly with operand size, but it may be that some routines should just
+have simple loops to finish up, especially when PIC adds between 2 and 16
+cycles to get %eip.
+
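+As a rough C analogy (a hypothetical sketch, not GMP code), a computed jump
+does the job of a Duff's device switch, entering an unrolled loop part-way
+through so the leftover size%N limbs are handled by the same code:
+
+	switch (size % 4) {		/* size >= 1 */
+	case 0: do {  *dst++ = *src++;
+	case 3:       *dst++ = *src++;
+	case 2:       *dst++ = *src++;
+	case 1:       *dst++ = *src++;
+		    } while ((size -= 4) > 0);
+	}
+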
+Position independent code is implemented using a call to get %eip for the
+computed jumps and a ret is always done, rather than an addl $4,%esp or a
+popl, so the CPU return address branch prediction stack stays synchronised
+with the actual stack in memory.
+
+Branch prediction, in the absence of any history, will guess forward jumps are
+not taken and backward jumps are taken.  Where possible it's arranged that
+the less likely or less important case is under a taken forward jump.
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three direct-path instructions which have no
+successive dependencies.  K7 always decodes three and has out-of-order
+execution, but the groupings show what slots might be available and what
+dependency chains exist.
+
+When there are vector-path instructions, an effort is made to get triplets of
+direct-path instructions in between them, even if there's dependencies,
+since this maximizes decoding throughput and might save a cycle or two if
+decoding is the limiting factor.
+
+
+
+INSTRUCTIONS
+
+adcl       direct
+divl       39 cycles back-to-back
+lodsl,etc  vector
+loop       1 cycle vector (decl/jnz opens up one decode slot)
+movd reg   vector
+movd mem   direct
+mull       issue every 3 cycles, latency 4 cycles low word, 6 cycles high word
+popl	   vector (use movl for more than one pop)
+pushl	   direct, will pair with a load
+shrdl %cl  vector, 3 cycles, seems to be 3 decode too
+xorl r,r   false read dependency recognised
+
+
+
+REFERENCES
+
+"AMD Athlon Processor X86 Code Optimization Guide", AMD publication number
+22007, revision K, February 2002.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22007.pdf
+
+"3DNow Technology Manual", AMD publication number 21928G/0-March 2000.
+This describes the femms and prefetch instructions.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/21928.pdf
+
+"AMD Extensions to the 3DNow and MMX Instruction Sets Manual", AMD
+publication number 22466, revision D, March 2000.  This describes
+instructions added in the Athlon processor, such as pswapd and the extra
+prefetch forms.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22466.pdf
+
+"3DNow Instruction Porting Guide", AMD publication number 22621, revision B,
+August 1999.  This has some notes on general Athlon optimizations as well as
+3DNow.  Available on-line,
+
+http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/22621.pdf
+
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/k7/addlsh1_n.asm b/third_party/gmp/mpn/x86/k7/addlsh1_n.asm
new file mode 100644
index 0000000..2cba1eb
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/addlsh1_n.asm
@@ -0,0 +1,196 @@
+dnl  AMD K7 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at an addlsh1_n for x86-32, not relying on sse2 insns.
+C The inner loop is 2*3-way unrolled, which is the best we can do with the available
+C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
+C cannot feed carry between operations there.
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 5.4	(worse than add_n + lshift)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 6
+C AMD K6			 ?
+C AMD K7			 2.5
+C AMD K8
+
+C This is a basic addlsh1_n for k7, atom, and perhaps some other x86-32
+C processors.  It uses 2*3-way unrolling, as dictated by the available
+C registers.  Unfortunately,
+C that means we need an initial magic multiply.
+C
+C It is not clear how to do sublsh1_n or rsblsh1_n using the same pattern.  We
+C cannot do rsblsh1_n since we feed carry from the shift blocks to the
+C add/subtract blocks, which is right for addition but reversed for
+C subtraction.  We could perhaps do sublsh1_n, with some extra move insns,
+C without losing any time, since we're limited by carry recurrency latency
+C rather than by issue width.
+C
+C Breaking carry recurrency might be a good idea.  We would then need separate
+C registers for the shift carry and add/subtract carry, which in turn would
+C force us to 2*2-way unrolling.
+
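+C The magic multiply computes size/6 without a division: 0x2aaaaaab is
+C ceil(2^32/6), so the high word of size * 0x2aaaaaab is floor(size/6) for
+C any size representable here.  A hedged C equivalent of the setup below:
+C
+C	count = ~(unsigned) (((unsigned long long) size * 0x2aaaaaab) >> 32);
+C	/* count = -(size/6) - 1, stepped towards zero per 6-limb chunk; */
+C	/* the leftover size % 6 limbs go through the L(oop) block first */
+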
+defframe(PARAM_SIZE,	16)
+defframe(PARAM_DBLD,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+dnl  re-use parameter space
+define(VAR_COUNT,`PARAM_DST')
+define(VAR_TMP,`PARAM_DBLD')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+define(`vp',  `%ebp')
+
+	mov	$0x2aaaaaab, %eax
+
+	push	%ebx			FRAME_pushl()
+	mov	PARAM_SIZE, %ebx	C size
+
+	push	rp			FRAME_pushl()
+	mov	PARAM_DST, rp
+
+	mul	%ebx
+
+	push	up			FRAME_pushl()
+	mov	PARAM_SRC, up
+
+	not	%edx			C count = -(size\6)-1
+	mov	%edx, VAR_COUNT
+
+	push	vp			FRAME_pushl()
+	mov	PARAM_DBLD, vp
+
+	lea	3(%edx,%edx,2), %ecx	C count*3+3 = -(size\6)*3
+	xor	%edx, %edx
+	lea	(%ebx,%ecx,2), %ebx	C size + (count*3+3)*2 = size % 6
+	or	%ebx, %ebx
+	jz	L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(vp), %eax
+	adc	%eax, %eax
+	rcr	%edx			C restore 1st saved carry bit
+	lea	4(vp), vp
+	adc	(up), %eax
+	lea	4(up), up
+	adc	%edx, %edx		C save a carry bit in edx
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	dec	%ebx
+	mov	%eax, (rp)
+	lea	4(rp), rp
+	jnz	L(oop)
+	mov	vp, VAR_TMP
+L(exact):
+	incl	VAR_COUNT
+	jz	L(end)
+
+	ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(vp), %eax
+	adc	%eax, %eax
+	mov	4(vp), %ebx
+	adc	%ebx, %ebx
+	mov	8(vp), %ecx
+	adc	%ecx, %ecx
+
+	rcr	%edx			C restore 1st saved carry bit
+
+	adc	(up), %eax
+	mov	%eax, (rp)
+	adc	4(up), %ebx
+	mov	%ebx, 4(rp)
+	adc	8(up), %ecx
+	mov	%ecx, 8(rp)
+
+	mov	12(vp), %eax
+	adc	%eax, %eax
+	mov	16(vp), %ebx
+	adc	%ebx, %ebx
+	mov	20(vp), %ecx
+	adc	%ecx, %ecx
+
+	lea	24(vp), vp
+	adc	%edx, %edx		C save a carry bit in edx
+
+	adc	12(up), %eax
+	mov	%eax, 12(rp)
+	adc	16(up), %ebx
+	mov	%ebx, 16(rp)
+	adc	20(up), %ecx
+
+	lea	24(up), up
+
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	mov	%ecx, 20(rp)
+	incl	VAR_COUNT
+	lea	24(rp), rp
+	jne	L(top)
+
+L(end):
+	pop	vp			FRAME_popl()
+	pop	up			FRAME_popl()
+
+ifdef(`CPU_P6',`
+	xor	%eax, %eax
+	shr	$1, %edx
+	adc	%edx, %eax
+',`
+	adc	$0, %edx
+	mov	%edx, %eax
+')
+	pop	rp			FRAME_popl()
+	pop	%ebx			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/aors_n.asm b/third_party/gmp/mpn/x86/k7/aors_n.asm
new file mode 100644
index 0000000..1a08072
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/aors_n.asm
@@ -0,0 +1,258 @@
+dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.64 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl  K7: UNROLL_COUNT cycles/limb
+dnl           8           1.9
+dnl          16           1.64
+dnl          32           1.7
+dnl          64           2.0
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_add_n', `
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+	define(M4_description, add)
+',`ifdef(`OPERATION_sub_n', `
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+	define(M4_description, subtract)
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                         mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C	                   mp_size_t size, mp_limb_t carry);
+C
+C Calculate src1,size M4_description src2,size, and store the result in
+C dst,size.  The return value is the carry bit from the top of the result (1
+C or 0).
+C
+C The _nc version accepts 1 or 0 for an initial carry into the low limb of
+C the calculation.  Note values other than 1 or 0 here will lead to garbage
+C results.
+C
+C This code runs at 1.64 cycles/limb, which might be the best possible with
+C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
+C which can be done each cycle, leading to 1.5 c/l.
+
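+C A hedged usage sketch, chaining the carry across two halves of a larger
+C operand via the _nc entry point:
+C
+C	cy = mpn_add_n  (dst,     a,     b,     n);      /* low half  */
+C	cy = mpn_add_nc (dst + n, a + n, b + n, n, cy);  /* high half */
+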
+dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 8)
+',`
+deflit(UNROLL_THRESHOLD, 8)
+')
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+defframe(SAVE_EBP, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+defframe(SAVE_EDI, -16)
+deflit(STACK_SPACE, 16)
+
+	TEXT
+	ALIGN(32)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function_nc)
+	movl	PARAM_CARRY, %eax
+	jmp	L(start)
+EPILOGUE()
+
+PROLOGUE(M4_function_n)
+
+	xorl	%eax, %eax	C carry
+L(start):
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%edi, SAVE_EDI
+	movl	%ebx, SAVE_EBX
+	cmpl	$UNROLL_THRESHOLD, %ecx
+
+	movl	PARAM_SRC2, %edx
+	movl	PARAM_SRC1, %ebx
+	jae	L(unroll)
+
+	movl	PARAM_DST, %edi
+	leal	(%ebx,%ecx,4), %ebx
+	leal	(%edx,%ecx,4), %edx
+
+	leal	(%edi,%ecx,4), %edi
+	negl	%ecx
+	shrl	%eax
+
+	C This loop is in a single 16 byte code block already, so no
+	C alignment necessary.
+L(simple):
+	C eax	scratch
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi
+	C edi	dst
+	C ebp
+
+	movl	(%ebx,%ecx,4), %eax
+	M4_inst	(%edx,%ecx,4), %eax
+	movl	%eax, (%edi,%ecx,4)
+	incl	%ecx
+	jnz	L(simple)
+
+	movl	$0, %eax
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	setc	%al
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	C This is at 0x55, close enough to aligned.
+L(unroll):
+deflit(`FRAME',STACK_SPACE)
+	movl	%ebp, SAVE_EBP
+	andl	$-2, %ecx		C size low bit masked out
+	andl	$1, PARAM_SIZE		C size low bit kept
+
+	movl	%ecx, %edi
+	decl	%ecx
+	movl	PARAM_DST, %ebp
+
+	shrl	$UNROLL_LOG2, %ecx
+	negl	%edi
+	movl	%esi, SAVE_ESI
+
+	andl	$UNROLL_MASK, %edi
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%edi,%edi,8), %esi	C 9 bytes per
+')
+	negl	%edi
+	shrl	%eax
+
+	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
+	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
+	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
+
+	jmp	*%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%edi,%edi,8), %esi
+	addl	$L(entry)-L(here), %esi
+	addl	(%esp), %esi
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(top):
+	C eax	zero
+	C ebx	src1
+	C ecx	counter
+	C edx	src2
+	C esi	scratch (was computed jump)
+	C edi	dst
+	C ebp	scratch
+
+	leal	UNROLL_BYTES(%edx), %edx
+
+L(entry):
+deflit(CHUNK_COUNT, 2)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%ebx), %esi)
+	movl	disp1(%ebx), %ebp
+Zdisp(	M4_inst,disp0,(%edx), %esi)
+Zdisp(	movl,	%esi, disp0,(%edi))
+	M4_inst	disp1(%edx), %ebp
+	movl	%ebp, disp1(%edi)
+')
+
+	decl	%ecx
+	leal	UNROLL_BYTES(%ebx), %ebx
+	leal	UNROLL_BYTES(%edi), %edi
+	jns	L(top)
+
+
+	mov	PARAM_SIZE, %esi
+	movl	SAVE_EBP, %ebp
+	movl	$0, %eax
+
+	decl	%esi
+	js	L(even)
+
+	movl	(%ebx), %ecx
+	M4_inst	UNROLL_BYTES(%edx), %ecx
+	movl	%ecx, (%edi)
+L(even):
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	setc	%al
+
+	movl	SAVE_ESI, %esi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/aorsmul_1.asm b/third_party/gmp/mpn/x86/k7/aorsmul_1.asm
new file mode 100644
index 0000000..eec8df6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/aorsmul_1.asm
@@ -0,0 +1,167 @@
+dnl  AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)		 6.5
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6
+C AMD K7			 3.75
+C AMD K8
+
+C TODO
+C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
+C    but lose by 2x for n == 1.
+
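+C For reference, a hedged C version of what mpn_addmul_1 computes
+C (cf. mpn/generic/aorsmul_1.c; umul_ppmm is the double-limb multiply
+C macro from longlong.h, and mpn_submul_1 subtracts instead, returning
+C the borrow):
+C
+C	cy = 0;
+C	for (i = 0; i < n; i++)
+C	  {
+C	    umul_ppmm (hi, lo, up[i], v);
+C	    lo += cy;    hi += (lo < cy);     /* carry from previous limb */
+C	    rp[i] += lo; hi += (rp[i] < lo);  /* add into the destination */
+C	    cy = hi;
+C	  }
+C	return cy;
+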
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',  `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	mov	20(%esp), %edi
+	mov	24(%esp), %esi
+	mov	28(%esp), %eax
+	mov	32(%esp), %ecx
+	mov	%eax, %ebx
+	shr	$2, %eax
+	mov	%eax, 28(%esp)
+	mov	(%esi), %eax
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	lea	-4(%esi), %esi
+	lea	-4(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebx
+	mov	%edx, %ebp
+	cmpl	$0, 28(%esp)
+	jz	L(cj1)
+	mov	8(%esi), %eax
+	jmp	L(1)
+
+L(b2):	mul	%ecx
+	mov	%eax, %ebp
+	mov	4(%esi), %eax
+	mov	%edx, %ebx
+	cmpl	$0, 28(%esp)
+	jne	L(2)
+	jmp	L(cj2)
+
+L(b3):	lea	-12(%esi), %esi
+	lea	-12(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebx
+	mov	%edx, %ebp
+	mov	16(%esi), %eax
+	incl	28(%esp)
+	jmp	L(3)
+
+L(b0):	lea	-8(%esi), %esi
+	lea	-8(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebp
+	mov	12(%esi), %eax
+	mov	%edx, %ebx
+	jmp	L(0)
+
+	ALIGN(16)
+L(top):	lea	16(%edi), %edi
+L(2):	mul	%ecx
+	ADDSUB	%ebp, 0(%edi)
+	mov	$0, %ebp
+	adc	%eax, %ebx
+	mov	8(%esi), %eax
+	adc	%edx, %ebp
+L(1):	mul	%ecx
+	ADDSUB	%ebx, 4(%edi)
+	mov	$0, %ebx
+	adc	%eax, %ebp
+	mov	12(%esi), %eax
+	adc	%edx, %ebx
+L(0):	mul	%ecx
+	ADDSUB	%ebp, 8(%edi)
+	mov	$0, %ebp
+	adc	%eax, %ebx
+	adc	%edx, %ebp
+	mov	16(%esi), %eax
+L(3):	mul	%ecx
+	ADDSUB	%ebx, 12(%edi)
+	adc	%eax, %ebp
+	mov	20(%esi), %eax
+	lea	16(%esi), %esi
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	decl	28(%esp)
+	jnz	L(top)
+
+L(end):	lea	16(%edi), %edi
+L(cj2):	mul	%ecx
+	ADDSUB	%ebp, (%edi)
+	adc	%eax, %ebx
+	adc	$0, %edx
+L(cj1):	ADDSUB	%ebx, 4(%edi)
+	adc	$0, %edx
+	mov	%edx, %eax
+	mov	(%esp), %ebp
+	mov	4(%esp), %ebx
+	mov	8(%esp), %esi
+	mov	12(%esp), %edi
+	add	$16, %esp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/bdiv_q_1.asm b/third_party/gmp/mpn/x86/k7/bdiv_q_1.asm
new file mode 100644
index 0000000..2af7bb9
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/bdiv_q_1.asm
@@ -0,0 +1,245 @@
+dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      9.0
+
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort.  The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	PARAM_SHIFT, %ecx	C shift count
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	movl	PARAM_INVERSE, %eax	C inv
+
+L(common):
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	%edi, VAR_DST_END
+	xorl	%ebx, %ebx
+	jmp	L(entry)
+
+	ALIGN(8)
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or 1
+	C ecx	shift
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi,%ebp,4), %eax
+	movl	(%esi,%ebp,4), %edi
+
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebx, %eax		C apply carry bit
+	setc	%bl
+	movl	VAR_DST_END, %edi
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ebx
+
+L(entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	incl	%ebp
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi), %eax		C src high limb
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebx, %eax		C apply carry bit
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+L(one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	SAVE_EBP, %ebp
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	$-1, %ecx		C shift count
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI
+
+	C If there's usually only one or two trailing zero bits then this
+	C should be faster than bsfl.
+L(strip_twos):
+	incl	%ecx
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	movl	%ebx, SAVE_EBX
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
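+	C Newton's iteration for a 2-adic inverse doubles the number of
+	C correct low bits each pass; from the 8-bit table value, the two
+	C passes above give 16 then 32 bits.  A hedged C equivalent of the
+	C whole computation (everything mod 2^32, as in binvert_limb):
+	C
+	C	inv = binvert_limb_table[(d >> 1) & 0x7F];  /*  8 bits */
+	C	inv = 2*inv - inv*inv*d;                    /* 16 bits */
+	C	inv = 2*inv - inv*inv*d;                    /* 32 bits */
+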
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/dive_1.asm b/third_party/gmp/mpn/x86/k7/dive_1.asm
new file mode 100644
index 0000000..458bd02
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/dive_1.asm
@@ -0,0 +1,208 @@
+dnl  AMD K7 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort.  The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
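+C The method is the usual exact division by a precomputed inverse: with
+C inv = 1/d mod 2^32 and d odd, each quotient limb needs only a low-half
+C multiply, and the carry into the next limb is the high word of q*d.  A
+C hedged C sketch of the loop (umul_hi stands in for a high-word multiply;
+C the twos are already shifted out of src, as in the code below):
+C
+C	c = 0;
+C	for (i = 0; i < size; i++)
+C	  {
+C	    t = src[i] - c;               /* borrow tracked separately */
+C	    q = t * inverse;              /* exact quotient limb, low word */
+C	    dst[i] = q;
+C	    c = umul_hi (q, d) + (t > src[i]);
+C	  }
+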
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	$-1, %ecx		C shift count
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI
+
+	C If there's usually only one or two trailing zero bits then this
+	C should be faster than bsfl.
+L(strip_twos):
+	incl	%ecx
+	shrl	%eax
+	jnc	L(strip_twos)
+
+	movl	%ebx, SAVE_EBX
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	incl	%ebp
+	jz	L(one)
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)
+
+	movl	%edi, VAR_DST_END
+	xorl	%ebx, %ebx
+	jmp	L(entry)
+
+	ALIGN(8)
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or 1
+	C ecx	shift
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi,%ebp,4), %eax
+	movl	(%esi,%ebp,4), %edi
+
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebx, %eax		C apply carry bit
+	setc	%bl
+	movl	VAR_DST_END, %edi
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ebx
+
+L(entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebp,4)
+	incl	%ebp
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi), %eax		C src high limb
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebx, %eax		C apply carry bit
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	SAVE_EBP, %ebp
+	movl	%eax, -4(%edi)
+
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/gcd_11.asm b/third_party/gmp/mpn/x86/k7/gcd_11.asm
new file mode 100644
index 0000000..2648dfd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/gcd_11.asm
@@ -0,0 +1,107 @@
+dnl  x86 mpn_gcd_11 optimised for AMD K7.
+
+dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K7	 5.31
+C AMD K8,K9	 5.33
+C AMD K10	 5.30
+C AMD bd1	 ?
+C AMD bobcat	 7.02
+C Intel P4-2	10.1
+C Intel P4-3/4	10.0
+C Intel P6/13	 5.88
+C Intel core2	 6.26
+C Intel NHM	 6.83
+C Intel SBR	 8.50
+C Intel atom	 8.90
+C VIA nano	 ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+
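+C A hedged C sketch of the loop below (u and v both odd on entry; ctz()
+C is shorthand for a full trailing-zero count, which the table provides
+C MAXSHIFT bits at a time):
+C
+C	while (u != v)
+C	  {
+C	    d = u > v ? u - v : v - u;
+C	    v = u < v ? u : v;            /* v = min(u,v), still odd */
+C	    u = d >> ctz (d);             /* |u-v| with twos stripped */
+C	  }
+C	return v;
+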
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+
+	mov	12(%esp), %eax
+	mov	16(%esp), %edx
+
+	LEAL(	ctz_table, %esi)
+	jmp	L(odd)
+
+	ALIGN(16)			C
+L(top):	cmovc(	%ecx, %eax)		C u = |v - u|
+	cmovc(	%edi, %edx)		C v = min(u,v)
+L(mid):	and	$MASK, %ecx		C
+	movzbl	(%esi,%ecx), %ecx	C
+	jz	L(shift_alot)		C
+	shr	%cl, %eax		C
+L(odd):	mov	%eax, %edi		C
+	mov	%edx, %ecx		C
+	sub	%eax, %ecx		C
+	sub	%edx, %eax		C
+	jnz	L(top)			C
+
+L(end):	mov	%edx, %eax
+	pop	%esi
+	pop	%edi
+	ret
+
+L(shift_alot):
+	shr	$MAXSHIFT, %eax
+	mov	%eax, %ecx
+	jmp	L(mid)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/gmp-mparam.h b/third_party/gmp/mpn/x86/k7/gmp-mparam.h
new file mode 100644
index 0000000..25b22e2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/gmp-mparam.h
@@ -0,0 +1,262 @@
+/* AMD K7 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2083 MHz K7 Barton */
+/* FFT tuning limit = 49,770,069 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        24
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 27.00% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           26
+
+#define DIV_1_VS_MUL_1_PERCENT             182
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                85
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               208
+#define MUL_TOOM8H_THRESHOLD               309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      99
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     102
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     121
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 50
+#define SQR_TOOM3_THRESHOLD                 86
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                270
+#define SQR_TOOM8_THRESHOLD                446
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     28, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     55, 9}, {     31, 8}, \
+    {     63, 7}, {    127, 8}, {     71, 9}, {     39, 6}, \
+    {    319, 9}, {     47, 8}, {     99, 6}, {    399, 9}, \
+    {     55,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     79,10}, {     47, 9}, {     95, 8}, {    191, 4}, \
+    {   3135, 5}, {   1599, 4}, {   3455, 6}, {    959, 8}, \
+    {    247,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    199,10}, {    111,11}, {     63,10}, {    127, 9}, \
+    {    255,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    159, 9}, {    319, 8}, {    639, 7}, {   1279,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    799,10}, \
+    {    207,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511, 8}, {   1023,10}, {    271, 9}, {    543, 8}, \
+    {   1087, 9}, {    575,11}, {    159, 9}, {    639,10}, \
+    {    335, 9}, {    671, 8}, {   1343,10}, {    351, 9}, \
+    {    703,11}, {    191,10}, {    383, 9}, {    799, 8}, \
+    {   1599,11}, {    223,10}, {    447,12}, {    127,11}, \
+    {    255,10}, {    511, 9}, {   1023,10}, {    543, 9}, \
+    {   1087,10}, {    575, 9}, {   1151,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    639, 9}, {   1343,10}, \
+    {    703, 9}, {   1407,12}, {    191,11}, {    383,10}, \
+    {    767, 9}, {   1535,10}, {    799, 9}, {   1599,10}, \
+    {    831, 9}, {   1727, 8}, {   3455,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023, 9}, \
+    {   2047,11}, {    543,10}, {   1087,11}, {    575,10}, \
+    {   1151, 9}, {   2303,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,10}, \
+    {   1343,11}, {    703,10}, {   1407,11}, {    735,10}, \
+    {   1471, 9}, {   2943,12}, {    383,11}, {    767,10}, \
+    {   1535,11}, {    799,10}, {   1599,11}, {    831,10}, \
+    {   1663,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    895,10}, {   1791,11}, {    959,10}, {   1919,13}, \
+    {    255,12}, {    511,11}, {   1023,10}, {   2111,11}, \
+    {   1087,10}, {   2175,12}, {    575,11}, {   1151,10}, \
+    {   2303,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1407,10}, {   2815,11}, \
+    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1663,10}, {   3327,11}, \
+    {   1727,10}, {   3455,12}, {    895,11}, {   1855,12}, \
+    {    959,11}, {   1919,10}, {   3839,14}, {    255,13}, \
+    {    511,12}, {   1023,11}, {   2111,12}, {   1087,11}, \
+    {   2239,12}, {   1151,11}, {   2303,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,11}, {   2687,12}, \
+    {   1407,11}, {   2815,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1663,11}, {   3327,12}, {   1727,11}, \
+    {   3455,13}, {    895,12}, {   1919,11}, {   3839,12}, \
+    {   1983,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2495,13}, {   1279,12}, {   2687,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,12}, {   3455,13}, {   1791,12}, \
+    {   3583,13}, {   1919,12}, {   3967,15}, {    511,14}, \
+    {   1023,13}, {   2047,12}, {   4095,13}, {   2175,12}, \
+    {   4479,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2559,12}, {   5119,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,14}, {   2559,13}, {   5119,14}, {   2815,13}, \
+    {   5887,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 254
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             492  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    492, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     51, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135, 8}, {    271, 9}, \
+    {    143,10}, {     79, 9}, {    167,10}, {     95, 9}, \
+    {    191, 8}, {    383,10}, {    111,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    303,10}, {    159, 9}, {    319, 8}, {    639,11}, \
+    {     95,10}, {    191, 9}, {    383, 8}, {    767, 9}, \
+    {    399,10}, {    207,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671, 8}, {   1343, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767, 8}, {   1535,10}, {    399, 9}, \
+    {    799, 8}, {   1599, 9}, {    863,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    511, 9}, \
+    {   1087,10}, {    575, 9}, {   1215,10}, {    639, 9}, \
+    {   1279,10}, {    671, 9}, {   1343,11}, {    351,10}, \
+    {    703, 9}, {   1407,10}, {    735, 9}, {   1471,12}, \
+    {    191,11}, {    383,10}, {    767, 9}, {   1535,10}, \
+    {    799, 9}, {   1599,11}, {    415,10}, {    831, 9}, \
+    {   1663,10}, {    863, 9}, {   1727, 8}, {   3455,11}, \
+    {    447,10}, {    895,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023, 9}, {   2047,11}, {    543,10}, \
+    {   1087, 9}, {   2175,11}, {    575,10}, {   1151, 9}, \
+    {   2303,11}, {    607,10}, {   1215, 9}, {   2431,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,10}, \
+    {   1343,11}, {    703,10}, {   1407, 9}, {   2815,11}, \
+    {    735,10}, {   1471, 9}, {   2943,12}, {    383,11}, \
+    {    767,10}, {   1599,11}, {    831,10}, {   1663, 9}, \
+    {   3327,10}, {   1727,12}, {    447,11}, {    895,10}, \
+    {   1791,11}, {    959,10}, {   1919,13}, {    255,12}, \
+    {    511,11}, {   1023,10}, {   2111,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,10}, {   2303,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,10}, {   2815,11}, {   1471,10}, \
+    {   2943,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1663,10}, {   3327,11}, {   1727,10}, \
+    {   3455,12}, {    895,11}, {   1791,12}, {    959,11}, \
+    {   1919,10}, {   3839,14}, {    255,13}, {    511,12}, \
+    {   1023,11}, {   2111,12}, {   1087,11}, {   2239,12}, \
+    {   1151,11}, {   2303,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1343,11}, {   2687,12}, {   1407,11}, \
+    {   2815,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1599,11}, {   3199,12}, {   1663,11}, {   3327,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1791,11}, \
+    {   3583,12}, {   1919,11}, {   3839,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2687,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1535,12}, {   3199,13}, \
+    {   1663,12}, {   3455,13}, {   1791,12}, {   3583,13}, \
+    {   1919,12}, {   3967,15}, {    511,14}, {   1023,13}, \
+    {   2047,12}, {   4095,13}, {   2175,12}, {   4351,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4351,14}, {   2303,13}, \
+    {   4991,14}, {   2559,13}, {   5119,14}, {   2815,13}, \
+    {   5887,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 258
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  34
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                 137
+#define SQRLO_SQR_THRESHOLD              10821
+
+#define DC_DIV_QR_THRESHOLD                 45
+#define DC_DIVAPPR_Q_THRESHOLD             206
+#define DC_BDIV_QR_THRESHOLD                39
+#define DC_BDIV_Q_THRESHOLD                144
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               202
+#define INV_APPR_THRESHOLD                 206
+
+#define BINV_NEWTON_THRESHOLD              224
+#define REDC_1_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1387
+#define MUPI_DIV_QR_THRESHOLD               82
+#define MU_BDIV_QR_THRESHOLD              1308
+#define MU_BDIV_Q_THRESHOLD               1387
+
+#define POWM_SEC_TABLE  1,16,102,428,1221
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               254
+#define SET_STR_PRECOMPUTE_THRESHOLD       890
+
+#define FAC_DSC_THRESHOLD                  206
+#define FAC_ODD_THRESHOLD                   29
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 3.84% faster than 4 */
+#define HGCD_THRESHOLD                     123
+#define HGCD_APPR_THRESHOLD                151
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   435
+#define GCDEXT_DC_THRESHOLD                318
+#define JACOBI_BASE_METHOD                   4  /* 8.04% faster than 3 */
+
+/* Tuneup completed successfully, took 175382 seconds */
diff --git a/third_party/gmp/mpn/x86/k7/invert_limb.asm b/third_party/gmp/mpn/x86/k7/invert_limb.asm
new file mode 100644
index 0000000..31a867e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/invert_limb.asm
@@ -0,0 +1,194 @@
+dnl  x86 mpn_invert_limb
+
+dnl  Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2009, 2011, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles (approx)	div
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 ?
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C AMD K6			 ?
+C AMD K7			41		53
+C AMD K8			 ?
+
+C TODO
+C  * These c/l numbers are for a non-PIC build.  Consider falling back to using
+C    the 'div' instruction for PIC builds.
+C  * Perhaps use this file--or at least the algorithm--for more machines than k7.
+
+C Register usage:
+C   Input D in %edi
+C   Current approximation is in %eax and/or %ecx
+C   %ebx and %edx are temporaries
+C   %esi and %ebp are unused
+
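+C As a rough reference for what this routine returns (a C sketch, not
+C GMP's implementation), the result is floor((B^2-1)/d) - B for a
+C normalized divisor d (high bit set), with B = 2^32; the table lookup
+C and Newton steps v0 through v3 below converge to this same value:
+C
+C	/* reference only; assumes d >= 2^31 */
+C	uint32_t invert_limb_ref (uint32_t d)
+C	{
+C	  /* B*(B-d) - 1  ==  ((B-1-d) << 32) | (B-1) */
+C	  return (uint32_t) ((((uint64_t) ~d << 32) | 0xFFFFFFFF) / d);
+C	}
+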
+defframe(PARAM_DIVISOR,4)
+
+ASM_START()
+
+C Make approx_tab global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+	deflit(`approx_tab', MPN(invert_limb_tab))
+	GLOBL	approx_tab')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_invert_limb)
+deflit(`FRAME', 0)
+	mov	PARAM_DIVISOR, %eax
+	C Avoid push/pop on k7.
+	sub	$8, %esp	FRAME_subl_esp(8)
+	mov	%ebx, (%esp)
+	mov	%edi, 4(%esp)
+
+	mov	%eax, %edi
+	shr	$22, %eax
+ifdef(`PIC',`
+	LEAL(	approx_tab, %ebx)
+	movzwl	-1024(%ebx, %eax, 2), %eax
+',`
+	movzwl	-1024+approx_tab(%eax, %eax), %eax	C %eax = v0
+')
+
+	C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
+	mov	%eax, %ecx
+	imul	%eax, %eax
+	mov	%edi, %ebx
+	shr	$11, %ebx
+	inc	%ebx
+	mul	%ebx
+	mov	%edi, %ebx				C Prepare
+	shr	%ebx
+	sbb	%eax, %eax
+	sub	%eax, %ebx				C %ebx = d_31, %eax = mask
+	shl	$4, %ecx
+	dec	%ecx
+	sub	%edx, %ecx				C %ecx = v1
+
+	C v2 = (v1 << 15) + ((v1 * (2^48 - v1 * d_31 + ((v1 >> 1) & mask))) >> 33)
+	imul	%ecx, %ebx
+	and	%ecx, %eax
+	shr	%eax
+	sub	%ebx, %eax
+	mul	%ecx
+	mov	%edi, %eax				C Prepare for next mul
+	shl	$15, %ecx
+	shr	%edx
+	add	%edx, %ecx				C %ecx = v2
+
+	mul	%ecx
+	add	%edi, %eax
+	mov	%ecx, %eax
+	adc	%edi, %edx
+	sub	%edx, %eax				C %eax = v3
+
+	mov	(%esp), %ebx
+	mov	4(%esp), %edi
+	add	$8, %esp
+
+	ret
+
+EPILOGUE()
+
+DEF_OBJECT(approx_tab,2)
+	.value	0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
+	.value	0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
+	.value	0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
+	.value	0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
+	.value	0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
+	.value	0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
+	.value	0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
+	.value	0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
+	.value	0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
+	.value	0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
+	.value	0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
+	.value	0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
+	.value	0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
+	.value	0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
+	.value	0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
+	.value	0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
+	.value	0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
+	.value	0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
+	.value	0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
+	.value	0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
+	.value	0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
+	.value	0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
+	.value	0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
+	.value	0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
+	.value	0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
+	.value	0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
+	.value	0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
+	.value	0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
+	.value	0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
+	.value	0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
+	.value	0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
+	.value	0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
+	.value	0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
+	.value	0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
+	.value	0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
+	.value	0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
+	.value	0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
+	.value	0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
+	.value	0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
+	.value	0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
+	.value	0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
+	.value	0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
+	.value	0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
+	.value	0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
+	.value	0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
+	.value	0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
+	.value	0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
+	.value	0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
+	.value	0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
+	.value	0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
+	.value	0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
+	.value	0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
+	.value	0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
+	.value	0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
+	.value	0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
+	.value	0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
+	.value	0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
+	.value	0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
+	.value	0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
+	.value	0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
+	.value	0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
+	.value	0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
+	.value	0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
+	.value	0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
+END_OBJECT(approx_tab)
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/com.asm b/third_party/gmp/mpn/x86/k7/mmx/com.asm
new file mode 100644
index 0000000..a258c22
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/com.asm
@@ -0,0 +1,125 @@
+dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.
+
+dnl  Copyright 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.0 cycles/limb
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop form below is necessary for the claimed speed.  It needs to be
+C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
+C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
+C the movq's and achieve the necessary size.
+C
+C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
+C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
+C (offset by the size, as per the loop addressing) and one high limb
+C processed separately to get alignment.
+C
+C The padding for the nails case is unattractive, but shouldn't cost any
+C cycles.  Explicit .byte's guarantee the desired instructions, at a point
+C where we're probably stalled waiting for loads anyway.
+C
+C Enhancements:
+C
+C The combined load/pxor/store sequence could be unrolled to approach
+C 0.5 c/l if desired.
+
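+C For reference, the operation performed is simply the following (a C
+C sketch, not the code built here):
+C
+C	void mpn_com_ref (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C	{
+C	  mp_size_t i;
+C	  for (i = 0; i < size; i++)
+C	    dst[i] = (~src[i]) & GMP_NUMB_MASK;
+C	}
+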
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_DST, %edx
+	movl	PARAM_SIZE, %ecx
+	pcmpeqd	%mm7, %mm7
+
+	leal	(%edx,%ecx,4), %eax
+	andl	$4, %eax
+ifelse(GMP_NAIL_BITS,0,,
+`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK
+
+	movl	PARAM_SRC, %eax
+	movd	-4(%eax,%ecx,4), %mm0		C src high limb
+
+ifelse(GMP_NAIL_BITS,0,,
+`	C padding for alignment below
+	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
+	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
+')
+
+	jz	L(aligned)
+
+	pxor	%mm7, %mm0
+	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
+	decl	%ecx
+	jz	L(done)
+L(aligned):
+
+	addl	$4, %eax
+	addl	$4, %edx
+	decl	%ecx
+	jz	L(one)
+
+	C offset 0x30 for no nails, or 0x40 for nails
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter
+	C edx	dst
+
+	subl	$2, %ecx
+	movq	(%eax,%ecx,4), %mm0
+	pxor	%mm7, %mm0
+	movq	%mm0, (%edx,%ecx,4)
+	jg	L(top)
+
+	jnz	L(done)				C if size even
+
+L(one):
+	movd	-4(%eax), %mm0			C src low limb
+	pxor	%mm7, %mm0
+	movd	%mm0, -4(%edx)			C dst low limb
+
+L(done):
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/copyd.asm b/third_party/gmp/mpn/x86/k7/mmx/copyd.asm
new file mode 100644
index 0000000..59ece40
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/copyd.asm
@@ -0,0 +1,144 @@
+dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K7    0.75  1.0   1.0   0.75
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The various comments in mpn/x86/k7/copyi.asm apply here too.
+
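+C For reference, the operation is a high-to-low limb copy (a C sketch,
+C not the code built here); the decrementing order suits an overlapping
+C dst >= src:
+C
+C	void mpn_copyd_ref (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C	{
+C	  mp_size_t i;
+C	  for (i = size - 1; i >= 0; i--)
+C	    dst[i] = src[i];
+C	}
+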
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl  parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+define(SAVE_ESI,`PARAM_SRC')
+
+dnl  minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyd)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	jae	L(unroll)
+
+	orl	%ecx, %ecx
+	jz	L(simple_done)
+
+L(simple):
+	C eax	src
+	C ebx	scratch
+	C ecx	counter
+	C edx	dst
+	C
+	C this loop is 2 cycles/limb
+
+	movl	-4(%eax,%ecx,4), %ebx
+	movl	%ebx, -4(%edx,%ecx,4)
+	decl	%ecx
+	jnz	L(simple)
+
+L(simple_done):
+	movl	SAVE_EBX, %ebx
+	ret
+
+
+L(unroll):
+	movl	%esi, SAVE_ESI
+	leal	(%eax,%ecx,4), %ebx
+	leal	(%edx,%ecx,4), %esi
+
+	andl	%esi, %ebx
+	movl	SAVE_ESI, %esi
+	subl	$4, %ecx		C size-4
+
+	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
+	jz	L(aligned)
+
+	C both src and dst unaligned, process one limb to align them
+	movl	12(%eax,%ecx,4), %ebx
+	movl	%ebx, 12(%edx,%ecx,4)
+	decl	%ecx
+L(aligned):
+
+
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, limbs
+	C edx	dst
+
+	movq	8(%eax,%ecx,4), %mm0
+	movq	(%eax,%ecx,4), %mm1
+	subl	$4, %ecx
+	movq	%mm0, 16+8(%edx,%ecx,4)
+	movq	%mm1, 16(%edx,%ecx,4)
+	jns	L(top)
+
+
+	C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
+
+	testb	$2, %cl
+	jz	L(finish_not_two)
+
+	movq	8(%eax,%ecx,4), %mm0
+	movq	%mm0, 8(%edx,%ecx,4)
+L(finish_not_two):
+
+	testb	$1, %cl
+	jz	L(done)
+
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+
+L(done):
+	movl	SAVE_EBX, %ebx
+	emms
+	ret
+
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/copyi.asm b/third_party/gmp/mpn/x86/k7/mmx/copyi.asm
new file mode 100644
index 0000000..9a28f92
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/copyi.asm
@@ -0,0 +1,157 @@
+dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    alignment dst/src, A=0mod8 N=4mod8
+C       A/A   A/N   N/A   N/N
+C K7    0.75  1.0   1.0   0.75
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Copy src,size to dst,size.
+C
+C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
+C 1.33 c/l.
+C
+C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
+C guide 22007 appendix B), so 0.5 c/l should be possible; however, nothing
+C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
+C one cycle, so perhaps some scheduling is needed to ensure it's a
+C load+store in each cycle, not store+store.
+C
+C If both source and destination are unaligned then one limb is processed at
+C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
+C used unaligned it would be 1.5 c/l.
+
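+C For reference, the operation is a plain low-to-high limb copy (a C
+C sketch, not the code built here); the incrementing order suits an
+C overlapping dst <= src:
+C
+C	void mpn_copyi_ref (mp_ptr dst, mp_srcptr src, mp_size_t size)
+C	{
+C	  mp_size_t i;
+C	  for (i = 0; i < size; i++)
+C	    dst[i] = src[i];
+C	}
+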
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  parameter space reused
+define(SAVE_EBX,`PARAM_SIZE')
+
+dnl  minimum 5 since the unrolled code can't handle less than 5
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	jae	L(unroll)
+
+	orl	%ecx, %ecx
+	jz	L(simple_done)
+
+L(simple):
+	C eax	src, incrementing
+	C ebx	scratch
+	C ecx	counter
+	C edx	dst, incrementing
+	C
+	C this loop is 2 cycles/limb
+
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	decl	%ecx
+	leal	4(%eax), %eax
+	leal	4(%edx), %edx
+	jnz	L(simple)
+
+L(simple_done):
+	movl	SAVE_EBX, %ebx
+	ret
+
+
+L(unroll):
+	movl	%eax, %ebx
+	leal	-12(%eax,%ecx,4), %eax	C src end - 12
+	subl	$3, %ecx		C size-3
+
+	andl	%edx, %ebx
+	leal	(%edx,%ecx,4), %edx	C dst end - 12
+	negl	%ecx
+
+	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
+	jz	L(aligned)
+
+	C both src and dst unaligned, process one limb to align them
+	movl	(%eax,%ecx,4), %ebx
+	movl	%ebx, (%edx,%ecx,4)
+	incl	%ecx
+L(aligned):
+
+
+	ALIGN(16)
+L(top):
+	C eax	src end - 12
+	C ebx
+	C ecx	counter, negative, limbs
+	C edx	dst end - 12
+
+	movq	(%eax,%ecx,4), %mm0
+	movq	8(%eax,%ecx,4), %mm1
+	addl	$4, %ecx
+	movq	%mm0, -16(%edx,%ecx,4)
+	movq	%mm1, -16+8(%edx,%ecx,4)
+	ja	L(top)		C jump no carry and not zero
+
+
+	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
+
+	testb	$2, %cl
+	jnz	L(finish_not_two)
+
+	movq	(%eax,%ecx,4), %mm0
+	movq	%mm0, (%edx,%ecx,4)
+L(finish_not_two):
+
+	testb	$1, %cl
+	jnz	L(done)
+
+	movl	8(%eax), %ebx
+	movl	%ebx, 8(%edx)
+
+L(done):
+	movl	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/divrem_1.asm b/third_party/gmp/mpn/x86/k7/mmx/divrem_1.asm
new file mode 100644
index 0000000..cf34328
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/divrem_1.asm
@@ -0,0 +1,832 @@
+dnl  AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
+dnl  division.
+
+dnl  Copyright 1999-2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C The "and"s shown in the paper are done here with "cmov"s.  "m" is written
+C for m', and "d" for d_norm, which won't cause any confusion since it's
+C only the normalized divisor that's of any use in the code.  "b" is written
+C for 2^N, the size of a limb, N being 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer.  If
+C q1==0xFFFFFFFF, then q1+1 would overflow.  We branch to a special case
+C "q1_ff" if this occurs.  Since the true quotient is either q1 or q1+1,
+C if q1==0xFFFFFFFF then that must be the right value.
+C
+C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
+C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1.  This
+C then goes through as normal, finding no addback required.  sbbl costs
+C an extra cycle over what the main loop code does, but it keeps code size
+C and complexity down.
+C
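+C As an illustration of the steps described above, here is a C sketch of
+C one quotient-limb iteration (names invented for exposition, not GMP's
+C internal API; m is the inverse floor((b*(b-d)-1)/d) computed below, d
+C the normalized divisor, and n2:n10 the current numerator with n2 < d):
+C
+C	uint32_t div_step (uint32_t *r, uint32_t n2, uint32_t n10,
+C	                   uint32_t d, uint32_t m)
+C	{
+C	  uint32_t n1 = n10 >> 31;             /* high bit of n10 */
+C	  uint32_t nadj = n10 + (n1 ? d : 0);  /* overflow ignored */
+C	  uint64_t t = (uint64_t) m * (n2 + n1) + nadj;
+C	  uint32_t q1 = n2 + (uint32_t) (t >> 32);
+C	  if (q1 == 0xFFFFFFFF)                /* the q1_ff case */
+C	    {
+C	      *r = n10 + d;                    /* n - q1*d, mod 2^32 */
+C	      return q1;
+C	    }
+C	  uint64_t n = ((uint64_t) n2 << 32) | n10;
+C	  uint64_t rem = n - (uint64_t) (q1 + 1) * d;  /* try q1+1 */
+C	  if (rem > 0xFFFFFFFF)                /* borrow: q is q1, add back d */
+C	    {
+C	      *r = (uint32_t) (rem + d);
+C	      return q1;
+C	    }
+C	  *r = (uint32_t) rem;
+C	  return q1 + 1;
+C	}
+C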
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is used.  The latter
+C is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case.  It doesn't seem worth checking for this though, since it
+C probably occurs infrequently, in particular note that big_base for a
+C decimal mpn_get_str is not normalized in a 32-bit limb.
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The inverse takes about 50 cycles to calculate, but after that the
+dnl  multiply is 17 c/l versus division at 42 c/l.
+dnl
+dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
+dnl  even more so on the fractional part.
+
+deflit(MUL_THRESHOLD, 3)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,    -4)
+defframe(SAVE_ESI,    -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+defframe(VAR_NORM,    -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC,     -28)
+defframe(VAR_DST,     -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_DST, %edx
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
+	movl	%edi, SAVE_EDI
+	xorl	%edi, %edi		C carry
+
+	movl	-4(%esi,%ebx,4), %eax	C src high limb
+	xor	%ecx, %ecx
+
+	C
+
+	C
+
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edi)		C high is carry if high<divisor
+	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+
+	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
+	sbbl	$0, %ebx		C skip one division if high<divisor
+	movl	PARAM_PREINV_SHIFT, %ecx
+
+	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
+	movl	$32, %eax
+
+	movl	%edx, VAR_DST		C &dst[xsize+size]
+
+	shll	%cl, %ebp		C d normalized
+	subl	%ecx, %eax
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7		C rshift
+	movl	PARAM_PREINV_INVERSE, %eax
+	jmp	L(start_preinv)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+	movl	PARAM_CARRY, %edx
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	C offset 0xa1, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	$0, %edx		C initial carry (if can't skip a div)
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+	orl	%ecx, %ecx		C size
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+
+	jz	L(no_skip_div)		C if size==0
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+	xorl	%esi, %esi
+
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
+
+	movl	%esi, (%edi,%ecx,4)	C dst high limb
+	sbbl	$0, %ecx		C size-1 if high<divisor
+	movl	PARAM_SRC, %esi		C reload
+L(no_skip_div):
+
+
+L(start_1c):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	(%ebx,%ecx), %eax	C size+xsize
+	cmpl	$MUL_THRESHOLD, %eax
+	jae	L(mul_by_inverse)
+
+
+C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
+C It'd be possible to write them out without the looping, but no speedup
+C would be expected.
+C
+C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
+C integer part, but curiously not on the fractional part, where %ebp is a
+C (fixed) couple of cycles faster.
+
+	orl	%ecx, %ecx
+	jz	L(divide_no_integer)
+
+L(divide_integer):
+	C eax	scratch (quotient)
+	C ebx	xsize
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	%eax, (%edi,%ecx,4)
+	decl	%ecx
+	jnz	L(divide_integer)
+
+
+L(divide_no_integer):
+	movl	PARAM_DST, %edi
+	orl	%ebx, %ebx
+	jnz	L(divide_fraction)
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+	movl	%edx, %eax
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(divide_fraction):
+	C eax	scratch (quotient)
+	C ebx	counter
+	C ecx
+	C edx	scratch (remainder)
+	C esi
+	C edi	dst
+	C ebp	divisor
+
+	movl	$0, %eax
+
+	divl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	decl	%ebx
+	jnz	L(divide_fraction)
+
+	jmp	L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	bsrl	%ebp, %eax		C 31-l
+
+	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
+	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
+
+	movl	%edi, VAR_DST
+	movl	%ebx, VAR_DST_STOP
+
+	movl	%ecx, %ebx		C size
+	movl	$31, %ecx
+
+	movl	%edx, %edi		C carry
+	movl	$-1, %edx
+
+	C
+
+	xorl	%eax, %ecx		C l
+	incl	%eax			C 32-l
+
+	shll	%cl, %ebp		C d normalized
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7
+
+	movl	$-1, %eax
+	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+	C eax	inverse
+	C ebx	size
+	C ecx	shift
+	C edx
+	C esi	src
+	C edi	carry
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	orl	%ebx, %ebx		C size
+	movl	%eax, VAR_INVERSE
+	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
+
+	jz	L(start_zero)
+	movl	%eax, VAR_SRC
+	cmpl	$1, %ebx
+
+	movl	8(%eax), %esi		C src high limb
+	jz	L(start_one)
+
+L(start_two_or_more):
+	movl	4(%eax), %edx		C src second highest limb
+
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
+
+	cmpl	$2, %ebx
+	je	L(integer_two_left)
+	jmp	L(integer_top)
+
+
+L(start_one):
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shll	%cl, %esi		C n10 = high << l
+	movl	%eax, VAR_SRC
+	jmp	L(integer_one_left)
+
+
+L(start_zero):
+	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+	C skipped a division.
+
+	shll	%cl, %edi		C n2 = carry << l
+	movl	%edi, %eax		C return value for zero_done
+	cmpl	$0, PARAM_XSIZE
+
+	je	L(zero_done)
+	jmp	L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
+C execution.  The instruction scheduling is important, with various
+C apparently equivalent forms running 1 to 5 cycles slower.
+C
+C A lower bound for the time would seem to be 16 cycles, based on the
+C following successive dependencies.
+C
+C		      cycles
+C		n2+n1	1
+C		mul	6
+C		q1+1	1
+C		mul	6
+C		sub	1
+C		addback	1
+C		       ---
+C		       16
+C
+C This chain is what the loop has already, but 16 cycles isn't achieved.
+C K7 has enough decode, and probably enough execute (depending maybe on what
+C a mul actually consumes), but nothing running under 17 has been found.
+C
+C In theory n2+n1 could be done in the sub and addback stages (by
+C calculating both n2 and n2+n1 there), but lack of registers makes this an
+C unlikely proposition.
+C
+C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
+C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
+C chain, and nothing better than 18 cycles has been found when using it.
+C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
+C be an extremely rare event.
+C
+C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
+C if some special data is coming out with this always, the q1_ff special
+C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
+C induce the q1_ff case, for speed measurements or testing.  Note that
+C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
+C
+C The instruction groupings and empty comments show the cycles for a naive
+C in-order view of the code (conveniently ignoring the load latency on
+C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
+C to the extent that out-of-order execution rearranges it.  In this case
+C there's 19 cycles shown, but it executes at 17.
+
+	ALIGN(16)
+L(integer_top):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm0	scratch (src qword)
+	C mm7	rshift for normalization
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+	movl	VAR_SRC, %ecx
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movq	(%ecx), %mm0       C next limb and the one below it
+	subl	$4, %ecx
+
+	movl	%ecx, VAR_SRC
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+	jz	L(q1_ff)
+	movl	VAR_DST, %ecx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psrlq	%mm7, %mm0
+
+	leal	-4(%ecx), %ecx
+
+	C
+
+	subl	%eax, %esi
+	movl	VAR_DST_STOP, %eax
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	movd	%mm0, %esi
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	   C q
+	cmpl	%eax, %ecx
+
+	movl	%ebx, (%ecx)
+	movl	%ecx, VAR_DST
+	jne	L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C to the q1_ff special case.  This makes the code a bit smaller and simpler,
+C and costs only 1 cycle (each).
+
+L(integer_two_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+	movl	PARAM_SRC, %ecx
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movd	(%ecx), %mm0	   C src low limb
+
+	movl	VAR_DST_STOP, %ecx
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	subl	%eax, %esi
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	movd	%mm0, %esi
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	dst
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	movl	VAR_DST_STOP, %ecx
+	cmpl	$0x80000000, %esi  C n1 as 0=c, 1=nc
+	movl	%edi, %eax         C n2
+
+	leal	(%ebp,%esi), %ebx
+	cmovc(	%esi, %ebx)	   C nadj = n10 + (-n1 & d), ignoring overflow
+	sbbl	$-1, %eax          C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	C
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx           C q1 if q1+1 overflowed
+
+	mull	%ebx
+
+	C
+
+	C
+
+	C
+
+	subl	%eax, %esi
+
+	C
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -8(%ecx)
+	subl	$8, %ecx
+
+
+
+L(integer_none):
+	cmpl	$0, PARAM_XSIZE
+	jne	L(fraction_some)
+
+	movl	%edi, %eax
+L(fraction_done):
+	movl	VAR_NORM, %ecx
+L(zero_done):
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	shrl	%cl, %eax
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+	C eax	(divisor)
+	C ebx	(q1+1 == 0)
+	C ecx
+	C edx
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+
+	movl	VAR_DST, %ecx
+	movl	VAR_DST_STOP, %edx
+	subl	$4, %ecx
+
+	psrlq	%mm7, %mm0
+	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
+	movl	%ecx, VAR_DST
+
+	movd	%mm0, %esi		C next n10
+
+	movl	$-1, (%ecx)
+	cmpl	%ecx, %edx
+	jne	L(integer_top)
+
+	jmp	L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C Being the fractional part, the "source" limbs are all zero, meaning
+C n10=0, n1=0, and hence nadj=0, so many instructions can be eliminated.
+C
+C The loop runs at 15 cycles.  The dependent chain is the same as the
+C general case above, but without the n2+n1 stage (due to n1==0), so 15
+C would seem to be the lower bound.
+C
+C A not entirely obvious simplification is that q1+1 never overflows a limb,
+C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
+C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
+C rnd() means rounding down to a multiple of d.
+C
+C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
+C		     = m*d + b*d - m - b
+C		     = floor((b(b-d)-1)/d)*d + b*d - m - b
+C		     = rnd(b(b-d)-1) + b*d - m - b
+C		     = rnd(b(b-d)-1 + b*d) - m - b
+C		     = rnd(b*b-1) - m - b
+C		     <= (b-2)*b
+C
+C Unchanged from the general case is that the final quotient limb q can be
+C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
+C equation 8.4 of the paper which simplifies as follows when n1==0 and
+C n0==0.
+C
+C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
+C
+C As before, the instruction groupings and empty comments show a naive
+C in-order view of the code, which is made nonsense by out-of-order
+C execution.  There's 17 cycles shown, but it executes at 15.
+C
+C Rotating the store q and remainder->n2 instructions up to the top of the
+C loop gets the run time down from 16 to 15.
+
+	ALIGN(16)
+L(fraction_some):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi
+	C edi	carry
+	C ebp	divisor
+
+	movl	PARAM_DST, %esi
+	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
+	movl	%edi, %eax
+
+	subl	$8, %ecx		C &dst[xsize]
+	jmp	L(fraction_entry)
+
+
+	ALIGN(16)
+L(fraction_top):
+	C eax	n2 carry, then scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	dst, decrementing
+	C edx	scratch
+	C esi	dst stop point
+	C edi	(will be n2)
+	C ebp	divisor
+
+	movl	%ebx, (%ecx)	C previous q
+	movl	%eax, %edi	C remainder->n2
+
+L(fraction_entry):
+	mull	VAR_INVERSE	C m*n2
+
+	movl	%ebp, %eax	C d
+	subl	$4, %ecx	C dst
+	leal	1(%edi), %ebx
+
+	C
+
+	C
+
+	C
+
+	C
+
+	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1
+
+	mull	%ebx		C (q1+1)*d
+
+	C
+
+	C
+
+	C
+
+	negl	%eax		C low of n - (q1+1)*d
+
+	C
+
+	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
+	leal	(%ebp,%eax), %edx
+
+	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
+	sbbl	$0, %ebx	C q
+	cmpl	%esi, %ecx
+
+	jne	L(fraction_top)
+
+
+	movl	%ebx, (%ecx)
+	jmp	L(fraction_done)
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/lshift.asm b/third_party/gmp/mpn/x86/k7/mmx/lshift.asm
new file mode 100644
index 0000000..b3383cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/lshift.asm
@@ -0,0 +1,481 @@
+dnl  AMD K7 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl  K7: UNROLL_COUNT cycles/limb
+dnl           4           1.51
+dnl           8           1.26
+dnl          16           1.21
+dnl          32           1.2
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right.  The bits shifted out at the left are
+C the return value.
+C
+C The comments in mpn_rshift apply here too.
+
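+C For reference, a C sketch of the operation (32-bit limbs and
+C 1 <= shift <= 31 assumed; not the code built here):
+C
+C	mp_limb_t mpn_lshift_ref (mp_ptr dst, mp_srcptr src,
+C	                          mp_size_t size, unsigned shift)
+C	{
+C	  mp_limb_t ret = src[size-1] >> (32 - shift);
+C	  mp_size_t i;
+C	  for (i = size - 1; i > 0; i--)
+C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C	  dst[0] = src[0] << shift;
+C	  return ret;
+C	}
+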
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_SRC, %edx
+	subl	$SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+	movl	PARAM_SHIFT, %ecx
+	movl	%edi, SAVE_EDI
+
+	movl	PARAM_DST, %edi
+	decl	%eax
+	jnz	L(more_than_one_limb)
+
+	movl	(%edx), %edx
+
+	shldl(	%cl, %edx, %eax)	C eax was decremented to zero
+
+	shll	%cl, %edx
+
+	movl	%edx, (%edi)
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+	C eax	size-1
+	C ebx
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	movd	PARAM_SHIFT, %mm6
+	movd	(%edx,%eax,4), %mm5	C src high limb
+	cmp	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+	negl	%ecx
+	movd	(%edx), %mm4		C src low limb
+
+	addl	$32, %ecx
+
+	movd	%ecx, %mm7
+
+L(simple_top):
+	C eax	loop counter, limbs
+	C ebx
+	C ecx
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C mm0	scratch
+	C mm4	src low limb
+	C mm5	src high limb
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%edx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	movd	%mm0, 4(%edi,%eax,4)
+	jnz	L(simple_top)
+
+
+	psllq	%mm6, %mm5
+	psllq	%mm6, %mm4
+
+	psrlq	$32, %mm5
+	movd	%mm4, (%edi)		C dst low limb
+
+	movd	%mm5, %eax		C return value
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx	(saved)
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C mm5	src high limb, for return value
+	C mm6	lshift
+
+	movl	%esi, SAVE_ESI
+	movl	%ebx, SAVE_EBX
+	leal	-4(%edx,%eax,4), %edx   C &src[size-2]
+
+	testb	$4, %dl
+	movq	(%edx), %mm1		C src high qword
+
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process high limb (marked xxx) separately to
+	C make it so
+	C
+	C  source    -4(edx,%eax,4)
+	C                  |
+	C  +-------+-------+-------+--
+	C  |  xxx          |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+	C
+	C  dest      -4(edi,%eax,4)
+	C                  |
+	C  +-------+-------+--
+	C  |  xxx  |       |
+	C  +-------+-------+--
+
+	psllq	%mm6, %mm1
+	subl	$4, %edx
+	movl	%eax, PARAM_SIZE	C size-1
+
+	psrlq	$32, %mm1
+	decl	%eax			C size-2 is new size-1
+
+	movd	%mm1, 4(%edi,%eax,4)
+	movq	(%edx), %mm1		C new src high qword
+L(start_src_aligned):
+
+
+	leal	-4(%edi,%eax,4), %edi   C &dst[size-2]
+	psllq	%mm6, %mm5
+
+	testl	$4, %edi
+	psrlq	$32, %mm5		C return value
+
+	jz	L(start_dst_aligned)
+
+
+	C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
+	C shift is 32 bits extra.  High limb of dst (marked xxx) handled
+	C here separately.
+	C
+	C  source       %edx
+	C  +-------+-------+--
+	C  |      mm1      |
+	C  +-------+-------+--
+	C                0mod8   4mod8
+	C
+	C  dest         %edi
+	C  +-------+-------+-------+--
+	C  |  xxx  |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+
+	movq	%mm1, %mm0
+	psllq	%mm6, %mm1
+	addl	$32, %ecx		C shift+32
+
+	psrlq	$32, %mm1
+
+	movd	%mm1, 4(%edi)
+	movq	%mm0, %mm1
+	subl	$4, %edi
+
+	movd	%ecx, %mm6		C new lshift
+L(start_dst_aligned):
+
+	decl	%eax			C size-2, two last limbs handled at end
+	movq	%mm1, %mm2		C copy of src high qword
+	negl	%ecx
+
+	andl	$-2, %eax		C round size down to even
+	addl	$64, %ecx
+
+	movl	%eax, %ebx
+	negl	%eax
+
+	andl	$UNROLL_MASK, %eax
+	decl	%ebx
+
+	shll	%eax
+
+	movd	%ecx, %mm7		C rshift = 64-lshift
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%eax,%eax,4), %esi
+')
+	shrl	$UNROLL_LOG2, %ebx	C loop counter
+
+	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
+	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+	movl	PARAM_SIZE, %eax	C for use at end
+	jmp	*%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%eax,%eax,4), %esi
+	addl	$L(entry)-L(here), %esi
+	addl	(%esp), %esi
+
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(top):
+	C eax	size (for use at end)
+	C ebx	loop counter
+	C ecx	rshift
+	C edx	src
+	C esi	computed jump
+	C edi	dst
+	C ebp
+	C
+	C mm0	scratch
+	C mm1	\ carry (alternating, mm2 first)
+	C mm2	/
+	C mm6	lshift
+	C mm7	rshift
+	C
+	C 10 code bytes/limb
+	C
	C The two chunks differ in whether mm1 or mm2 holds the carry.
+	C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 - 8))
+
+Zdisp(	movq,	disp0,(%edx), %mm0)
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm2, %mm0
+Zdisp(	movq,	%mm0, disp0,(%edi))
+
+
+Zdisp(	movq,	disp1,(%edx), %mm0)
+	psllq	%mm6, %mm1
+
+	movq	%mm0, %mm2
+	psrlq	%mm7, %mm0
+
+	por	%mm1, %mm0
+Zdisp(	movq,	%mm0, disp1,(%edi))
+')
+
+	subl	$UNROLL_BYTES, %edx
+	subl	$UNROLL_BYTES, %edi
+	decl	%ebx
+
+	jns	L(top)
+
+
+
+define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
+
+L(end):
+	testb	$1, %al
+	movl	SAVE_EBX, %ebx
+	psllq	%mm6, %mm2	C wanted left shifted in all cases below
+
+	movd	%mm5, %eax
+
+	movl	SAVE_ESI, %esi
+	jz	L(end_even)
+
+
+L(end_odd):
+
+	C Size odd, destination was aligned.
+	C
+	C                 source        edx+8   edx+4
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C dest                            edi
+	C --+---------------+---------------+-------+
+	C   |   written     |               |       |
+	C --+---------------+---------------+-------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size odd, destination was unaligned.
+	C
+	C                 source        edx+8   edx+4
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C         dest                            edi
+	C         --+---------------+---------------+
+	C           |   written     |               |
+	C         --+---------------+---------------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at (%edi), and in the aligned case
+	C there's an extra limb of dst to be formed from that extra src limb
+	C left shifted.
+
+	movd	disp(4) (%edx), %mm0
+	testb	$32, %cl
+
+	movq	%mm0, %mm1
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+	psllq	%mm6, %mm1
+
+	por	%mm2, %mm0
+
+	movq	%mm0, disp(0) (%edi)
+	jz	L(end_odd_unaligned)
+	movd	%mm1, disp(-4) (%edi)
+L(end_odd_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+L(end_even):
+
+	C Size even, destination was aligned.
+	C
+	C                 source        edx+8
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C dest                            edi
+	C --+---------------+---------------+
+	C   |   written     |               |
+	C --+---------------+---------------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size even, destination was unaligned.
+	C
+	C               source          edx+8
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C         dest                  edi+4
+	C         --+---------------+-------+
+	C           |    written    |       |
+	C         --+---------------+-------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C The movq for the aligned case overwrites the movd for the
+	C unaligned case.
+
+	movq	%mm2, %mm0
+	psrlq	$32, %mm2
+
+	testb	$32, %cl
+	movd	%mm2, disp(4) (%edi)
+
+	jz	L(end_even_unaligned)
+	movq	%mm0, disp(0) (%edi)
+L(end_even_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/popham.asm b/third_party/gmp/mpn/x86/k7/mmx/popham.asm
new file mode 100644
index 0000000..95965b7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/popham.asm
@@ -0,0 +1,213 @@
+dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
+dnl  distance.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			     popcount	     hamdist
+C P3 generic			6.5		7
+C P3 model 9  (Banias)          5.7		6.1
+C P3 model 13 (Dothan)		5.75		6
+C K7				5		6
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here is almost certainly not optimal, but is already a 3x speedup
+C over the generic C code.  The main improvement would be to interleave
+C processing of two qwords in the loop so as to fully exploit the available
+C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
+C
+C The loop is based on the example "Efficient 64-bit population count using
+C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
+C page 158 of rev E (reference in mpn/x86/k7/README).
+
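+C The bit-slicing scheme from that example, sketched in C for a single
+C 64-bit qword (illustration only; hamdist applies the same steps to
+C src[i] ^ src2[i], and the loop below sums the final bytes with psadbw
+C against zero rather than the closing multiply used here):
+C
+C	uint64_t popcount64 (uint64_t x)
+C	{
+C	  x -= (x >> 1) & 0x5555555555555555ULL;      /* bit pairs */
+C	  x = (x & 0x3333333333333333ULL)
+C	      + ((x >> 2) & 0x3333333333333333ULL);   /* nibbles */
+C	  x = (x & 0x0F0F0F0F0F0F0F0FULL)
+C	      + ((x >> 4) & 0x0F0F0F0F0F0F0F0FULL);   /* bytes */
+C	  return (x * 0x0101010101010101ULL) >> 56;   /* sum of bytes */
+C	}
+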
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC2,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+	dnl  non-PIC
+
+	RODATA
+	ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+
+L(rodata_3333333333333333):
+	.long	0x33333333
+	.long	0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+')
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %eax
+	movl	$0x33333333, %edx
+
+	movd	%eax, %mm7
+	movd	%edx, %mm6
+
+	movl	$0x0F0F0F0F, %eax
+
+	punpckldq %mm7, %mm7
+	punpckldq %mm6, %mm6
+
+	movd	%eax, %mm5
+	movd	%edx, %mm4
+
+	punpckldq %mm5, %mm5
+
+',`
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+	pxor	%mm4, %mm4
+
+define(REG_AAAAAAAAAAAAAAAA,%mm7)
+define(REG_3333333333333333,%mm6)
+define(REG_0F0F0F0F0F0F0F0F,%mm5)
+define(REG_0000000000000000,%mm4)
+
+
+	movl	PARAM_SRC, %eax
+HAM(`	movl	PARAM_SRC2, %edx')
+
+	pxor	%mm2, %mm2	C total
+
+	shrl	%ecx
+	jnc	L(top)
+
+	movd	(%eax,%ecx,8), %mm1
+
+HAM(`	movd	(%edx,%ecx,8), %mm0
+	pxor	%mm0, %mm1
+')
+	orl	%ecx, %ecx
+	jmp	L(loaded)
+
+
+	ALIGN(16)
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, qwords, decrementing
+	C edx	[hamdist] src2
+	C
+	C mm0	(scratch)
+	C mm1	(scratch)
+	C mm2	total (low dword)
+	C mm3
+	C mm4	\
+	C mm5	| special constants
+	C mm6	|
+	C mm7	/
+
+	movq	-8(%eax,%ecx,8), %mm1
+
+HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
+	decl	%ecx
+
+L(loaded):
+	movq	%mm1, %mm0
+	pand	REG_AAAAAAAAAAAAAAAA, %mm1
+
+	psrlq	$1, %mm1
+
+	psubd	%mm1, %mm0	C bit pairs
+
+
+	movq	%mm0, %mm1
+	psrlq	$2, %mm0
+
+	pand	REG_3333333333333333, %mm0
+	pand	REG_3333333333333333, %mm1
+
+	paddd	%mm1, %mm0	C nibbles
+
+
+	movq	%mm0, %mm1
+	psrlq	$4, %mm0
+
+	pand	REG_0F0F0F0F0F0F0F0F, %mm0
+	pand	REG_0F0F0F0F0F0F0F0F, %mm1
+
+	paddd	%mm1, %mm0	C bytes
+
+
+	psadbw(	%mm4, %mm0)
+
+	paddd	%mm0, %mm2	C add to total
+	jnz	L(top)
+
+
+	movd	%mm2, %eax
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mmx/rshift.asm b/third_party/gmp/mpn/x86/k7/mmx/rshift.asm
new file mode 100644
index 0000000..345d23a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mmx/rshift.asm
@@ -0,0 +1,480 @@
+dnl  AMD K7 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 1.21 cycles/limb (at 16 limbs/loop).
+
+
+
+dnl  K7: UNROLL_COUNT cycles/limb
+dnl           4           1.51
+dnl           8           1.26
+dnl          16           1.21
+dnl          32           1.2
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left.  The bits shifted out at the right are
+C the return value.
+C
+C This code uses 64-bit MMX operations, which makes it possible to handle
+C two limbs at a time, for a theoretical 1.0 cycles/limb.  Plain integer
+C code, on the other hand, suffers from shrd being a vector path decode and
+C running at 3 cycles back-to-back.
+C
+C Full speed depends on source and destination being aligned, and some hairy
+C setups and finish-ups are done to arrange this for the loop.
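+C
+C For reference, a plain C rendering of the specified behaviour (a
+C sketch under the usual GMP constraint 1 <= shift <= 31; ref_rshift is
+C a hypothetical name, not a GMP entry point):
+C
+C	mp_limb_t
+C	ref_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C		    unsigned shift)
+C	{
+C	  mp_limb_t ret = src[0] << (32 - shift);   /* bits shifted out */
+C	  mp_size_t i;
+C	  for (i = 0; i < size - 1; i++)
+C	    dst[i] = (src[i] >> shift) | (src[i + 1] << (32 - shift));
+C	  dst[size - 1] = src[size - 1] >> shift;
+C	  return ret;
+C	}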
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 10)
+',`
+deflit(UNROLL_THRESHOLD, 10)
+')
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+defframe(SAVE_EDI, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EBX, -12)
+deflit(SAVE_SIZE, 12)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_SRC, %edx
+	subl	$SAVE_SIZE, %esp
+deflit(`FRAME',SAVE_SIZE)
+
+	movl	PARAM_SHIFT, %ecx
+	movl	%edi, SAVE_EDI
+
+	movl	PARAM_DST, %edi
+	decl	%eax
+	jnz	L(more_than_one_limb)
+
+	movl	(%edx), %edx		C src limb
+
+	shrdl(	%cl, %edx, %eax)	C eax was decremented to zero
+
+	shrl	%cl, %edx
+
+	movl	%edx, (%edi)		C dst limb
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(more_than_one_limb):
+	C eax	size-1
+	C ebx
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+
+	movd	PARAM_SHIFT, %mm6	C rshift
+	movd	(%edx), %mm5		C src low limb
+	cmp	$UNROLL_THRESHOLD-1, %eax
+
+	jae	L(unroll)
+	leal	(%edx,%eax,4), %edx	C &src[size-1]
+	leal	-4(%edi,%eax,4), %edi	C &dst[size-2]
+
+	movd	(%edx), %mm4		C src high limb
+	negl	%eax
+
+
+L(simple_top):
+	C eax	loop counter, limbs, negative
+	C ebx
+	C ecx	shift
+	C edx	carry
+	C edx	&src[size-1]
+	C edi	&dst[size-2]
+	C ebp
+	C
+	C mm0	scratch
+	C mm4	src high limb
+	C mm5	src low limb
+	C mm6	shift
+
+	movq	(%edx,%eax,4), %mm0
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+	movd	%mm0, (%edi,%eax,4)
+	jnz	L(simple_top)
+
+
+	psllq	$32, %mm5
+	psrlq	%mm6, %mm4
+
+	psrlq	%mm6, %mm5
+	movd	%mm4, 4(%edi)		C dst high limb
+
+	movd	%mm5, %eax		C return value
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll):
+	C eax	size-1
+	C ebx
+	C ecx	shift
+	C edx	src
+	C esi
+	C edi	dst
+	C ebp
+	C
+	C mm5	src low limb
+	C mm6	rshift
+
+	testb	$4, %dl
+	movl	%esi, SAVE_ESI
+	movl	%ebx, SAVE_EBX
+
+	psllq	$32, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process low limb separately (marked xxx) and
+	C step src and dst by one limb, making src aligned.
+	C
+	C source                  edx
+	C --+-------+-------+-------+
+	C           |          xxx  |
+	C --+-------+-------+-------+
+	C         4mod8   0mod8   4mod8
+	C
+	C         dest            edi
+	C         --+-------+-------+
+	C           |       |  xxx  |
+	C         --+-------+-------+
+
+	movq	(%edx), %mm0		C src low two limbs
+	addl	$4, %edx
+	movl	%eax, PARAM_SIZE	C size-1
+
+	addl	$4, %edi
+	decl	%eax			C size-2 is new size-1
+
+	psrlq	%mm6, %mm0
+	movl	%edi, PARAM_DST		C new dst
+
+	movd	%mm0, -4(%edi)
+L(start_src_aligned):
+
+
+	movq	(%edx), %mm1		C src low two limbs
+	decl	%eax			C size-2, two last limbs handled at end
+	testl	$4, %edi
+
+	psrlq	%mm6, %mm5
+	jz	L(start_dst_aligned)
+
+
+	C dst isn't aligned, add 4 to make it so, and pretend the shift is
+	C 32 bits extra.  Low limb of dst (marked xxx) handled here separately.
+	C
+	C          source          edx
+	C          --+-------+-------+
+	C            |      mm1      |
+	C          --+-------+-------+
+	C                  4mod8   0mod8
+	C
+	C  dest                    edi
+	C  --+-------+-------+-------+
+	C                    |  xxx  |
+	C  --+-------+-------+-------+
+	C          4mod8   0mod8   4mod8
+
+	movq	%mm1, %mm0
+	psrlq	%mm6, %mm1
+	addl	$32, %ecx		C shift+32
+
+	movd	%mm1, (%edi)
+	movq	%mm0, %mm1
+	addl	$4, %edi		C new dst
+
+	movd	%ecx, %mm6
+L(start_dst_aligned):
+
+
+	movq	%mm1, %mm2		C copy of src low two limbs
+	negl	%ecx
+	andl	$-2, %eax		C round size down to even
+
+	movl	%eax, %ebx
+	negl	%eax
+	addl	$64, %ecx
+
+	andl	$UNROLL_MASK, %eax
+	decl	%ebx
+
+	shll	%eax
+
+	movd	%ecx, %mm7		C lshift = 64-rshift
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(entry) (%eax,%eax,4), %esi
+	negl	%eax
+')
+	shrl	$UNROLL_LOG2, %ebx	C loop counter
+
+	leal	ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx
+	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
+	movl	PARAM_SIZE, %eax	C for use at end
+
+	jmp	*%esi
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%eax,%eax,4), %esi
+	addl	$L(entry)-L(here), %esi
+	addl	(%esp), %esi
+	negl	%eax
+
+	ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(64)
+L(top):
+	C eax	size, for use at end
+	C ebx	loop counter
+	C ecx	lshift
+	C edx	src
+	C esi	was computed jump
+	C edi	dst
+	C ebp
+	C
+	C mm0	scratch
+	C mm1	\ carry (alternating)
+	C mm2	/
+	C mm6	rshift
+	C mm7	lshift
+	C
+	C 10 code bytes/limb
+	C
+	C The two chunks differ in whether mm1 or mm2 hold the carry.
+	C The computed jump puts the initial carry in both mm1 and mm2.
+
+L(entry):
+deflit(CHUNK_COUNT, 4)
+forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 8))
+
+Zdisp(	movq,	disp0,(%edx), %mm0)
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	por	%mm2, %mm0
+Zdisp(	movq,	%mm0, disp0,(%edi))
+
+
+Zdisp(	movq,	disp1,(%edx), %mm0)
+	psrlq	%mm6, %mm1
+
+	movq	%mm0, %mm2
+	psllq	%mm7, %mm0
+
+	por	%mm1, %mm0
+Zdisp(	movq,	%mm0, disp1,(%edi))
+')
+
+	addl	$UNROLL_BYTES, %edx
+	addl	$UNROLL_BYTES, %edi
+	decl	%ebx
+
+	jns	L(top)
+
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 8))
+
+	testb	$1, %al
+	psrlq	%mm6, %mm2	C wanted rshifted in all cases below
+	movl	SAVE_ESI, %esi
+
+	movd	%mm5, %eax		C return value
+
+	movl	SAVE_EBX, %ebx
+	jz	L(end_even)
+
+
+	C Size odd, destination was aligned.
+	C
+	C source
+	C       edx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                  edi
+	C +-------+---------------+---------------+--
+	C |       |               |    written    |
+	C +-------+---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size odd, destination was unaligned.
+	C
+	C source
+	C       edx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest          edi
+	C +---------------+---------------+--
+	C |               |    written    |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword to store, and in the aligned case there's
+	C a further extra limb of dst to be formed.
+
+
+	movd	disp0(%edx), %mm0
+	movq	%mm0, %mm1
+
+	psllq	%mm7, %mm0
+	testb	$32, %cl
+
+	por	%mm2, %mm0
+	psrlq	%mm6, %mm1
+
+	movq	%mm0, disp0(%edi)
+	jz	L(finish_odd_unaligned)
+
+	movd	%mm1, disp1(%edi)
+L(finish_odd_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+
+L(end_even):
+
+	C Size even, destination was aligned.
+	C
+	C source
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest          edi
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C Size even, destination was unaligned.
+	C
+	C source
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest  edi
+	C +-------+---------------+--
+	C |       |      mm3      |
+	C +-------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case is the same data as the movq for
+	C the aligned case, it's just a choice between whether one or two
+	C limbs should be written.
+
+
+	testb	$32, %cl
+	movd	%mm2, disp0(%edi)
+
+	jz	L(end_even_unaligned)
+
+	movq	%mm2, disp0(%edi)
+L(end_even_unaligned):
+
+	movl	SAVE_EDI, %edi
+	addl	$SAVE_SIZE, %esp
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mod_1_1.asm b/third_party/gmp/mpn/x86/k7/mod_1_1.asm
new file mode 100644
index 0000000..1bbe6f9
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mod_1_1.asm
@@ -0,0 +1,221 @@
+dnl  x86-32 mpn_mod_1_1p, requiring cmov.
+
+dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 ?
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C AMD K6			 ?
+C AMD K7			 7
+C AMD K8			 ?
+
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx')  C Also shift count
+
+C Stack frame
+C	pre	36(%esp)
+C	b	32(%esp)
+C	n	28(%esp)
+C	ap	24(%esp)
+C	return	20(%esp)
+C	%ebp	16(%esp)
+C	%edi	12(%esp)
+C	%esi	8(%esp)
+C	%ebx	4(%esp)
+C	B2modb	(%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
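+C
+C The key identity is B^2 == B2modb (mod b), with B = 2^32: appending
+C the next input limb to the two-limb residue (r1:r0) gives
+C r1*B^2 + r0*B + ap[i], which folds back to two limbs by replacing B^2
+C with B2modb.  A conceptual sketch using the longlong.h macros,
+C glossing over the two-limb overflow the code catches with sbb/cmov:
+C
+C	while (--i >= 0)	/* limbs high to low */
+C	  {
+C	    umul_ppmm (p1, p0, r1, B2modb);	/* r1 * (B^2 mod b) */
+C	    add_ssaaaa (r1, r0, p1, p0, r0, ap[i]);
+C	  }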
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mod_1_1p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	32(%esp), %ebp		C pre[]
+
+	mov	12(%ebp), %eax		C B2modb
+	push	%eax			C Put it on stack
+
+	mov	n, %edx
+	mov	24(%esp), ap
+
+	lea	(ap, %edx, 4), ap
+	mov	-4(ap), %eax
+	cmp	$3, %edx
+	jnc	L(first)
+	mov	-8(ap), r0
+	jmp	L(reduce_two)
+
+L(first):
+	C First iteration, no r2
+	mull	B2modb
+	mov	-12(ap), r0
+	add	%eax, r0
+	mov	-8(ap), %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+	subl	$3, n
+	lea	-16(ap), ap
+	jz	L(reduce_three)
+
+	mov	B2modb, B2mb
+	sub	b, B2mb
+	lea	(B2mb, r0), t0
+	jmp	L(mid)
+
+	ALIGN(16)
+L(top): C Loopmixed to 7 c/l on k7
+	add	%eax, r0
+	lea	(B2mb, r0), t0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+L(mid):	mull	B2modb
+	and	B2modb, r2
+	add	r0, r2
+	decl	n
+	mov	(ap), r0
+	cmovc(	t0, r2)
+	lea	-4(ap), ap
+	jnz	L(top)
+
+	add	%eax, r0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+
+L(reduce_three):
+	C Eliminate r2
+	and	b, r2
+	sub	r2, %eax
+
+L(reduce_two):
+	mov	pre, %ebp
+	movb	4(%ebp), %cl
+	test	%cl, %cl
+	jz	L(normalized)
+
+	C Unnormalized, use B1modb to reduce to a value < B*b
+	mull	8(%ebp)
+	xor	t0, t0
+	add	%eax, r0
+	adc	%edx, t0
+	mov	t0, %eax
+
+	C Left-shift to normalize
+	shld	%cl, r0, %eax C Always use shld?
+
+	shl	%cl, r0
+	jmp	L(udiv)
+
+L(normalized):
+	mov	%eax, t0
+	sub	b, t0
+	cmovnc(	t0, %eax)
+
+L(udiv):
+	lea	1(%eax), t0
+	mull	(%ebp)
+	mov	b, %ebx		C Needed in register for lea
+	add	r0, %eax
+	adc	t0, %edx
+	imul	%ebx, %edx
+	sub	%edx, r0
+	cmp	r0, %eax
+	lea	(%ebx, r0), %eax
+	cmovnc(	r0, %eax)
+	cmp	%ebx, %eax
+	jnc	L(fix)
+L(ok):	shr	%cl, %eax
+
+	add	$4, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+
+	ret
+L(fix):	sub	%ebx, %eax
+	jmp	L(ok)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+	push	%ebp
+	mov	12(%esp), %ebp
+	push	%esi
+	bsr	%ebp, %ecx
+	push	%ebx
+	xor	$31, %ecx
+	mov	16(%esp), %esi
+	sal	%cl, %ebp
+	mov	%ebp, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebp			C On K7, invert_limb would be a few cycles faster.
+	mov	%eax, (%esi)		C store bi
+	mov	%ecx, 4(%esi)		C store cnt
+	neg	%ebp
+	mov	$1, %edx
+	shld	%cl, %eax, %edx
+	imul	%ebp, %edx
+	shr	%cl, %edx
+	imul	%ebp, %eax
+	mov	%edx, 8(%esi)		C store B1modb
+	mov	%eax, 12(%esi)		C store B2modb
+	pop	%ebx
+	pop	%esi
+	pop	%ebp
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mod_1_4.asm b/third_party/gmp/mpn/x86/k7/mod_1_4.asm
new file mode 100644
index 0000000..bb7597e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mod_1_4.asm
@@ -0,0 +1,260 @@
+dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 6
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	15.5
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C AMD K6			 ?
+C AMD K7			 4.75
+C AMD K8			 ?
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	sub	$28, %esp
+	mov	60(%esp), %edi		C cps[]
+	mov	8(%edi), %eax
+	mov	12(%edi), %edx
+	mov	16(%edi), %ecx
+	mov	20(%edi), %esi
+	mov	24(%edi), %edi
+	mov	%eax, 4(%esp)
+	mov	%edx, 8(%esp)
+	mov	%ecx, 12(%esp)
+	mov	%esi, 16(%esp)
+	mov	%edi, 20(%esp)
+	mov	52(%esp), %eax		C n
+	xor	%edi, %edi
+	mov	48(%esp), %esi		C up
+	lea	-12(%esi,%eax,4), %esi
+	and	$3, %eax
+	je	L(b0)
+	cmp	$2, %eax
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	mov	4(%esi), %eax
+	mull	4(%esp)
+	mov	(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	8(%esp)
+	lea	-12(%esi), %esi
+	jmp	L(m0)
+
+L(b0):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	lea	-16(%esi), %esi
+	jmp	L(m0)
+
+L(b1):	mov	8(%esi), %ebp
+	lea	-4(%esi), %esi
+	jmp	L(m1)
+
+L(b2):	mov	8(%esi), %edi
+	mov	4(%esi), %ebp
+	lea	-8(%esi), %esi
+	jmp	L(m1)
+
+	ALIGN(16)
+L(top):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebx
+	xor	%ecx, %ecx
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	lea	-16(%esi), %esi
+	mov	16(%esp), %eax
+	mul	%ebp
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	20(%esp), %eax
+	mul	%edi
+	mov	%ebx, %ebp
+	mov	%ecx, %edi
+L(m0):	add	%eax, %ebp
+	adc	%edx, %edi
+L(m1):	subl	$4, 52(%esp)
+	ja	L(top)
+
+L(end):	mov	4(%esp), %eax
+	mul	%edi
+	mov	60(%esp), %edi
+	add	%eax, %ebp
+	adc	$0, %edx
+	mov	4(%edi), %ecx
+	mov	%edx, %esi
+	mov	%ebp, %eax
+	sal	%cl, %esi
+	mov	%ecx, %ebx
+	neg	%ecx
+	shr	%cl, %eax
+	or	%esi, %eax
+	lea	1(%eax), %esi
+	mull	(%edi)
+	mov	%ebx, %ecx
+	mov	%eax, %ebx
+	mov	%ebp, %eax
+	mov	56(%esp), %ebp
+	sal	%cl, %eax
+	add	%eax, %ebx
+	adc	%esi, %edx
+	imul	%ebp, %edx
+	sub	%edx, %eax
+	lea	(%eax,%ebp), %edx
+	cmp	%eax, %ebx
+	cmovc(	%edx, %eax)
+	mov	%eax, %edx
+	sub	%ebp, %eax
+	cmovc(	%edx, %eax)
+	add	$28, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
+	mov	24(%esp), %ebx
+	bsr	%ebx, %ecx
+	xor	$31, %ecx
+	sal	%cl, %ebx		C b << cnt
+	mov	%ebx, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebx
+	xor	%edi, %edi
+	sub	%ebx, %edi
+	mov	$1, %esi
+	mov	%eax, (%ebp)		C store bi
+	mov	%ecx, 4(%ebp)		C store cnt
+	shld	%cl, %eax, %esi
+	imul	%edi, %esi
+	mov	%eax, %edi
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 8(%ebp)		C store B1modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 12(%ebp)		C store B2modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 16(%ebp)		C store B3modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 20(%ebp)		C store B4modb
+
+	not	%edx
+	imul	%ebx, %edx
+	add	%edx, %ebx
+	cmp	%edx, %eax
+	cmovnc(	%edx, %ebx)
+
+	shr	%cl, %ebx
+	mov	%ebx, 24(%ebp)		C store B5modb
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mod_34lsub1.asm b/third_party/gmp/mpn/x86/k7/mod_34lsub1.asm
new file mode 100644
index 0000000..ee3ad04
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mod_34lsub1.asm
@@ -0,0 +1,188 @@
+dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C Athlon:     1
+C Hammer:     1
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C The loop form below and the 64 byte code alignment seem necessary for the
+C claimed speed.  This is a bit strange, since normally k7 isn't very
+C sensitive to such things.  Perhaps there has to be 6 instructions in the
+C first 16 bytes for the BTB entry or something.
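+C
+C Because 2^24 == 1 (mod 2^24-1), the limb weights repeat with period 3:
+C 2^32 == 2^8, 2^64 == 2^16, 2^96 == 1.  So the limbs are summed into
+C three accumulators by position mod 3 and combined at the end.  A rough
+C C sketch of that final combination (a0, a1, a2 being the three
+C accumulators; the carry folding done in L(combine) is omitted):
+C
+C	r = (a0 & 0xFFFFFF)       + (a0 >> 24)	      /* weight 1    */
+C	  + ((a1 & 0xFFFF) << 8)  + (a1 >> 16)	      /* weight 2^8  */
+C	  + ((a2 & 0xFF)   << 16) + (a2 >> 8);	      /* weight 2^16 */
+C
+C The result is congruent to src mod 2^24-1, not fully reduced.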
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EDI, `PARAM_SIZE')
+
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jb	L(one)
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+	shrl	$24, %eax		C src[0] low
+
+	andl	$0xFFFFFF, %edx		C src[0] high
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	andl	$0xFFFF, %ecx
+	shrl	$16, %edx		C src[1] high
+	addl	%edx, %eax
+
+	shll	$8, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+
+	pushl	%ebx	FRAME_pushl()
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+
+	movl	%edi, SAVE_EDI
+	pushl	%esi	FRAME_pushl()
+	xorl	%esi, %esi		C and clear carry flag
+
+
+	C code offset 0x40 at this point
+L(top):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx	counter, limbs
+	C edx	src
+	C esi	acc 2mod3
+	C edi
+
+	leal	24(%edx), %edx
+	leal	-2(%ecx), %ecx
+	adcl	-24(%edx), %eax
+	adcl	-20(%edx), %ebx
+	adcl	-16(%edx), %esi
+
+	decl	%ecx
+	jng	L(done_loop)
+
+	leal	-2(%ecx), %ecx
+	adcl	-12(%edx), %eax
+	adcl	-8(%edx), %ebx
+	adcl	-4(%edx), %esi
+
+	decl	%ecx
+	jg	L(top)
+
+
+	leal	12(%edx), %edx
+
+
+L(done_loop):
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	incl	%ecx
+	movl	$0xFFFFFFFF, %edi
+	js	L(combine)
+
+	adcl	-12(%edx), %eax
+	decl	%ecx
+	movl	$0xFFFFFF00, %edi
+	js	L(combine)
+
+	adcl	-8(%edx), %ebx
+	movl	$0xFFFF0000, %edi
+
+
+L(combine):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx
+	C edx
+	C esi	acc 2mod3
+	C edi	mask
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%eax, %edx		C 0mod3
+	shrl	$24, %eax		C 0mod3 high
+
+	andl	%edi, %ecx		C carry masked
+	andl	$0x00FFFFFF, %edx	C 0mod3 low
+	movl	%ebx, %edi		C 1mod3
+
+	subl	%ecx, %eax		C apply carry
+	shrl	$16, %ebx		C 1mod3 high
+	andl	$0xFFFF, %edi
+
+	addl	%edx, %eax		C apply 0mod3 low
+	movl	%esi, %edx		C 2mod3
+	shll	$8, %edi		C 1mod3 low
+
+	addl	%ebx, %eax		C apply 1mod3 high
+	shrl	$8, %esi		C 2mod3 high
+	movzbl	%dl, %edx		C 2mod3 low
+
+	addl	%edi, %eax		C apply 1mod3 low
+	shll	$16, %edx		C 2mod3 low
+
+	addl	%esi, %eax		C apply 2mod3 high
+	popl	%esi	FRAME_popl()
+
+	movl	SAVE_EDI, %edi
+	addl	%edx, %eax		C apply 2mod3 low
+	popl	%ebx	FRAME_popl()
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/mode1o.asm b/third_party/gmp/mpn/x86/k7/mode1o.asm
new file mode 100644
index 0000000..2394033
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mode1o.asm
@@ -0,0 +1,181 @@
+dnl  AMD K7 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      7.0
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C With the loop running at just 11 cycles it doesn't seem worth bothering to
+C check for high<divisor to save one step.
+C
+C Using a divl for size==1 measures slower than the modexact method, which
+C is not too surprising since for the latter it's only about 24 cycles to
+C calculate the modular inverse.
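+C
+C In outline, with inv = d^-1 mod 2^32 built from binvert_limb_table by
+C two Newton steps (inv = 2*inv - inv*inv*d, as below), each limb is one
+C step of exact division (cbit, climb and q are hypothetical names):
+C
+C	s = src[i] - cbit - climb;	/* borrows become the next cbit */
+C	q = s * inv;			/* quotient limb, exact mod 2^32 */
+C	umul_ppmm (climb, dummy, q, d);	/* carry limb = high (q * d)    */
+C
+C and the return value is the final climb + cbit.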
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+deflit(STACK_SPACE, 16)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+L(start_1c):
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_DIVISOR, %esi
+
+	movl	%edi, SAVE_EDI
+
+	shrl	%eax			C d/2
+
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edi)
+	movzbl	(%eax,%edi), %edi		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
+')
+
+	xorl	%edx, %edx		C initial extra carry
+	leal	(%edi,%edi), %eax	C 2*inv
+
+	imull	%edi, %edi		C inv*inv
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_SIZE, %ebp
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SRC, %ebx
+
+	imull	%esi, %edi		C inv*inv*d
+
+	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %edi	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	%esi, %eax		C inv*inv*d
+
+	leal	(%ebx,%ebp,4), %ebx	C src end
+	negl	%ebp			C -size
+
+	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	%esi, %eax
+	imull	%edi, %eax
+	cmpl	$1, %eax')
+
+
+C The dependent chain here is
+C
+C                            cycles
+C	subl	%edx, %eax	1
+C	imull	%edi, %eax	4
+C	mull	%esi		6  (high limb)
+C			      ----
+C       total		       11
+C
+C Out of order execution hides the load latency for the source data, so no
+C special scheduling is required.
+
+L(top):
+	C eax	src limb
+	C ebx	src end ptr
+	C ecx	next carry bit, 0 or 1 (or initial carry param)
+	C edx	carry limb, high of last product
+	C esi	divisor
+	C edi	inverse
+	C ebp	counter, limbs, negative
+
+	movl	(%ebx,%ebp,4), %eax
+
+	subl	%ecx, %eax		C apply carry bit
+	movl	$0, %ecx
+
+	setc	%cl			C new carry bit
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ecx
+
+	imull	%edi, %eax
+
+	mull	%esi
+
+	incl	%ebp
+	jnz	L(top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+	leal	(%ecx,%edx), %eax
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/mul_1.asm b/third_party/gmp/mpn/x86/k7/mul_1.asm
new file mode 100644
index 0000000..755cd2e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mul_1.asm
@@ -0,0 +1,237 @@
+dnl  AMD K7 mpn_mul_1.
+
+dnl  Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6
+C AMD K7			 3.25
+C AMD K8
+
+C TODO
+C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
+C    but we might be able to do even better.
+C  * The feed-in code for mul_1c is crude.
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	mov	20(%esp), %edi
+	mov	24(%esp), %esi
+	mov	28(%esp), %ebp
+	mov	32(%esp), %ecx
+	mov	%ebp, %ebx
+	shr	$2, %ebp
+	mov	%ebp, 28(%esp)
+	mov	(%esi), %eax
+	and	$3, %ebx
+	jz	L(c0)
+	cmp	$2, %ebx
+	mov	36(%esp), %ebx
+	jz	L(c2)
+	jg	L(c3)
+
+L(c1):	lea	-4(%edi), %edi
+	mul	%ecx
+	test	%ebp, %ebp
+	jnz	1f
+	add	%ebx, %eax
+	mov	%eax, 4(%edi)
+	mov	%edx, %eax
+	adc	%ebp, %eax
+	jmp	L(rt)
+1:	add	%eax, %ebx
+	mov	$0, %ebp
+	adc	%edx, %ebp
+	mov	4(%esi), %eax
+	jmp	L(1)
+
+L(c2):	lea	4(%esi), %esi
+	mul	%ecx
+	test	%ebp, %ebp
+	mov	%ebx, %ebp
+	jnz	2f
+	add	%eax, %ebp
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	mov	(%esi), %eax
+	jmp	L(cj2)
+2:	add	%eax, %ebp
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	mov	(%esi), %eax
+	jmp	L(2)
+
+L(c3):	lea	8(%esi), %esi
+	lea	-12(%edi), %edi
+	mul	%ecx
+	add	%eax, %ebx
+	mov	$0, %ebp
+	adc	%edx, %ebp
+	mov	-4(%esi), %eax
+	incl	28(%esp)
+	jmp	L(3)
+
+L(c0):	mov	36(%esp), %ebx
+	lea	-4(%esi), %esi
+	lea	-8(%edi), %edi
+	mul	%ecx
+	mov	%ebx, %ebp
+	add	%eax, %ebp
+	mov	$0, %ebx
+	adc	%edx, %ebx
+	mov	8(%esi), %eax
+	jmp	L(0)
+
+EPILOGUE()
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+	add	$-16, %esp
+	mov	%ebp, (%esp)
+	mov	%ebx, 4(%esp)
+	mov	%esi, 8(%esp)
+	mov	%edi, 12(%esp)
+
+	mov	20(%esp), %edi
+	mov	24(%esp), %esi
+	mov	28(%esp), %ebp
+	mov	32(%esp), %ecx
+	mov	%ebp, %ebx
+	shr	$2, %ebp
+	mov	%ebp, 28(%esp)
+	mov	(%esi), %eax
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	lea	-4(%edi), %edi
+	mul	%ecx
+	test	%ebp, %ebp
+	jnz	L(gt1)
+	mov	%eax, 4(%edi)
+	mov	%edx, %eax
+	jmp	L(rt)
+L(gt1):	mov	%eax, %ebx
+	mov	%edx, %ebp
+	mov	4(%esi), %eax
+	jmp	L(1)
+
+L(b2):	lea	4(%esi), %esi
+	mul	%ecx
+	test	%ebp, %ebp
+	mov	%eax, %ebp
+	mov	%edx, %ebx
+	mov	(%esi), %eax
+	jnz	L(2)
+	jmp	L(cj2)
+
+L(b3):	lea	8(%esi), %esi
+	lea	-12(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebx
+	mov	%edx, %ebp
+	mov	-4(%esi), %eax
+	incl	28(%esp)
+	jmp	L(3)
+
+L(b0):	lea	-4(%esi), %esi
+	lea	-8(%edi), %edi
+	mul	%ecx
+	mov	%eax, %ebp
+	mov	%edx, %ebx
+	mov	8(%esi), %eax
+	jmp	L(0)
+
+	ALIGN(16)
+L(top):	mov	$0, %ebx
+	adc	%edx, %ebx
+L(2):	mul	%ecx
+	add	%eax, %ebx
+	mov	%ebp, 0(%edi)
+	mov	4(%esi), %eax
+	mov	$0, %ebp
+	adc	%edx, %ebp
+L(1):	mul	%ecx
+	add	%eax, %ebp
+	mov	8(%esi), %eax
+	mov	%ebx, 4(%edi)
+	mov	$0, %ebx
+	adc	%edx, %ebx
+L(0):	mov	%ebp, 8(%edi)
+	mul	%ecx
+	add	%eax, %ebx
+	mov	12(%esi), %eax
+	lea	16(%esi), %esi
+	mov	$0, %ebp
+	adc	%edx, %ebp
+L(3):	mov	%ebx, 12(%edi)
+	mul	%ecx
+	lea	16(%edi), %edi
+	add	%eax, %ebp
+	decl	28(%esp)
+	mov	0(%esi), %eax
+	jnz	L(top)
+
+L(end):	mov	$0, %ebx
+	adc	%edx, %ebx
+L(cj2):	mul	%ecx
+	add	%eax, %ebx
+	mov	%ebp, (%edi)
+L(cj1):	mov	%ebx, 4(%edi)
+	adc	$0, %edx
+	mov	%edx, %eax
+
+L(rt):	mov	(%esp), %ebp
+	mov	4(%esp), %ebx
+	mov	8(%esp), %esi
+	mov	12(%esp), %edi
+	add	$16, %esp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k7/mul_basecase.asm b/third_party/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 0000000..4dfb500
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,602 @@
+dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
+C     limbs/loop unrolling).
+
+
+
+dnl  K7 UNROLL_COUNT cycles/product (at around 20x20)
+dnl           8           4.67
+dnl          16           4.59
+dnl          32           4.42
+dnl  Maximum possible with the current code is 32.
+dnl
+dnl  At 32 the typical 13-26 limb sizes from the Karatsuba code will get
+dnl  done with a straight run through a block of code, no inner loop.  Using
+dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.  The saving is 15-25% on typical sizes coming from
+C the Karatsuba multiply code.
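+C
+C In outline the computation matches the generic routine (mpn_mul_1 and
+C mpn_addmul_1 are the real mpn primitives; this sketch ignores the
+C inlining and unrolling done below):
+C
+C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C	for (i = 1; i < ysize; i++)
+C	  wp[xsize + i] = mpn_addmul_1 (wp + i, xp, xsize, yp[i]);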
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+	movl	(%eax), %eax	C yp low limb
+
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	mull	(%edx)
+
+	movl	PARAM_WP, %ecx
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+	decl	PARAM_YSIZE
+	pushl	%ebx		defframe_pushl(`SAVE_EBX')
+	movl	%eax, %ecx	C yp low limb
+
+	movl	PARAM_WP, %ebx
+	pushl	%esi		defframe_pushl(`SAVE_ESI')
+	movl	%edx, %esi	C xp
+
+	movl	(%edx), %eax	C xp low limb
+	jnz	L(two_by_two)
+
+
+	C two limbs by one limb
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+	movl	%edx, %esi	C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+
+	movl	%esi, 4(%ebx)
+	movl	SAVE_ESI, %esi
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp low limb
+	C ebx	wp
+	C ecx	yp low limb
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+
+dnl  FRAME carries on from previous
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	push	%edi		defframe_pushl(`SAVE_EDI')
+	movl	%edx, %edi	C carry, for wp[1]
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+	movl	4(%ecx), %ecx	C yp[1]
+	movl	%edi, 4(%ebx)
+
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+
+	adcl	$0, %edx
+	movl	(%esi), %eax	C xp[0]
+
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	adcl	%edx, %edi
+	movl	%edi, 8(%ebx)
+
+	adcl	$0, %esi
+	movl	SAVE_EDI, %edi
+	movl	%esi, 12(%ebx)
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline.  Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 many times).  A call to the
+C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+
+dnl  FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+	subl	$16, %esp
+deflit(`FRAME',16)
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_WP, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	%ebp, SAVE_EBP
+	movl	%eax, %ebp
+
+	movl	%esi, SAVE_ESI
+	xorl	%ebx, %ebx
+	leal	(%edx,%ecx,4), %esi	C xp end
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	PARAM_XSIZE, %ecx
+
+	movl	%ebx, (%edi)		C final carry
+	decl	%edx
+
+	jnz	L(ysize_more_than_one)
+
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_EBP, %ebp
+	movl	SAVE_ESI, %esi
+	addl	$FRAME, %esp
+
+	ret
+
+
+L(ysize_more_than_one):
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+	C simple addmul looping
+	C
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	leal	4(%eax,%edx,4), %ebp	C yp end
+	negl	%ecx
+	negl	%edx
+
+	movl	(%esi,%ecx,4), %eax	C xp low limb
+	movl	%edx, PARAM_YSIZE	C -(ysize-1)
+	incl	%ecx
+
+	xorl	%ebx, %ebx		C initial carry
+	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
+	movl	%ebp, PARAM_YP
+
+	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
+	jmp	L(simple_outer_entry)
+
+
+	C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	PARAM_YP, %edx
+	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
+	xorl	%ebx, %ebx		C carry
+
+	movl	%ebp, PARAM_YSIZE
+	addl	$4, %edi		C next position in wp
+
+	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
+	movl	-4(%esi,%ecx,4), %eax	C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+	C eax	xp limb
+	C ebx	carry limb
+	C ecx	loop counter (negative)
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%eax, %ebx
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %edx
+
+	incl	%ecx
+	movl	%edx, %ebx
+	jnz	L(simple_inner)
+
+
+	mull	%ebp
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	addl	%ebx, (%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, 4(%edi)
+	jnz	L(simple_outer_top)
+
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
+C to give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
+
+defframe(VAR_COUNTER,  -20)
+defframe(VAR_ADJUST,   -24)
+defframe(VAR_JMP,      -28)
+defframe(VAR_XP_LOW,   -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	movl	PARAM_XP, %esi
+	movl	4(%eax), %ebp		C multiplier (yp second limb)
+	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
+
+	movl	PARAM_WP, %edi
+	movl	%eax, PARAM_YP
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
+	decl	%ecx				C xsize-1
+
+	movl	(%esi), %eax		C xp low limb
+	andl	$-UNROLL_MASK-1, %ebx
+	negl	%ecx
+
+	subl	$VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+	negl	%ebx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ebx, VAR_ADJUST
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+	sarl	$UNROLL_LOG2, %ebx
+
+	C 17 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+	negl	%edx
+
+	movl	%eax, VAR_XP_LOW
+	movl	%ecx, VAR_JMP
+	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
+	leal	4(%esi,%edx,4), %esi	C  and start at second limb
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%edx,1), %ecx
+	addl	$L(unroll_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	VAR_ADJUST, %ebx
+	movl	PARAM_YP, %edx
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
+
+	leal	4(%edi,%ebx,4), %edi
+	leal	(%esi,%ebx,4), %esi
+	sarl	$UNROLL_LOG2, %ebx
+
+	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
+	movl	VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	testb	$1, %cl		C and clear carry bit
+	movl	%ebx, VAR_COUNTER
+	movl	$0, %ebx
+
+	movl	$0, %ecx
+	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
+	cmovnz(	%eax, %ebx)
+
+	C Extra fetch of VAR_JMP is bad, but registers are tight
+	jmp	*VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_top):
+	C eax	xp limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	xp+8
+	C edi	wp
+	C ebp	yp multiplier limb
+	C
+	C VAR_COUNTER  loop counter, negative
+	C
+	C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	adcl	%edx, %ebx
+
+	mull	%ebp
+
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	movl	$0, %ecx
+
+	adcl	%eax, %ebx
+
+
+	movl	disp1(%esi), %eax
+	adcl	%edx, %ecx
+
+	mull	%ebp
+
+	addl	%ebx, disp1(%edi)
+	movl	$0, %ebx
+
+	adcl	%eax, %ecx
+')
+
+
+	incl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+	leal	UNROLL_BYTES(%edi), %edi
+
+	jnz	L(unroll_top)
+
+
+	C eax
+	C ebx	zero
+	C ecx	low
+	C edx	high
+	C esi
+	C edi	wp, pointing at second last limb
+	C ebp
+	C
+	C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+	movl	PARAM_YSIZE, %ebp
+	adcl	$0, %edx
+	addl	%ecx, disp0(%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, disp1(%edi)
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/sqr_basecase.asm b/third_party/gmp/mpn/x86/k7/sqr_basecase.asm
new file mode 100644
index 0000000..7b6a97e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/sqr_basecase.asm
@@ -0,0 +1,635 @@
+dnl  AMD K7 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 2.3 cycles/crossproduct, or 4.55 cycles/triangular product
+C     (measured on the speed difference between 25 and 50 limbs, which is
+C     roughly the Karatsuba recursing range).
+
+
+dnl  These are the same as mpn/x86/k6/sqr_basecase.asm, see that code for
+dnl  some comments.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C With a SQR_TOOM2_THRESHOLD around 50 this code is about 1500 bytes,
+C which is quite a bit, but is considered good value since squares big
+C enough to use most of the code will be spending quite a few cycles in it.
+
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+	cmpl	$2, %ecx
+
+	movl	PARAM_DST, %edx
+	je	L(two_limbs)
+	ja	L(three_or_more)
+
+
+C------------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ecx	size
+	C edx	dst
+
+	movl	(%eax), %eax
+	movl	%edx, %ecx
+
+	mull	%eax
+
+	movl	%edx, 4(%ecx)
+	movl	%eax, (%ecx)
+	ret
+
+
+C------------------------------------------------------------------------------
+C
+C Using the read/modify/write "add"s seems to be faster than saving and
+C restoring registers.  Perhaps the loads for the first set hide under the
+C mul latency and the second gets store-to-load forwarding.
+
+	ALIGN(16)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+deflit(`FRAME',0)
+
+	pushl	%ebx		FRAME_pushl()
+	movl	%eax, %ebx	C src
+	movl	(%eax), %eax
+
+	movl	%edx, %ecx	C dst
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	4(%ebx), %eax
+
+	movl	%edx, 4(%ecx)	C dst[1]
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, 8(%ecx)	C dst[2]
+	movl	(%ebx), %eax
+
+	movl	%edx, 12(%ecx)	C dst[3]
+
+	mull	4(%ebx)		C src[0]*src[1]
+
+	popl	%ebx
+
+	addl	%eax, 4(%ecx)
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+	ASSERT(nc)
+
+	addl	%eax, 4(%ecx)
+	adcl	%edx, 8(%ecx)
+	adcl	$0, 12(%ecx)
+	ASSERT(nc)
+
+	ret
+
+
+C------------------------------------------------------------------------------
+defframe(SAVE_EBX,  -4)
+defframe(SAVE_ESI,  -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+L(three_or_more):
+	subl	$STACK_SPACE, %esp
+	cmpl	$4, %ecx
+	jae	L(four_or_more)
+deflit(`FRAME',STACK_SPACE)
+
+
+C------------------------------------------------------------------------------
+C Three limbs
+C
+C Writing out the loads and stores separately at the end of this code comes
+C out about 10 cycles faster than using adcls to memory.
+
+	C eax	src
+	C ecx	size
+	C edx	dst
+
+	movl	%ebx, SAVE_EBX
+	movl	%eax, %ebx	C src
+	movl	(%eax), %eax
+
+	movl	%edx, %ecx	C dst
+	movl	%esi, SAVE_ESI
+	movl	%edi, SAVE_EDI
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	4(%ebx), %eax
+	movl	%edx, 4(%ecx)
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	8(%ebx), %eax
+	movl	%edx, 12(%ecx)
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	(%ebx), %eax
+	movl	%edx, 20(%ecx)
+
+	mull	4(%ebx)		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	(%ebx), %eax
+	movl	%edx, %edi
+
+	mull	8(%ebx)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%ebp, SAVE_EBP
+	movl	$0, %ebp
+
+	movl	4(%ebx), %eax
+	adcl	%edx, %ebp
+
+	mull	8(%ebx)		C src[1] * src[2]
+
+	xorl	%ebx, %ebx
+	addl	%eax, %ebp
+
+	adcl	$0, %edx
+
+	C eax
+	C ebx	zero, will be dst[5]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%esi, %esi
+
+	adcl	%edi, %edi
+	movl	4(%ecx), %eax
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+
+	adcl	$0, %ebx
+	addl	%eax, %esi
+	movl	8(%ecx), %eax
+
+	adcl	%eax, %edi
+	movl	12(%ecx), %eax
+	movl	%esi, 4(%ecx)
+
+	adcl	%eax, %ebp
+	movl	16(%ecx), %eax
+	movl	%edi, 8(%ecx)
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+
+	adcl	%eax, %edx
+	movl	20(%ecx), %eax
+	movl	%ebp, 12(%ecx)
+
+	adcl	%ebx, %eax
+	ASSERT(nc)
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+
+	movl	%edx, 16(%ecx)
+	movl	%eax, 20(%ecx)
+	addl	$FRAME, %esp
+
+	ret
+
+
+C------------------------------------------------------------------------------
+L(four_or_more):
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+C Further products are added in rather than stored.
+
+	C eax	src
+	C ebx
+	C ecx	size
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+
+defframe(`VAR_COUNTER',-20)
+defframe(`VAR_JMP',    -24)
+deflit(EXTRA_STACK_SPACE, 8)
+
+	movl	%ebx, SAVE_EBX
+	movl	%edi, SAVE_EDI
+	leal	(%edx,%ecx,4), %edi	C &dst[size]
+
+	movl	%esi, SAVE_ESI
+	movl	%ebp, SAVE_EBP
+	leal	(%eax,%ecx,4), %esi	C &src[size]
+
+	movl	(%eax), %ebp		C multiplier
+	movl	$0, %ebx
+	decl	%ecx
+
+	negl	%ecx
+	subl	$EXTRA_STACK_SPACE, %esp
+FRAME_subl_esp(EXTRA_STACK_SPACE)
+
+L(mul_1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul_1)
+
+
+C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two products, which are the bottom right corner of the product
+C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as in mpn_addmul_1, see that routine for
+C some comments.
+C
+C VAR_COUNTER is the outer loop, running from -size+4 to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+C
+C K7 does branch prediction on indirect jumps, which is bad since it's a
+C different target each time.  There seems no way to avoid this.
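+C
+C In outline the whole routine follows the usual basecase squaring shape
+C (a sketch in terms of real mpn calls; the code below inlines and
+C unrolls these, and peels the last two products off as L(corner)):
+C
+C	dst[size] = mpn_mul_1 (dst + 1, src + 1, size - 1, src[0]);
+C	for (n = 1; n < size - 1; n++)
+C	  dst[size + n] = mpn_addmul_1 (dst + 2*n + 1,
+C					src + n + 1, size - 1 - n, src[n]);
+C	dst[2*size - 1] = mpn_lshift (dst + 1, dst + 1, 2*size - 2, 1);
+C	/* then add src[i]^2 into dst[2*i], dst[2*i+1] for each i */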
+
+dnl  This value also hard coded in some shifts and adds
+deflit(CODE_BYTES_PER_LIMB, 17)
+
+dnl  With the unmodified &src[size] and &dst[size] pointers, the
+dnl  displacements in the unrolled code fit in a byte for UNROLL_COUNT
+dnl  values up to 31, but above that an offset must be added to them.
+
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>31),1,
+eval((UNROLL_COUNT-31)*4),
+0))
+
+dnl  Because the last chunk of code is generated differently, a label placed
+dnl  at the end doesn't work.  Instead calculate the implied end using the
+dnl  start and how many chunks of code there are.
+
+deflit(UNROLL_INNER_END,
+`L(unroll_inner_start)+eval(UNROLL_COUNT*CODE_BYTES_PER_LIMB)')
+
+	C eax
+	C ebx	carry
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	movl	%ebx, (%edi)
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
+ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
+
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+
+
+	C The calculated jump mustn't come out to before the start of the
+	C code available.  This is the limit UNROLL_COUNT puts on the src
+	C operand size, but checked here directly using the jump address.
+	ASSERT(ae,
+	`movl_text_address(L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx')
+
+
+C------------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	high limb to store
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	&src[size], constant
+	C edi	dst ptr, high of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%esi,%edx,4), %ebp	C next multiplier
+	movl	-8+OFFSET(%esi,%edx,4), %eax	C first of multiplicand
+
+	movl	%edx, VAR_COUNTER
+
+	mull	%ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),0,`cmovz($@)',`cmovnz($@)')')
+
+	testb	$1, %cl
+	movl	%edx, %ebx	C high carry
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	cmovX(	%ebx, %ecx)	C high carry reverse
+	cmovX(	%eax, %ebx)	C low carry reverse
+
+	leal	CODE_BYTES_PER_LIMB(%edx), %eax
+	xorl	%edx, %edx
+	leal	4(%edi), %edi
+
+	movl	%eax, VAR_JMP
+
+	jmp	*%eax
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	addl	(%esp), %ecx
+	addl	$UNROLL_INNER_END-eval(2*CODE_BYTES_PER_LIMB)-L(here), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+	C Must be an even address to preserve the significance of the low
+	C bit of the jump address indicating which way around ecx/ebx should
+	C start.
+	ALIGN(2)
+
+L(unroll_inner_start):
+	C eax	next limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+
+forloop(`i', UNROLL_COUNT, 1, `
+	deflit(`disp_src', eval(-i*4 + OFFSET))
+	deflit(`disp_dst', eval(disp_src - 4))
+
+	m4_assert(`disp_src>=-128 && disp_src<128')
+	m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp(	movl,	disp_src,(%esi), %eax)
+	adcl	%edx, %ebx
+
+	mull	%ebp
+
+Zdisp(  addl,	%ecx, disp_dst,(%edi))
+	movl	$0, %ecx
+
+	adcl	%eax, %ebx
+
+',`
+	dnl  this bit comes out last
+Zdisp(  movl,	disp_src,(%esi), %eax)
+	adcl	%edx, %ecx
+
+	mull	%ebp
+
+Zdisp(	addl,	%ebx, disp_dst,(%edi))
+
+ifelse(forloop_last,0,
+`	movl	$0, %ebx')
+
+	adcl	%eax, %ecx
+')
+')
+
+	C eax	next limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+
+	adcl	$0, %edx
+	addl	%ecx, -4+OFFSET(%edi)
+	movl	VAR_JMP, %ecx
+
+	adcl	$0, %edx
+
+	movl	%edx, m4_empty_if_zero(OFFSET) (%edi)
+	movl	VAR_COUNTER, %edx
+
+	incl	%edx
+	jnz	L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+	addl	$OFFSET, %esi
+	addl	$OFFSET, %edi
+')
+
+
+C------------------------------------------------------------------------------
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-5]
+
+	movl	-12(%esi), %ebp
+	movl	-8(%esi), %eax
+	movl	%eax, %ecx
+
+	mull	%ebp
+
+	addl	%eax, -4(%edi)
+	movl	-4(%esi), %eax
+
+	adcl	$0, %edx
+	movl	%edx, %ebx
+	movl	%eax, %esi
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+
+	adcl	$0, %edx
+	addl	%eax, (%edi)
+	movl	%esi, %eax
+
+	adcl	$0, %edx
+	movl	%edx, %ebx
+
+	mull	%ecx
+
+	addl	%ebx, %eax
+	movl	%eax, 4(%edi)
+
+	adcl	$0, %edx
+	movl	%edx, 8(%edi)
+
+
+
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift_start):
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edi
+	xorl	%ecx, %ecx		C clear carry
+
+	leal	(%edi,%eax,8), %edi
+	notl	%eax			C -size-1, preserve carry
+
+	leal	2(%eax), %eax		C -(size-1)
+
+L(lshift):
+	C eax	counter, negative
+	C ebx
+	C ecx
+	C edx
+	C esi
+	C edi	dst, pointing just after last limb
+	C ebp
+
+	rcll	-4(%edi,%eax,8)
+	rcll	(%edi,%eax,8)
+	incl	%eax
+	jnz	L(lshift)
+
+	setc	%al
+
+	movl	PARAM_SRC, %esi
+	movl	%eax, -4(%edi)		C dst most significant limb
+
+	movl	PARAM_SIZE, %ecx
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+
+	mull	%eax
+
+	leal	(%esi,%ecx,4), %esi	C src point just after last limb
+	negl	%ecx
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+	incl	%ecx
+
+L(diag):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, negative
+	C edx	carry
+	C esi	src just after last limb
+	C edi	dst just after last limb
+	C ebp
+
+	movl	(%esi,%ecx,4), %eax
+	movl	%edx, %ebx
+
+	mull	%eax
+
+	addl	%ebx, -4(%edi,%ecx,8)
+	adcl	%eax, (%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	addl	%edx, -4(%edi)		C dst most significant limb
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/k7/sublsh1_n.asm b/third_party/gmp/mpn/x86/k7/sublsh1_n.asm
new file mode 100644
index 0000000..8851683
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k7/sublsh1_n.asm
@@ -0,0 +1,173 @@
+dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
+C inner loop is 2*4-way unrolled, which is the best we can do with the available
+C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
+C cannot feed carry between operations there.
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 6.75
+C AMD K6
+C AMD K7
+C AMD K8
+
+C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
+C processors.  It uses 2*4-way unrolling, for good reasons.
+C
+C Breaking carry recurrency might be a good idea.  We would then need separate
+C registers for the shift carry and add/subtract carry, which in turn would
+C force us to 2*2-way unrolling.
+
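+C In plain C the operation and its return value look like this (an
+C illustrative sketch with the hypothetical name ref_sublsh1_n_ip1, 32-bit
+C limbs assumed, not GMP code):
+C
+C	#include <stdint.h>
+C
+C	uint32_t ref_sublsh1_n_ip1 (uint32_t *rp, const uint32_t *up, long n)
+C	{
+C	  uint32_t shift_cy = 0, borrow = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint32_t s = (up[i] << 1) | shift_cy;  /* doubled limb */
+C	      shift_cy = up[i] >> 31;                /* bit shifted out */
+C	      uint32_t d = rp[i] - s;
+C	      uint32_t b = rp[i] < s;                /* borrow from the sub */
+C	      rp[i] = d - borrow;
+C	      borrow = b | (d < borrow);
+C	    }
+C	  return shift_cy + borrow;                  /* 0, 1 or 2 */
+C	}
+C
+C The loops below do the same eight limbs at a time, passing the two carry
+C chains through the flags and edx.
+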
+defframe(PARAM_SIZE,	12)
+defframe(PARAM_SRC,	 8)
+defframe(PARAM_DST,	 4)
+
+dnl  re-use parameter space
+define(VAR_COUNT,`PARAM_SIZE')
+define(SAVE_EBX,`PARAM_SRC')
+define(SAVE_EBP,`PARAM_DST')
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sublsh1_n_ip1)
+deflit(`FRAME',0)
+
+define(`rp',  `%edi')
+define(`up',  `%esi')
+
+	mov	PARAM_SIZE, %eax	C size
+	push	up			FRAME_pushl()
+	push	rp			FRAME_pushl()
+	xor	%edx, %edx
+	mov	PARAM_SRC, up
+	mov	PARAM_DST, rp
+	mov	%ebx, SAVE_EBX
+	mov	%eax, %ebx
+	shr	$3, %eax
+
+	not	%eax			C count = -(size\8)-1
+	and	$7, %ebx		C size % 8
+	jz	L(exact)
+
+L(oop):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(up), %ecx
+	adc	%ecx, %ecx
+	rcr	%edx			C restore 1st saved carry bit
+	lea	4(up), up
+	sbb	%ecx, (rp)
+	lea	4(rp), rp
+	adc	%edx, %edx		C save a carry bit in edx
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	dec	%ebx
+	jnz	L(oop)
+L(exact):
+	inc	%eax
+	jz	L(end)
+	mov	%eax, VAR_COUNT
+	mov	%ebp, SAVE_EBP
+
+	ALIGN(16)
+L(top):
+ifdef(`CPU_P6',`
+	shr	%edx ')			C restore 2nd saved carry bit
+	mov	(up), %eax
+	adc	%eax, %eax
+	mov	4(up), %ebx
+	adc	%ebx, %ebx
+	mov	8(up), %ecx
+	adc	%ecx, %ecx
+	mov	12(up), %ebp
+	adc	%ebp, %ebp
+
+	rcr	%edx			C restore 1st saved carry bit
+
+	sbb	%eax, (rp)
+	sbb	%ebx, 4(rp)
+	sbb	%ecx, 8(rp)
+	sbb	%ebp, 12(rp)
+
+	mov	16(up), %eax
+	adc	%eax, %eax
+	mov	20(up), %ebx
+	adc	%ebx, %ebx
+	mov	24(up), %ecx
+	adc	%ecx, %ecx
+	mov	28(up), %ebp
+	adc	%ebp, %ebp
+
+	lea	32(up), up
+	adc	%edx, %edx		C save a carry bit in edx
+
+	sbb	%eax, 16(rp)
+	sbb	%ebx, 20(rp)
+	sbb	%ecx, 24(rp)
+	sbb	%ebp, 28(rp)
+
+ifdef(`CPU_P6',`
+	adc	%edx, %edx ')		C save another carry bit in edx
+	incl	VAR_COUNT
+	lea	32(rp), rp
+	jne	L(top)
+
+	mov	SAVE_EBP, %ebp
+L(end):
+	mov	SAVE_EBX, %ebx
+
+ifdef(`CPU_P6',`
+	xor	%eax, %eax
+	shr	$1, %edx
+	adc	%edx, %eax
+',`
+	adc	$0, %edx
+	mov	%edx, %eax
+')
+	pop	rp			FRAME_popl()
+	pop	up			FRAME_popl()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/k8/gmp-mparam.h b/third_party/gmp/mpn/x86/k8/gmp-mparam.h
new file mode 100644
index 0000000..fa71292
--- /dev/null
+++ b/third_party/gmp/mpn/x86/k8/gmp-mparam.h
@@ -0,0 +1,215 @@
+/* x86/k8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011, 2014 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         11
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        12
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     21
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 36.85% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           44
+
+#define DIV_1_VS_MUL_1_PERCENT             251
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                78
+#define MUL_TOOM44_THRESHOLD               136
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      96
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     121
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 46
+#define SQR_TOOM3_THRESHOLD                 81
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                300
+#define SQR_TOOM8_THRESHOLD                430
+
+#define MULMID_TOOM42_THRESHOLD             50
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               22
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     27, 6}, {     15, 5}, {     31, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     23, 6}, {     47, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607,11}, {    319,10}, {    671,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    927,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    895,11}, {   1791,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2431,13}, {   1279,12}, \
+    {   2623,13}, {   1407,12}, {   2943,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4351,13}, {   2431,14}, \
+    {   1279,13}, {   2943,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,15}, {   1023,14}, {   2047,13}, \
+    {   4351,14}, {   2303,13}, {   4991,14}, {   2815,15}, \
+    {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 158
+#define MUL_FFT_THRESHOLD                 7296
+
+#define SQR_FFT_MODF_THRESHOLD             500  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    500, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     32, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287, 8}, {    575,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,10}, {    543,11}, {    287,10}, \
+    {    607, 9}, {   1215,11}, {    319,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    607,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,11}, {   2431,13}, {    639,12}, \
+    {   1471,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2431,14}, {   1279,13}, {   2943,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,15}, {   1023,14}, \
+    {   2047,13}, {   4223,14}, {   2303,13}, {   4863,14}, \
+    {   2815,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 167
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             4
+#define MULLO_DC_THRESHOLD                  29
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                 193
+#define SQRLO_SQR_THRESHOLD              10704
+
+#define DC_DIV_QR_THRESHOLD                 84
+#define DC_DIVAPPR_Q_THRESHOLD             278
+#define DC_BDIV_QR_THRESHOLD                87
+#define DC_BDIV_Q_THRESHOLD                216
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               268
+#define INV_APPR_THRESHOLD                 268
+
+#define BINV_NEWTON_THRESHOLD              276
+#define REDC_1_TO_REDC_N_THRESHOLD          78
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              114
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1466
+
+#define POWM_SEC_TABLE  1,22,102,452,1357
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD      1149
+
+#define FAC_DSC_THRESHOLD                  208
+#define FAC_ODD_THRESHOLD                   48
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    3  /* 4.69% faster than 1 */
+#define HGCD_THRESHOLD                     139
+#define HGCD_APPR_THRESHOLD                174
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   599
+#define GCDEXT_DC_THRESHOLD                419
+#define JACOBI_BASE_METHOD                   1  /* 1.57% faster than 4 */
+
+/* Tuneup completed successfully, took 83851 seconds */
diff --git a/third_party/gmp/mpn/x86/lshift.asm b/third_party/gmp/mpn/x86/lshift.asm
new file mode 100644
index 0000000..6ee6153
--- /dev/null
+++ b/third_party/gmp/mpn/x86/lshift.asm
@@ -0,0 +1,106 @@
+dnl  x86 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54	 7.5
+C P55	 7.0
+C P6	 2.5
+C K6	 4.5
+C K7	 5.0
+C P4	14.5
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+
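+C A plain-C sketch of the same operation (illustrative only, assuming
+C 0 < shift < 32 and 32-bit limbs; ref_lshift is a hypothetical name):
+C
+C	#include <stdint.h>
+C
+C	uint32_t ref_lshift (uint32_t *dst, const uint32_t *src,
+C	                     long n, unsigned shift)
+C	{
+C	  uint32_t high = src[n-1];
+C	  uint32_t retval = high >> (32 - shift);    /* carry limb */
+C	  for (long i = n - 1; i > 0; i--)
+C	    {
+C	      dst[i] = (high << shift) | (src[i-1] >> (32 - shift));
+C	      high = src[i-1];
+C	    }
+C	  dst[0] = high << shift;
+C	  return retval;
+C	}
+C
+C Working from the most significant limb down matches the asm below and
+C keeps overlapping dst >= src (including dst == src) safe.
+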
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+deflit(`FRAME',12)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%edx
+	movl	PARAM_SHIFT,%ecx
+
+	subl	$4,%esi			C adjust src
+
+	movl	(%esi,%edx,4),%ebx	C read most significant limb
+	xorl	%eax,%eax
+	shldl(	%cl, %ebx, %eax)	C compute carry limb
+	decl	%edx
+	jz	L(end)
+	pushl	%eax			C push carry limb onto stack
+	testb	$1,%dl
+	jnz	L(1)			C enter loop in the middle
+	movl	%ebx,%eax
+
+	ALIGN(8)
+L(oop):	movl	(%esi,%edx,4),%ebx	C load next lower limb
+	shldl(	%cl, %ebx, %eax)	C compute result limb
+	movl	%eax,(%edi,%edx,4)	C store it
+	decl	%edx
+L(1):	movl	(%esi,%edx,4),%eax
+	shldl(	%cl, %eax, %ebx)
+	movl	%ebx,(%edi,%edx,4)
+	decl	%edx
+	jnz	L(oop)
+
+	shll	%cl,%eax		C compute least significant limb
+	movl	%eax,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+L(end):	shll	%cl,%ebx		C compute least significant limb
+	movl	%ebx,(%edi)		C store it
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/mmx/sec_tabselect.asm b/third_party/gmp/mpn/x86/mmx/sec_tabselect.asm
new file mode 100644
index 0000000..aae158a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mmx/sec_tabselect.asm
@@ -0,0 +1,163 @@
+dnl  X86 MMX mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			     cycles/limb     cycles/limb
+C			      ali,evn n	     unal,evn n
+C P5
+C P6 model 0-8,10-12
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 1.33		 1.87
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	 2.1		 2.63
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)		 1.7		 2.57
+C Intel Atom			 1.85		 2.7
+C AMD K6
+C AMD K7			 1.33		 1.33
+C AMD K8
+C AMD K10
+
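+C The job is a table read whose memory access pattern is independent of
+C the selected index: every entry is touched, and masks keep only entry
+C `which'.  A plain-C sketch of the semantics (illustrative, hypothetical
+C name; in real constant-time code the compare must also stay branch-free):
+C
+C	#include <stdint.h>
+C
+C	void ref_sec_tabselect (uint32_t *rp, const uint32_t *tab,
+C	                        long n, long nents, long which)
+C	{
+C	  for (long i = 0; i < n; i++)
+C	    rp[i] = 0;
+C	  for (long k = 0; k < nents; k++)
+C	    {
+C	      uint32_t mask = -(uint32_t) (k == which);  /* all ones iff hit */
+C	      for (long i = 0; i < n; i++)
+C	        rp[i] |= tab[k*n + i] & mask;
+C	    }
+C	}
+C
+C The pcmpeqd/pand/por sequence below is the same mask trick, two limbs at
+C a time in MMX registers.
+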
+define(`rp',     `%edi')
+define(`tp',     `%esi')
+define(`n',      `%edx')
+define(`nents',  `%ecx')
+define(`which',  `')
+
+define(`i',      `%ebp')
+define(`j',      `%ebx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+	push	%ebx
+	push	%esi
+	push	%edi
+	push	%ebp
+
+	mov	20(%esp), rp
+	mov	24(%esp), tp
+	mov	28(%esp), n
+	mov	32(%esp), nents
+
+	movd	36(%esp), %mm6
+	punpckldq %mm6, %mm6		C 2 copies of `which'
+
+	mov	$1, %ebx
+	movd	%ebx, %mm7
+	punpckldq %mm7, %mm7		C 2 copies of 1
+
+	mov	n, j
+	add	$-4, j
+	js	L(outer_end)
+
+L(outer_top):
+	mov	nents, i
+	mov	tp, %eax
+	pxor	%mm1, %mm1
+	pxor	%mm4, %mm4
+	pxor	%mm5, %mm5
+	ALIGN(16)
+L(top):	movq	%mm6, %mm0
+	pcmpeqd	%mm1, %mm0
+	paddd	%mm7, %mm1
+	movq	(tp), %mm2
+	movq	8(tp), %mm3
+	pand	%mm0, %mm2
+	pand	%mm0, %mm3
+	por	%mm2, %mm4
+	por	%mm3, %mm5
+	lea	(tp,n,4), tp
+	add	$-1, i
+	jne	L(top)
+
+	movq	%mm4, (rp)
+	movq	%mm5, 8(rp)
+
+	lea	16(%eax), tp
+	lea	16(rp), rp
+	add	$-4, j
+	jns	L(outer_top)
+L(outer_end):
+
+	test	$2, %dl
+	jz	L(b0x)
+
+L(b1x):	mov	nents, i
+	mov	tp, %eax
+	pxor	%mm1, %mm1
+	pxor	%mm4, %mm4
+	ALIGN(16)
+L(tp2):	movq	%mm6, %mm0
+	pcmpeqd	%mm1, %mm0
+	paddd	%mm7, %mm1
+	movq	(tp), %mm2
+	pand	%mm0, %mm2
+	por	%mm2, %mm4
+	lea	(tp,n,4), tp
+	add	$-1, i
+	jne	L(tp2)
+
+	movq	%mm4, (rp)
+
+	lea	8(%eax), tp
+	lea	8(rp), rp
+
+L(b0x):	test	$1, %dl
+	jz	L(b00)
+
+L(b01):	mov	nents, i
+	pxor	%mm1, %mm1
+	pxor	%mm4, %mm4
+	ALIGN(16)
+L(tp1):	movq	%mm6, %mm0
+	pcmpeqd	%mm1, %mm0
+	paddd	%mm7, %mm1
+	movd	(tp), %mm2
+	pand	%mm0, %mm2
+	por	%mm2, %mm4
+	lea	(tp,n,4), tp
+	add	$-1, i
+	jne	L(tp1)
+
+	movd	%mm4, (rp)
+
+L(b00):	pop	%ebp
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	emms
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/mod_34lsub1.asm b/third_party/gmp/mpn/x86/mod_34lsub1.asm
new file mode 100644
index 0000000..e09e702
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mod_34lsub1.asm
@@ -0,0 +1,183 @@
+dnl  Generic x86 mpn_mod_34lsub1 -- mpn remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C P5	  3.0
+C P6	  3.66
+C K6	  3.0
+C K7	  1.3
+C P4	  9
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
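+C The idea: 2^24 == 1 (mod 2^24-1), so limb i contributes
+C src[i] * 2^(8*(i mod 3)).  Limbs are summed into three accumulators by
+C position mod 3, then each is split and folded at the matching bit offset.
+C A plain-C sketch (illustrative, hypothetical name; like the asm it only
+C needs to return some value congruent to the input):
+C
+C	#include <stdint.h>
+C
+C	uint32_t ref_mod_34lsub1 (const uint32_t *src, long size)
+C	{
+C	  uint64_t a0 = 0, a1 = 0, a2 = 0;
+C	  for (long i = 0; i < size; i++)
+C	    switch (i % 3)
+C	      {
+C	      case 0: a0 += src[i]; break;
+C	      case 1: a1 += src[i]; break;
+C	      case 2: a2 += src[i]; break;
+C	      }
+C	  uint64_t r = (a0 & 0xFFFFFF) + (a0 >> 24)       /* 0mod3 low, high */
+C	             + ((a1 & 0xFFFF) << 8) + (a1 >> 16)  /* 1mod3 low, high */
+C	             + ((a2 & 0xFF) << 16) + (a2 >> 8);   /* 2mod3 low, high */
+C	  while (r >> 24)
+C	    r = (r & 0xFFFFFF) + (r >> 24);
+C	  return (uint32_t) r;
+C	}
+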
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jb	L(one)
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+	shrl	$24, %eax		C src[0] high
+
+	andl	$0xFFFFFF, %edx		C src[0] low
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	andl	$0xFFFF, %ecx
+	shrl	$16, %edx		C src[1] high
+	addl	%edx, %eax
+
+	shll	$8, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	movl	%ebx, SAVE_EBX		C and arrange 16-byte loop alignment
+	xorl	%ebx, %ebx
+
+	pushl	%esi	FRAME_pushl()
+	xorl	%esi, %esi
+
+	pushl	%edi	FRAME_pushl()
+	xorl	%eax, %eax		C and clear carry flag
+
+
+	C offset 0x40 here
+L(top):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx	counter, limbs
+	C edx	src
+	C esi	acc 2mod3
+	C edi
+	C ebp
+
+	leal	12(%edx), %edx
+	leal	-2(%ecx), %ecx
+
+	adcl	-12(%edx), %eax
+	adcl	-8(%edx), %ebx
+	adcl	-4(%edx), %esi
+
+	decl	%ecx
+	jg	L(top)
+
+
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	movl	$0xFFFFFFFF, %edi
+	incl	%ecx
+	js	L(combine)
+
+	adcl	(%edx), %eax
+	movl	$0xFFFFFF00, %edi
+	decl	%ecx
+	js	L(combine)
+
+	adcl	4(%edx), %ebx
+	movl	$0xFFFF0000, %edi
+
+
+L(combine):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx
+	C edx
+	C esi	acc 2mod3
+	C edi	mask
+	C ebp
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%eax, %edx		C 0mod3
+
+	shrl	$24, %eax		C 0mod3 high
+	andl	%edi, %ecx		C carry masked
+
+	subl	%ecx, %eax		C apply carry
+	movl	%ebx, %edi		C 1mod3
+
+	shrl	$16, %ebx		C 1mod3 high
+	andl	$0x00FFFFFF, %edx	C 0mod3 low
+
+	addl	%edx, %eax		C apply 0mod3 low
+	andl	$0xFFFF, %edi
+
+	shll	$8, %edi		C 1mod3 low
+	addl	%ebx, %eax		C apply 1mod3 high
+
+	addl	%edi, %eax		C apply 1mod3 low
+	movl	%esi, %edx		C 2mod3
+
+	shrl	$8, %esi		C 2mod3 high
+	andl	$0xFF, %edx		C 2mod3 low
+
+	shll	$16, %edx		C 2mod3 low
+	addl	%esi, %eax		C apply 2mod3 high
+
+	addl	%edx, %eax		C apply 2mod3 low
+	popl	%edi	FRAME_popl()
+
+	movl	SAVE_EBX, %ebx
+	popl	%esi	FRAME_popl()
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/mul_1.asm b/third_party/gmp/mpn/x86/mul_1.asm
new file mode 100644
index 0000000..421de62
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mul_1.asm
@@ -0,0 +1,140 @@
+dnl  x86 mpn_mul_1 (for 386, 486, and Pentium Pro) -- Multiply a limb vector
+dnl  with a limb and store the result in a second limb vector.
+
+dnl  Copyright 1992, 1994, 1997-2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5				12.5
+C P6 model 0-8,10-12		 5.5
+C P6 model 9  (Banias)
+C P6 model 13 (Dothan)		 5.25
+C P4 model 0  (Willamette)	19.0
+C P4 model 1  (?)		19.0
+C P4 model 2  (Northwood)	19.0
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6			10.5
+C AMD K7			 4.5
+C AMD K8
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+
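+C In plain C (illustrative sketch, hypothetical name, 32-bit limbs):
+C
+C	#include <stdint.h>
+C
+C	uint32_t ref_mul_1 (uint32_t *dst, const uint32_t *src,
+C	                    long n, uint32_t m)
+C	{
+C	  uint32_t cy = 0;                 /* carry limb between positions */
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t p = (uint64_t) src[i] * m + cy;
+C	      dst[i] = (uint32_t) p;
+C	      cy = (uint32_t) (p >> 32);
+C	    }
+C	  return cy;                       /* most significant product limb */
+C	}
+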
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ecx
+
+	xorl	%ebx,%ebx
+	andl	$3,%ecx
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	movl	%eax,(%edi)
+	movl	%edx,%ebx	C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_SIZE,%ecx
+	shrl	$2,%ecx
+	jz	L(end)
+
+
+	ALIGN(8)
+L(oop):	movl	(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	movl	%ebx,(%edi)
+	addl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	movl	%ebp,4(%edi)
+	addl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	PARAM_MULTIPLIER
+	movl	%ebx,8(%edi)
+	addl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	%ebp,12(%edi)
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oop)
+
+L(end):	movl	%ebx,%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/mul_basecase.asm b/third_party/gmp/mpn/x86/mul_basecase.asm
new file mode 100644
index 0000000..8339732
--- /dev/null
+++ b/third_party/gmp/mpn/x86/mul_basecase.asm
@@ -0,0 +1,223 @@
+dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl  in a third limb vector.
+
+dnl  Copyright 1996-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct
+C P5	  15
+C P6	   7.5
+C K6	  12.5
+C K7	   5.5
+C P4	  24
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in haste since the Pentium-optimized code that was used
+C for all x86 machines was slow for the Pentium II.  This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the Louter loop, for the four different outcomes of un mod 4.  That
+C would avoid Loop0 altogether.  Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
+C unknown.
+
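+C The structure in plain C (illustrative sketch; ref_mul_1 and
+C ref_addmul_1 are hypothetical helpers with the usual mpn_mul_1 and
+C mpn_addmul_1 semantics, 32-bit limbs):
+C
+C	void ref_mul_basecase (uint32_t *wp,
+C	                       const uint32_t *xp, long xn,
+C	                       const uint32_t *yp, long yn)
+C	{
+C	  wp[xn] = ref_mul_1 (wp, xp, xn, yp[0]);       /* the L(oopM) pass */
+C	  for (long i = 1; i < yn; i++)                 /* the L(outer) passes */
+C	    wp[xn + i] = ref_addmul_1 (wp + i, xp, xn, yp[i]);
+C	}
+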
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER,    -8)
+deflit(VAR_STACK_SPACE,  8)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	subl	$VAR_STACK_SPACE,%esp
+	pushl	%esi
+	pushl	%ebp
+	pushl	%edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+	movl	PARAM_XP,%esi
+	movl	PARAM_WP,%edi
+	movl	PARAM_YP,%ebp
+
+	movl	(%esi),%eax		C load xp[0]
+	mull	(%ebp)			C multiply by yp[0]
+	movl	%eax,(%edi)		C store to wp[0]
+	movl	PARAM_XSIZE,%ecx	C xsize
+	decl	%ecx			C If xsize = 1, ysize = 1 too
+	jz	L(done)
+
+	pushl	%ebx
+FRAME_pushl()
+	movl	%edx,%ebx
+
+	leal	4(%esi),%esi
+	leal	4(%edi),%edi
+
+L(oopM):
+	movl	(%esi),%eax		C load next limb at xp[j]
+	leal	4(%esi),%esi
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	%edx,%ebx
+	adcl	$0,%ebx
+	movl	%eax,(%edi)
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oopM)
+
+	movl	%ebx,(%edi)		C most significant limb of product
+	addl	$4,%edi			C increment wp
+	movl	PARAM_XSIZE,%eax
+	shll	$2,%eax
+	subl	%eax,%edi
+	subl	%eax,%esi
+
+	movl	PARAM_YSIZE,%eax	C ysize
+	decl	%eax
+	jz	L(skip)
+	movl	%eax,VAR_COUNTER	C set index i to ysize
+
+L(outer):
+	movl	PARAM_YP,%ebp		C yp
+	addl	$4,%ebp			C make ebp point to next v limb
+	movl	%ebp,PARAM_YP
+	movl	(%ebp),%eax		C copy y limb ...
+	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
+	movl	PARAM_XSIZE,%ecx
+
+	xorl	%ebx,%ebx
+	andl	$3,%ecx
+	jz	L(end0)
+
+L(oop0):
+	movl	(%esi),%eax
+	mull	VAR_MULTIPLIER
+	leal	4(%esi),%esi
+	addl	%ebx,%eax
+	movl	$0,%ebx
+	adcl	%ebx,%edx
+	addl	%eax,(%edi)
+	adcl	%edx,%ebx		C propagate carry into cylimb
+
+	leal	4(%edi),%edi
+	decl	%ecx
+	jnz	L(oop0)
+
+L(end0):
+	movl	PARAM_XSIZE,%ecx
+	shrl	$2,%ecx
+	jz	L(endX)
+
+	ALIGN(8)
+L(oopX):
+	movl	(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%eax,%ebx
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	4(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%ebx,(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	movl	8(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%ebp,4(%edi)
+	adcl	%eax,%ebx	C new lo + cylimb
+	movl	$0,%ebp
+	adcl	%edx,%ebp
+
+	movl	12(%esi),%eax
+	mull	VAR_MULTIPLIER
+	addl	%ebx,8(%edi)
+	adcl	%eax,%ebp	C new lo + cylimb
+	movl	$0,%ebx
+	adcl	%edx,%ebx
+
+	addl	%ebp,12(%edi)
+	adcl	$0,%ebx		C propagate carry into cylimb
+
+	leal	16(%esi),%esi
+	leal	16(%edi),%edi
+	decl	%ecx
+	jnz	L(oopX)
+
+L(endX):
+	movl	%ebx,(%edi)
+	addl	$4,%edi
+
+	C we incremented wp and xp in the loop above; compensate
+	movl	PARAM_XSIZE,%eax
+	shll	$2,%eax
+	subl	%eax,%edi
+	subl	%eax,%esi
+
+	movl	VAR_COUNTER,%eax
+	decl	%eax
+	movl	%eax,VAR_COUNTER
+	jnz	L(outer)
+
+L(skip):
+	popl	%ebx
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$8,%esp
+	ret
+
+L(done):
+	movl	%edx,4(%edi)	   C store to wp[1]
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$8,%esp
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/nano/gmp-mparam.h b/third_party/gmp/mpn/x86/nano/gmp-mparam.h
new file mode 100644
index 0000000..cd8ac4e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/nano/gmp-mparam.h
@@ -0,0 +1,162 @@
+/* x86/nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* Generated by tuneup.c, 2011-11-25, gcc 4.2 */
+
+#define MOD_1_1P_METHOD                      1
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        53
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           32
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD               132
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               270
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     130
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     135
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                194
+#define SQR_TOOM4_THRESHOLD                502
+#define SQR_TOOM6_THRESHOLD                746
+#define SQR_TOOM8_THRESHOLD               1005
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define POWM_SEC_TABLE  4,23,258,828,2246
+
+#define MUL_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    308, 5}, {     13, 6}, {      7, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     11, 5}, {     23, 6}, \
+    {     13, 7}, {      7, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     11, 6}, {     24, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 8}, {     11, 7}, {     25, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 9}, {     15, 8}, {     31, 7}, \
+    {     63, 8}, {     39, 9}, {     23, 8}, {     47,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     47,10}, \
+    {     31, 9}, {     71,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    543, 9}, \
+    {    287, 8}, {    575, 7}, {   1215,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    543, 8}, {   1087,10}, {    287, 9}, \
+    {    607, 8}, {   1215,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703, 8}, {   1407, 9}, \
+    {    735, 8}, {   1471,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    447, 9}, {    895,10}, {    479, 9}, {    959, 8}, \
+    {   1919,12}, {   4096,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 89
+#define MUL_FFT_THRESHOLD                 1856
+
+#define SQR_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    396, 5}, {     13, 6}, {      7, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     15, 6}, {     31, 7}, {     19, 6}, \
+    {     39, 7}, {     21, 8}, {     11, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     47,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    543,10}, {    143, 9}, \
+    {    287, 8}, {    607, 7}, {   1215, 6}, {   2431,10}, \
+    {    159, 8}, {    639,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    607, 8}, {   1215,11}, \
+    {    159,10}, {    319, 9}, {    671,10}, {    351, 9}, \
+    {    703, 8}, {   1407, 9}, {    735, 8}, {   1471, 7}, \
+    {   2943,11}, {    191,10}, {    383, 9}, {    799,10}, \
+    {    415, 9}, {    895,10}, {    479,12}, {   4096,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 87
+#define SQR_FFT_THRESHOLD                 2368
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  51
+#define MULLO_MUL_N_THRESHOLD             3369
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             183
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                118
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               266
+#define INV_APPR_THRESHOLD                 218
+
+#define BINV_NEWTON_THRESHOLD              268
+#define REDC_1_TO_REDC_N_THRESHOLD          56
+
+#define MU_DIV_QR_THRESHOLD               1308
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD              124
+#define MU_BDIV_QR_THRESHOLD               855
+#define MU_BDIV_Q_THRESHOLD               1334
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD_THRESHOLD                     104
+#define HGCD_APPR_THRESHOLD                139
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   456
+#define GCDEXT_DC_THRESHOLD                321
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               542
+#define SET_STR_PRECOMPUTE_THRESHOLD       840
diff --git a/third_party/gmp/mpn/x86/p6/README b/third_party/gmp/mpn/x86/p6/README
new file mode 100644
index 0000000..f19d47b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/README
@@ -0,0 +1,125 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                      INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III.  The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these might be able to be improved.
+
+                               cycles/limb
+
+	mpn_add_n/sub_n           3.7
+
+	mpn_copyi                 0.75
+	mpn_copyd                 1.75 (or 0.75 if no overlap)
+
+	mpn_divrem_1             39.0
+	mpn_mod_1                21.5
+	mpn_divexact_by3          8.5
+
+	mpn_mul_1                 5.5
+	mpn_addmul/submul_1       6.35
+
+	mpn_l/rshift              2.5
+
+	mpn_mul_basecase          8.2 cycles/crossproduct (approx)
+	mpn_sqr_basecase          4.0 cycles/crossproduct (approx)
+				  or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+	mpn_divrem_1             25.0 integer part, 17.5 fractional part
+
+	mpn_l/rshift              1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending on how far speculative execution has gone.  The 9-cycle
+minimum penalty comes from the issue pipeline being 9 stages.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same.  The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+	http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816.  This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+	http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/p6/aors_n.asm b/third_party/gmp/mpn/x86/p6/aors_n.asm
new file mode 100644
index 0000000..df51c2e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/aors_n.asm
@@ -0,0 +1,156 @@
+dnl  Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Avoid indexed addressing, it makes us stall on the two-ported register
+C    file.
+
+C			    cycles/limb
+C P6 model 0-8,10-12		3.17
+C P6 model 9   (Banias)		2.15
+C P6 model 13  (Dothan)		2.25
+
+
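+C What the routine computes, in plain C (illustrative sketch, add variant;
+C the sub variant propagates a borrow instead):
+C
+C	#include <stdint.h>
+C
+C	uint32_t ref_add_n (uint32_t *rp, const uint32_t *up,
+C	                    const uint32_t *vp, long n)
+C	{
+C	  uint64_t cy = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t s = (uint64_t) up[i] + vp[i] + cy;
+C	      rp[i] = (uint32_t) s;
+C	      cy = s >> 32;
+C	    }
+C	  return (uint32_t) cy;
+C	}
+C
+C The asm unrolls this 8-way and, Duff's-device style, computes its entry
+C point into the unrolled body from the size mod 8 (12 code bytes per limb).
+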
+define(`rp',	`%edi')
+define(`up',	`%esi')
+define(`vp',	`%ebx')
+define(`n',	`%ecx')
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(func)
+	xor	%edx, %edx
+L(start):
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), rp
+	mov	20(%esp), up
+	mov	24(%esp), vp
+	mov	28(%esp), n
+
+	lea	(up,n,4), up
+	lea	(vp,n,4), vp
+	lea	(rp,n,4), rp
+
+	neg	n
+	mov	n, %eax
+	and	$-8, n
+	and	$7, %eax
+	shl	$2, %eax			C 4x
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	lea	L(ent) (%eax,%eax,2), %eax	C 12x
+')
+
+	shr	%edx				C set cy flag
+	jmp	*%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	lea	(%eax,%eax,2), %eax
+	add	$L(ent)-L(here), %eax
+	add	(%esp), %eax
+	ret_internal
+')
+
+L(end):
+	sbb	%eax, %eax
+	neg	%eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+	ALIGN(16)
+L(top):
+	jecxz	L(end)
+L(ent):
+Zdisp(	mov,	0,(up,n,4), %eax)
+Zdisp(	ADCSBB,	0,(vp,n,4), %eax)
+Zdisp(	mov,	%eax, 0,(rp,n,4))
+
+	mov	4(up,n,4), %edx
+	ADCSBB	4(vp,n,4), %edx
+	mov	%edx, 4(rp,n,4)
+
+	mov	8(up,n,4), %eax
+	ADCSBB	8(vp,n,4), %eax
+	mov	%eax, 8(rp,n,4)
+
+	mov	12(up,n,4), %edx
+	ADCSBB	12(vp,n,4), %edx
+	mov	%edx, 12(rp,n,4)
+
+	mov	16(up,n,4), %eax
+	ADCSBB	16(vp,n,4), %eax
+	mov	%eax, 16(rp,n,4)
+
+	mov	20(up,n,4), %edx
+	ADCSBB	20(vp,n,4), %edx
+	mov	%edx, 20(rp,n,4)
+
+	mov	24(up,n,4), %eax
+	ADCSBB	24(vp,n,4), %eax
+	mov	%eax, 24(rp,n,4)
+
+	mov	28(up,n,4), %edx
+	ADCSBB	28(vp,n,4), %edx
+	mov	%edx, 28(rp,n,4)
+
+	lea	8(n), n
+	jmp	L(top)
+
+EPILOGUE()
+
+PROLOGUE(func_nc)
+	movl	20(%esp), %edx
+	jmp	L(start)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/p6/aorsmul_1.asm b/third_party/gmp/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000..bc8c49c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,320 @@
+dnl  Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl  Copyright 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5
+C P6 model 0-8,10-12		 6.44
+C P6 model 9  (Banias)		 6.15
+C P6 model 13 (Dothan)		 6.11
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C AMD K6
+C AMD K7
+C AMD K8
+
+
+dnl  P6 UNROLL_COUNT cycles/limb
+dnl          8           6.7
+dnl         16           6.35
+dnl         32           6.3
+dnl         64           6.3
+dnl  Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+	define(M4_inst,        addl)
+	define(M4_function_1,  mpn_addmul_1)
+	define(M4_function_1c, mpn_addmul_1c)
+	define(M4_description, add it to)
+	define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+	define(M4_inst,        subl)
+	define(M4_function_1,  mpn_submul_1)
+	define(M4_function_1c, mpn_submul_1c)
+	define(M4_description, subtract it from)
+	define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                            mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                             mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code.  The unrolled loop is
+C the same, but there's just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
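+C In plain C the submul variant is (illustrative sketch, hypothetical
+C name, 32-bit limbs; the addmul variant adds lo into dst instead):
+C
+C	#include <stdint.h>
+C
+C	uint32_t ref_submul_1 (uint32_t *dst, const uint32_t *src,
+C	                       long n, uint32_t m)
+C	{
+C	  uint32_t cy = 0;                  /* borrow, the return value */
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t p = (uint64_t) src[i] * m + cy;
+C	      uint32_t lo = (uint32_t) p;
+C	      cy = (uint32_t) (p >> 32) + (dst[i] < lo);
+C	      dst[i] -= lo;
+C	    }
+C	  return cy;
+C	}
+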
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+	pushl	%ebx
+deflit(`FRAME',4)
+	movl	PARAM_CARRY, %ebx
+	jmp	L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+	push	%ebx
+deflit(`FRAME',4)
+	xorl	%ebx, %ebx	C initial carry
+
+L(start_nc):
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi
+deflit(`FRAME',8)
+
+	movl	PARAM_SRC, %esi
+	pushl	%edi
+deflit(`FRAME',12)
+
+	movl	PARAM_DST, %edi
+	pushl	%ebp
+deflit(`FRAME',16)
+	cmpl	$UNROLL_THRESHOLD, %ecx
+
+	movl	PARAM_MULTIPLIER, %ebp
+	jae	L(unroll)
+
+
+	C simple loop
+	C this is offset 0x22, so close enough to aligned
+L(simple):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+
+	movl	(%esi), %eax
+	addl	$4, %edi
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	adcl	$0, %edx
+
+	M4_inst	%eax, -4(%edi)
+	movl	%edx, %ebx
+
+	adcl	$0, %ebx
+	decl	%ecx
+
+	leal	4(%esi), %esi
+	jnz	L(simple)
+
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	movl	%ebx, %eax
+
+	popl	%ebx
+	ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there aren't enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points.  Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl  overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP,   `PARAM_DST')
+
+	C this is offset 0x43, so close enough to aligned
+L(unroll):
+	C eax
+	C ebx	initial carry
+	C ecx	size
+	C edx
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	%ecx, %edx
+	decl	%ecx
+
+	subl	$2, %edx
+	negl	%ecx
+
+	shrl	$UNROLL_LOG2, %edx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%edx, VAR_COUNTER
+	movl	%ecx, %edx
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	shll	$4, %edx
+	negl	%ecx
+
+	leal	L(entry) (%edx,%ecx,1), %edx
+')
+	movl	(%esi), %eax		C src low limb
+
+	movl	%edx, VAR_JUMP
+	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+	mull	%ebp
+
+	addl	%ebx, %eax	C initial carry (from _1c)
+	adcl	$0, %edx
+
+	movl	%edx, %ebx	C high carry
+	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+	movl	VAR_JUMP, %edx
+	testl	$1, %ecx
+	movl	%eax, %ecx	C low carry
+
+	cmovnz(	%ebx, %ecx)	C high,low carry other way around
+	cmovnz(	%eax, %ebx)
+
+	jmp	*%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	shll	$4, %edx
+	negl	%ecx
+
+	C See mpn/x86/README about old gas bugs
+	leal	(%edx,%ecx,1), %edx
+	addl	$L(entry)-L(here), %edx
+
+	addl	(%esp), %edx
+
+	ret_internal
+')
+
+
+C -----------------------------------------------------------
+	ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+	C eax	scratch
+	C ebx	carry hi
+	C ecx	carry lo
+	C edx	scratch
+	C esi	src
+	C edi	dst
+	C ebp	multiplier
+	C
+	C VAR_COUNTER	loop counter
+	C
+	C 15 code bytes per limb
+
+	addl	$UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	mull	%ebp
+Zdisp(	M4_inst,%ecx, disp0,(%edi))
+	adcl	%eax, %ebx
+	movl	%edx, %ecx
+	adcl	$0, %ecx
+
+	movl	disp1(%esi), %eax
+	mull	%ebp
+	M4_inst	%ebx, disp1(%edi)
+	adcl	%eax, %ecx
+	movl	%edx, %ebx
+	adcl	$0, %ebx
+')
+
+	decl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+
+	jns	L(top)
+
+
+deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+	M4_inst	%ecx, disp0(%edi)
+	movl	%ebx, %eax
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+	adcl	$0, %eax
+
+	ret
+
+EPILOGUE()
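
The computed jump above is worth spelling out: the unrolled loop body costs
15 code bytes per limb (as the comments note), so the code enters the loop
15*skip bytes past L(entry), where skip is the number of limb slots not
covered by full unrolled passes.  A hypothetical C sketch of just the
entry-offset computation (names invented here):

    /* Entry offset into the unrolled loop: with UNROLL_COUNT limbs per
       pass and 15 code bytes per limb, skip the first
       (1 - size) mod UNROLL_COUNT limb slots on the initial pass. */
    #define UNROLL_COUNT 16
    #define UNROLL_MASK  (UNROLL_COUNT - 1)

    static void *
    entry_point (void *entry_label, long size)
    {
      long skip = (1 - size) & UNROLL_MASK;  /* limbs to skip on entry */
      return (char *) entry_label + 15 * skip;
    }
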
diff --git a/third_party/gmp/mpn/x86/p6/bdiv_q_1.asm b/third_party/gmp/mpn/x86/p6/bdiv_q_1.asm
new file mode 100644
index 0000000..a0a9d90
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/bdiv_q_1.asm
@@ -0,0 +1,287 @@
+dnl  Intel P6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- exact division by 1-limb divisor.
+
+dnl  Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       odd  even  divisor
+C P6:  10.0  12.0  cycles/limb
+
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles, which is the length of
+C the dependent chain.
+C
+C The shifts for the even case aren't on the dependent chain, so in
+C principle it could run the same too, but nothing running at 10 has been
+C found.  Perhaps there are too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,     8)
+defframe(PARAM_DST,     4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+deflit(STACK_SPACE, 16)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_INVERSE, %ebp
+
+	movl	PARAM_SHIFT, %ecx	C trailing twos
+
+L(common):
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	leal	(%esi,%ebx,4), %esi	C src end
+
+	leal	(%edi,%ebx,4), %edi	C dst end
+	negl	%ebx			C -size
+
+	movl	(%esi,%ebx,4), %eax	C src[0]
+
+	orl	%ecx, %ecx
+	jz	L(odd_entry)
+
+	movl	%edi, PARAM_DST
+	movl	%ebp, VAR_INVERSE
+
+L(even):
+	C eax	src[0]
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx
+	C esi
+	C edi
+	C ebp
+
+	xorl	%ebp, %ebp		C initial carry bit
+	xorl	%edx, %edx		C initial carry limb (for size==1)
+
+	incl	%ebx
+	jz	L(even_one)
+
+	movl	(%esi,%ebx,4), %edi	C src[1]
+
+	shrdl(	%cl, %edi, %eax)
+
+	jmp	L(even_entry)
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size] and scratch
+	C ebp	carry bit
+
+	movl	(%esi,%ebx,4), %edi
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi,%ebx,4), %eax
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebp, %eax
+
+	sbbl	%ebp, %ebp
+	subl	%edx, %eax
+
+	sbbl	$0, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	PARAM_DST, %edi
+	negl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+	jnz	L(even_top)
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax
+
+L(even_one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebp, %eax
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+C The dependent chain here is
+C
+C	subl	%edx, %eax       1
+C	imull	%ebp, %eax       4
+C	mull	PARAM_DIVISOR    5
+C			       ----
+C	total			10
+C
+C and this is the measured speed.  No special scheduling is necessary;
+C out-of-order execution hides the load latency.
+
+L(odd_top):
+	C eax	scratch (src limb)
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	carry limb, high of last product
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	inverse
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ecx, %eax
+
+	sbbl	%ecx, %ecx
+	subl	%edx, %eax
+
+	sbbl	$0, %ecx
+
+L(odd_entry):
+	imull	%ebp, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+	negl	%ecx
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	bsfl	%eax, %ecx		C trailing twos
+
+	movl	%ebp, SAVE_EBP
+
+	shrl	%cl, %eax		C d without twos
+
+	movl	%eax, %edx
+	shrl	%eax			C d/2 without twos
+
+	movl	%edx, PARAM_DIVISOR
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
+')
+
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+	imull	%edx, %ebp	C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+	imull	%edx, %eax	C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+
+	jmp	L(common)
+
+EPILOGUE()
+ASM_END()
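
The inverse computed in the mpn_bdiv_q_1 prologue above follows the standard
Newton iteration for an inverse modulo a power of 2: each step
inv' = 2*inv - inv*inv*d doubles the number of correct low bits, so an 8-bit
table value reaches 32 bits in two steps.  A minimal C sketch, assuming the
binvert_limb_table indexing used above:

    /* Newton iteration for d^-1 mod 2^32, d odd: start from 8 correct
       bits out of a 128-entry table indexed by (d >> 1) & 127, then
       refine twice.  Sketch only. */
    typedef unsigned int mp_limb_t;

    mp_limb_t
    ref_binvert_limb (mp_limb_t d, const unsigned char table[128])
    {
      mp_limb_t inv = table[(d >> 1) & 127];  /*  8 correct bits */
      inv = 2 * inv - inv * inv * d;          /* 16 correct bits */
      inv = 2 * inv - inv * inv * d;          /* 32 correct bits */
      return inv;                             /* d * inv == 1 mod 2^32 */
    }
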
diff --git a/third_party/gmp/mpn/x86/p6/copyd.asm b/third_party/gmp/mpn/x86/p6/copyd.asm
new file mode 100644
index 0000000..1be7636
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/copyd.asm
@@ -0,0 +1,178 @@
+dnl  Intel P6 mpn_copyd -- copy limb vector backwards.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 1.75 cycles/limb, or 0.75 if no overlap
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C An explicit loop is used because a decrementing rep movsl is a bit slow at
+C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
+C code here stands a chance of being faster if the branches predict well.
+C
+C The slightly strange loop form seems necessary for the claimed speed.
+C Maybe load/store ordering affects it.
+C
+C The source and destination are checked to see if they're actually
+C overlapping, since it might be possible to use an incrementing rep movsl
+C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
+C version.)
+C
+C Enhancements:
+C
+C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
+C one store each cycle.  Unrolling the loop below would approach 1.0, but
+C it'd be good to know why something like store/load/subl + store/load/jnz
+C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
+C cycles, but doesn't run that way.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  re-using parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EDI,`PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	subl	$1, %ecx
+	jb	L(zero)
+
+	movl	(%esi,%ecx,4), %eax		C src[size-1]
+	jz	L(one)
+
+	movl	-4(%esi,%ecx,4), %edx		C src[size-2]
+	subl	$2, %ecx
+	jbe	L(done_loop)			C 2 or 3 limbs only
+
+
+	C The usual overlap is
+	C
+	C     high                   low
+	C     +------------------+
+	C     |               dst|
+	C     +------------------+
+	C           +------------------+
+	C           |               src|
+	C           +------------------+
+	C
+	C We can use an incrementing copy in the following circumstances.
+	C
+	C     src+4*size<=dst, since then the regions are disjoint
+	C
+	C     src==dst, clearly (though this shouldn't occur normally)
+	C
+	C     src>dst, since in that case it's a requirement of the
+	C              parameters that src>=dst+size*4, and hence the
+	C              regions are disjoint
+	C
+
+	leal	(%edi,%ecx,4), %edx
+	cmpl	%edi, %esi
+	jae	L(use_movsl)		C src >= dst
+
+	cmpl	%edi, %edx
+	movl	4(%esi,%ecx,4), %edx	C src[size-2] again
+	jbe	L(use_movsl)		C src+4*size <= dst
+
+
+L(top):
+	C eax	prev high limb
+	C ebx
+	C ecx	counter, size-3 down to 0 or -1, inclusive, by 2s
+	C edx	prev low limb
+	C esi	src
+	C edi	dst
+	C ebp
+
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+
+	movl	%edx, 4(%edi,%ecx,4)
+	movl	-4(%esi,%ecx,4), %edx
+
+	subl	$2, %ecx
+	jnbe	L(top)
+
+
+L(done_loop):
+	movl	%eax, 8(%edi,%ecx,4)
+	movl	%edx, 4(%edi,%ecx,4)
+
+	C copy low limb (needed if size was odd, but will already have been
+	C done in the loop if size was even)
+	movl	(%esi), %eax
+L(one):
+	movl	%eax, (%edi)
+	movl	SAVE_EDI, %edi
+	movl	SAVE_ESI, %esi
+
+	ret
+
+
+L(use_movsl):
+	C eax
+	C ebx
+	C ecx	size-3
+	C edx
+	C esi	src
+	C edi	dst
+	C ebp
+
+	addl	$3, %ecx
+
+	cld		C better safe than sorry, see mpn/x86/README
+
+	rep
+	movsl
+
+L(zero):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+
+	ret
+
+EPILOGUE()
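
The overlap test above reduces to two pointer comparisons.  A hedged C
rendering (a flat address space is assumed, hence the uintptr_t casts;
size is in 4-byte limbs):

    /* A forward (incrementing) copy is safe when src >= dst (disjoint,
       or overlapping with src above dst, per the parameter rules) or
       when src lies entirely below dst, so the regions are disjoint. */
    #include <stdint.h>

    static int
    forward_copy_ok (const void *src, const void *dst, long size_limbs)
    {
      uintptr_t s = (uintptr_t) src, d = (uintptr_t) dst;
      return s >= d || s + 4 * (uintptr_t) size_limbs <= d;
    }
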
diff --git a/third_party/gmp/mpn/x86/p6/dive_1.asm b/third_party/gmp/mpn/x86/p6/dive_1.asm
new file mode 100644
index 0000000..7d61a18
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/dive_1.asm
@@ -0,0 +1,267 @@
+dnl  Intel P6 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C       odd  even  divisor
+C P6:  10.0  12.0  cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles, which is the length of
+C the dependent chain.
+C
+C The shifts for the even case aren't on the dependent chain, so in
+C principle it could run the same too, but nothing running at 10 has been
+C found.  Perhaps there are too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,     8)
+defframe(PARAM_DST,     4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)
+deflit(STACK_SPACE, 20)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	bsfl	%eax, %ecx		C trailing twos
+
+	movl	%ebp, SAVE_EBP
+
+	shrl	%cl, %eax		C d without twos
+
+	movl	%eax, %edx
+	shrl	%eax			C d/2 without twos
+
+	movl	%edx, PARAM_DIVISOR
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %ebp		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %ebp	C inv 8 bits
+')
+
+	leal	(%ebp,%ebp), %eax	C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	leal	(%esi,%ebx,4), %esi	C src end
+
+	imull	PARAM_DIVISOR, %ebp	C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %ebp	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	leal	(%edi,%ebx,4), %edi	C dst end
+	negl	%ebx			C -size
+
+	movl	%edi, PARAM_DST
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	PARAM_DIVISOR, %eax
+	imull	%ebp, %eax
+	cmpl	$1, %eax')
+
+	movl	%ebp, VAR_INVERSE
+	movl	(%esi,%ebx,4), %eax	C src[0]
+
+	orl	%ecx, %ecx
+	jnz	L(even)
+
+	C ecx initial carry is zero
+	jmp	L(odd_entry)
+
+
+C The dependent chain here is
+C
+C	subl	%edx, %eax       1
+C	imull	%ebp, %eax       4
+C	mull	PARAM_DIVISOR    5
+C			       ----
+C	total			10
+C
+C and this is the measured speed.  No special scheduling is necessary;
+C out-of-order execution hides the load latency.
+
+L(odd_top):
+	C eax	scratch (src limb)
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	carry limb, high of last product
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ecx, %eax
+
+	sbbl	%ecx, %ecx
+	subl	%edx, %eax
+
+	sbbl	$0, %ecx
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+	negl	%ecx
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(even):
+	C eax	src[0]
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx
+	C esi
+	C edi
+	C ebp
+
+	xorl	%ebp, %ebp		C initial carry bit
+	xorl	%edx, %edx		C initial carry limb (for size==1)
+
+	incl	%ebx
+	jz	L(even_one)
+
+	movl	(%esi,%ebx,4), %edi	C src[1]
+
+	shrdl(	%cl, %edi, %eax)
+
+	jmp	L(even_entry)
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	shift
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size] and scratch
+	C ebp	carry bit
+
+	movl	(%esi,%ebx,4), %edi
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi,%ebx,4), %eax
+	shrdl(	%cl, %edi, %eax)
+
+	subl	%ebp, %eax
+
+	sbbl	%ebp, %ebp
+	subl	%edx, %eax
+
+	sbbl	$0, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	PARAM_DST, %edi
+	negl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+	jnz	L(even_top)
+
+
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax
+
+L(even_one):
+	shrl	%cl, %eax
+	movl	SAVE_ESI, %esi
+
+	subl	%ebp, %eax
+	movl	SAVE_EBP, %ebp
+
+	subl	%edx, %eax
+	movl	SAVE_EBX, %ebx
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)
+	movl	SAVE_EDI, %edi
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
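
The odd-divisor loop above is Hensel (exact) division: each quotient limb is
obtained with one multiply by the precomputed inverse, while the high half of
q*divisor and a borrow bit are carried into the next limb.  A minimal C
sketch of the L(odd_top) loop (hypothetical names; src must be an exact
multiple of the divisor for the quotient to be meaningful):

    typedef unsigned int mp_limb_t;
    typedef unsigned long long mp_dlimb_t;

    /* Quotient of src / divisor over size limbs, divisor odd and
       inverse == divisor^-1 mod 2^32. */
    static void
    ref_divexact_odd (mp_limb_t *dst, const mp_limb_t *src, long size,
                      mp_limb_t divisor, mp_limb_t inverse)
    {
      mp_limb_t c = 0;   /* borrow bit */
      mp_limb_t h = 0;   /* high limb of previous q*divisor */
      long i;
      for (i = 0; i < size; i++)
        {
          mp_limb_t s = src[i];
          mp_limb_t l = s - c;        /* subtract borrow bit */
          c = l > s;
          mp_limb_t t = l - h;        /* subtract carried high limb */
          c += t > l;
          mp_limb_t q = t * inverse;  /* quotient limb, mod 2^32 */
          dst[i] = q;
          h = (mp_limb_t) (((mp_dlimb_t) q * divisor) >> 32);
        }
    }
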
diff --git a/third_party/gmp/mpn/x86/p6/gcd_11.asm b/third_party/gmp/mpn/x86/p6/gcd_11.asm
new file mode 100644
index 0000000..80e055e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/gcd_11.asm
@@ -0,0 +1,83 @@
+dnl  x86 mpn_gcd_11 optimised for processors with fast BSF.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked by Torbjorn Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K7	 7.80
+C AMD K8,K9	 7.79
+C AMD K10	 4.08
+C AMD bd1	 ?
+C AMD bobcat	 7.82
+C Intel P4-2	14.9
+C Intel P4-3/4	14.0
+C Intel P6/13	 5.09
+C Intel core2	 4.22
+C Intel NHM	 5.00
+C Intel SBR	 5.00
+C Intel atom	17.1
+C VIA nano	?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+
+	mov	12(%esp), %eax
+	mov	16(%esp), %edx
+	jmp	L(odd)
+
+	ALIGN(16)		C               K10   BD    C2    NHM   SBR
+L(top):	cmovc(	%esi, %eax)	C u = |v - u|   0,3   0,3   0,6   0,5   0,5
+	cmovc(	%edi, %edx)	C v = min(u,v)  0,3   0,3   2,8   1,7   1,7
+	shr	%cl, %eax	C               1,7   1,6   2,8   2,8   2,8
+L(odd):	mov	%edx, %esi	C               1     1     4     3     3
+	sub	%eax, %esi	C               2     2     5     4     4
+	bsf	%esi, %ecx	C               3     3     6     5     5
+	mov	%eax, %edi	C               2     2     3     3     4
+	sub	%edx, %eax	C               2     2     4     3     4
+	jnz	L(top)		C
+
+L(end):	mov	%edx, %eax
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
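
A C rendering of the loop above: a right-shift binary GCD on two odd limbs,
where the bsf strips the guaranteed factors of two from the difference.
This is an equivalent formulation rather than a line-for-line transcription
(__builtin_ctz stands in for bsf):

    /* gcd of two odd 32-bit values; u - v is always even, so shifting
       by the trailing-zero count makes progress every iteration. */
    static unsigned
    ref_gcd_11 (unsigned u, unsigned v)
    {
      while (u != v)
        {
          if (u > v)
            u = (u - v) >> __builtin_ctz (u - v);
          else
            v = (v - u) >> __builtin_ctz (v - u);
        }
      return u;
    }
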
diff --git a/third_party/gmp/mpn/x86/p6/gmp-mparam.h b/third_party/gmp/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000..96c96fd
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,194 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+   value in mpn/x86/p6/gmp-mparam.h.  The latter is used as a hard limit in
+   mpn/x86/p6/sqr_basecase.asm.  */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                74
+#define MUL_TOOM44_THRESHOLD               181
+#define MUL_TOOM6H_THRESHOLD               252
+#define MUL_TOOM8H_THRESHOLD               363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  4,23,258,768,2388
+
+#define MUL_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 5}, \
+    {    383, 4}, {    991, 5}, {    511, 6}, {    267, 7}, \
+    {    157, 8}, {     91, 9}, {     47, 8}, {    111, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             472  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    472, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     63, 4}, {   1023, 8}, {     67, 9}, \
+    {     39, 5}, {    639, 4}, {   1471, 6}, {    383, 7}, \
+    {    209, 8}, {    119, 9}, {     63, 7}, {    255, 8}, \
+    {    139, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    167,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671, 9}, {   1343,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD            13463
+
+#define DC_DIV_QR_THRESHOLD                 20
+#define DC_DIVAPPR_Q_THRESHOLD              56
+#define DC_BDIV_QR_THRESHOLD                60
+#define DC_BDIV_Q_THRESHOLD                134
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD                66
+#define INV_APPR_THRESHOLD                  63
+
+#define BINV_NEWTON_THRESHOLD              250
+#define REDC_1_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1164
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               38
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD_THRESHOLD                      64
+#define HGCD_APPR_THRESHOLD                105
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                309
+#define JACOBI_BASE_METHOD                   1
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               587
+#define SET_STR_PRECOMPUTE_THRESHOLD      1104
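
These thresholds are tuned crossover points, in limbs, at which GMP's
routines switch algorithms.  A hypothetical dispatch sketch showing the
shape of their use (the mul_* helpers here are invented for illustration,
not GMP's internal API):

    /* Algorithm selection driven by the tuned thresholds above. */
    extern void mul_basecase (unsigned *, const unsigned *, const unsigned *, long);
    extern void mul_toom22 (unsigned *, const unsigned *, const unsigned *, long);
    extern void mul_toom33 (unsigned *, const unsigned *, const unsigned *, long);

    void
    mul_n (unsigned *rp, const unsigned *ap, const unsigned *bp, long n)
    {
      if (n < MUL_TOOM22_THRESHOLD)        /* below 20 limbs on this P6 */
        mul_basecase (rp, ap, bp, n);
      else if (n < MUL_TOOM33_THRESHOLD)   /* below 74 limbs */
        mul_toom22 (rp, ap, bp, n);
      else                                 /* larger sizes: Toom-3 etc. */
        mul_toom33 (rp, ap, bp, n);
    }
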
diff --git a/third_party/gmp/mpn/x86/p6/lshsub_n.asm b/third_party/gmp/mpn/x86/p6/lshsub_n.asm
new file mode 100644
index 0000000..7ada213
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/lshsub_n.asm
@@ -0,0 +1,169 @@
+dnl  Intel P6 mpn_lshsub_n -- mpn papillion support.
+
+dnl  Copyright 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
+
+C (1) The loop is not scheduled in any way, and scheduling attempts have not
+C     improved speed on P6/13.  Presumably the K7 will want scheduling, if
+C     it wants to use MMX at all.
+C (2) We could save a register by not alternating between eax and edx in
+C     the loop.
+
+define(`rp',	`%edi')
+define(`up',	`%esi')
+define(`vp',	`%ebx')
+define(`n',	`%ecx')
+define(`cnt',	`%mm7')
+
+ASM_START()
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_lshsub_n)
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), rp
+	mov	20(%esp), up
+	mov	24(%esp), vp
+	mov	28(%esp), n
+	mov	$32, %eax
+	sub	32(%esp), %eax
+	movd	%eax, cnt
+
+	lea	(up,n,4), up
+	lea	(vp,n,4), vp
+	lea	(rp,n,4), rp
+
+	neg	n
+	mov	n, %eax
+	and	$-8, n
+	and	$7, %eax
+	shl	%eax				C eax = 2x
+	lea	(%eax,%eax,4), %edx		C edx = 10x
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	lea	L(ent)(%eax,%edx,2), %eax	C eax = 22x
+')
+
+	pxor	%mm1, %mm1
+	pxor	%mm0, %mm0
+
+	jmp	*%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	lea	(%eax,%edx,2), %eax
+	add	$L(ent)-L(here), %eax
+	add	(%esp), %eax
+	ret_internal
+')
+
+L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
+	sbb	%eax, %eax
+	neg	%eax
+	mov	32(%esp), %ecx
+	shld	%cl, %edx, %eax
+
+	emms
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+	ALIGN(16)
+L(top):	jecxz	L(end)
+L(ent):	mov	   0(up,n,4), %eax
+	sbb	   0(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 0(rp,n,4)
+
+	mov	   4(up,n,4), %edx
+	sbb	   4(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 4(rp,n,4)
+
+	mov	   8(up,n,4), %eax
+	sbb	   8(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 8(rp,n,4)
+
+	mov	   12(up,n,4), %edx
+	sbb	   12(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 12(rp,n,4)
+
+	mov	   16(up,n,4), %eax
+	sbb	   16(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 16(rp,n,4)
+
+	mov	   20(up,n,4), %edx
+	sbb	   20(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 20(rp,n,4)
+
+	mov	   24(up,n,4), %eax
+	sbb	   24(vp,n,4), %eax
+	movd	   %eax, %mm0
+	punpckldq  %mm0, %mm1
+	psrlq	   %mm7, %mm1
+	movd	   %mm1, 24(rp,n,4)
+
+	mov	   28(up,n,4), %edx
+	sbb	   28(vp,n,4), %edx
+	movd	   %edx, %mm1
+	punpckldq  %mm1, %mm0
+	psrlq	   %mm7, %mm0
+	movd	   %mm0, 28(rp,n,4)
+
+	lea	   8(n), n
+	jmp	   L(top)
+
+EPILOGUE()
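
What the loop computes, in C: a fused subtract-and-left-shift, returning the
limb shifted out the top with the borrow bit of the subtraction folded in.
A sketch under the assumption 0 < cnt < 32 (hypothetical names):

    typedef unsigned int mp_limb_t;

    /* rp[] = low limbs of (up[] - vp[]) << cnt over n limbs; return
       the out-shifted top limb, borrow bit included. */
    static mp_limb_t
    ref_lshsub_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                  long n, unsigned cnt)
    {
      mp_limb_t borrow = 0, prev = 0;
      long i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t d = up[i] - vp[i] - borrow;
          borrow = up[i] < vp[i] || (borrow && up[i] == vp[i]);
          rp[i] = (d << cnt) | (prev >> (32 - cnt));
          prev = d;
        }
      return (borrow << cnt) | (prev >> (32 - cnt));
    }
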
diff --git a/third_party/gmp/mpn/x86/p6/mmx/divrem_1.asm b/third_party/gmp/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000..5300616
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,767 @@
+dnl  Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm;
+C see that file for some comments.  It's possible that what's here can be
+C improved.
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The different speeds of the integer and fraction parts mean that using
+dnl  xsize+size isn't quite right.  The threshold wants to be a bit higher
+dnl  for the integer part and a bit lower for the fraction part.  (Or what's
+dnl  really wanted is to speed up the integer part!)
+dnl
+dnl  The threshold is set to make the integer part right.  At 4 limbs the
+dnl  div and mul are about the same for the integer part, but on the
+dnl  fractional part the mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+defframe(SAVE_EBX,    -4)
+defframe(SAVE_ESI,    -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+
+defframe(VAR_NORM,    -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC,     -28)
+defframe(VAR_DST,     -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+	movl	PARAM_XSIZE, %ecx
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edx
+
+	movl	-4(%esi,%ebx,4), %eax	C src high limb
+	xorl	%edi, %edi		C initial carry (if can't skip a div)
+
+	C
+
+	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
+	xor	%ecx, %ecx
+
+	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edi)		C high is carry if high<divisor
+
+	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+
+	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
+
+	sbbl	$0, %ebx		C skip one division if high<divisor
+	movl	PARAM_PREINV_SHIFT, %ecx
+
+	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
+	movl	$32, %eax
+
+	movl	%edx, VAR_DST		C &dst[xsize+size]
+
+	shll	%cl, %ebp		C d normalized
+	subl	%ecx, %eax
+	movl	%ecx, VAR_NORM
+
+	movd	%eax, %mm7		C rshift
+	movl	PARAM_PREINV_INVERSE, %eax
+	jmp	L(start_preinv)
+
+EPILOGUE()
+
+
+
+	ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+	movl	PARAM_CARRY, %edx
+
+	movl	PARAM_SIZE, %ecx
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	leal	-4(%edi,%ebx,4), %edi
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	$0, %edx		C initial carry (if can't skip a div)
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+	orl	%ecx, %ecx		C size
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+	jz	L(no_skip_div)		C if size==0
+
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+	xorl	%esi, %esi
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+
+	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+
+	movl	%esi, (%edi,%ecx,4)	C dst high limb
+
+	sbbl	$0, %ecx		C size-1 if high<divisor
+	movl	PARAM_SRC, %esi		C reload
+L(no_skip_div):
+
+
+L(start_1c):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	(%ebx,%ecx), %eax	C size+xsize
+	cmpl	$MUL_THRESHOLD, %eax
+	jae	L(mul_by_inverse)
+
+	orl	%ecx, %ecx
+	jz	L(divide_no_integer)
+
+L(divide_integer):
+	C eax	scratch (quotient)
+	C ebx	xsize
+	C ecx	counter
+	C edx	scratch (remainder)
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	%ebp
+
+	movl	%eax, (%edi,%ecx,4)
+	decl	%ecx
+	jnz	L(divide_integer)
+
+
+L(divide_no_integer):
+	movl	PARAM_DST, %edi
+	orl	%ebx, %ebx
+	jnz	L(divide_fraction)
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	movl	%edx, %eax
+
+	movl	SAVE_EBP, %ebp
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+
+L(divide_fraction):
+	C eax	scratch (quotient)
+	C ebx	counter
+	C ecx
+	C edx	scratch (remainder)
+	C esi
+	C edi	dst
+	C ebp	divisor
+
+	movl	$0, %eax
+
+	divl	%ebp
+
+	movl	%eax, -4(%edi,%ebx,4)
+	decl	%ebx
+	jnz	L(divide_fraction)
+
+	jmp	L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
+
+	movl	%ebx, VAR_DST_STOP
+	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
+
+	movl	%edi, VAR_DST
+	movl	%ecx, %ebx		C size
+
+	bsrl	%ebp, %ecx		C 31-l
+	movl	%edx, %edi		C carry
+
+	leal	1(%ecx), %eax		C 32-l
+	xorl	$31, %ecx		C l
+
+	movl	%ecx, VAR_NORM
+	movl	$-1, %edx
+
+	shll	%cl, %ebp		C d normalized
+	movd	%eax, %mm7
+
+	movl	$-1, %eax
+	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+	C eax	inverse
+	C ebx	size
+	C ecx	shift
+	C edx
+	C esi	src
+	C edi	carry
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+	movl	%eax, VAR_INVERSE
+	orl	%ebx, %ebx		C size
+	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
+
+	movl	%eax, VAR_SRC
+	jz	L(start_zero)
+
+	movl	8(%eax), %esi		C src high limb
+	cmpl	$1, %ebx
+	jz	L(start_one)
+
+L(start_two_or_more):
+	movl	4(%eax), %edx		C src second highest limb
+
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
+
+	cmpl	$2, %ebx
+	je	L(integer_two_left)
+	jmp	L(integer_top)
+
+
+L(start_one):
+	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
+
+	shll	%cl, %esi		C n10 = high << l
+	jmp	L(integer_one_left)
+
+
+L(start_zero):
+	C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+	C skipped a division.
+
+	shll	%cl, %edi		C n2 = carry << l
+	movl	%edi, %eax		C return value for zero_done
+	cmpl	$0, PARAM_XSIZE
+
+	je	L(zero_done)
+	jmp	L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest.  A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C		       uops
+C		n2+n1	1   (addl)
+C		mul	5
+C		q1+1	3   (addl/adcl)
+C		mul	5
+C		sub	3   (subl/sbbl)
+C		addback	2   (cmov)
+C		       ---
+C		       19
+C
+C Lack of registers hinders explicit scheduling, and it might be that the
+C normal out-of-order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain).  A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+	ALIGN(16)
+L(integer_top):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	d
+	C
+	C mm0	scratch (src qword)
+	C mm7	rshift for normalization
+
+	movl	%esi, %eax
+	movl	%ebp, %ebx
+
+	sarl	$31, %eax          C -n1
+	movl	VAR_SRC, %ecx
+
+	andl	%eax, %ebx         C -n1 & d
+	negl	%eax               C n1
+
+	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
+	addl	%edi, %eax         C n2+n1
+	movq	(%ecx), %mm0       C next src limb and the one below it
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	subl	$4, %ecx
+
+	movl	%ecx, VAR_SRC
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	movl	%ebp, %eax	   C d
+	leal	1(%edi), %ebx      C n2+1
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+	jz	L(q1_ff)
+
+	mull	%ebx		   C (q1+1)*d
+
+	movl	VAR_DST, %ecx
+	psrlq	%mm7, %mm0
+
+	C
+
+	C
+
+	C
+
+	subl	%eax, %esi
+	movl	VAR_DST_STOP, %eax
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	movd	%mm0, %esi
+
+	sbbl	$0, %ebx	   C q
+	subl	$4, %ecx
+
+	movl	%ebx, (%ecx)
+	cmpl	%eax, %ecx
+
+	movl	%ecx, VAR_DST
+	jne	L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case.  This makes the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (src, dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+
+	movl	%esi, %eax
+	movl	%ebp, %ebx
+
+	sarl	$31, %eax          C -n1
+	movl	PARAM_SRC, %ecx
+
+	andl	%eax, %ebx         C -n1 & d
+	negl	%eax               C n1
+
+	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
+	addl	%edi, %eax         C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	movd	(%ecx), %mm0	   C src low limb
+
+	movl	VAR_DST_STOP, %ecx
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx
+
+	mull	%ebx		   C (q1+1)*d
+
+	psllq	$32, %mm0
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	C
+
+	subl	%eax, %esi
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+	movd	%mm0, %esi
+
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+	C eax	scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	scratch (dst)
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+	C
+	C mm7	rshift
+
+
+	movl	%esi, %eax
+	movl	%ebp, %ebx
+
+	sarl	$31, %eax          C -n1
+	movl	VAR_DST_STOP, %ecx
+
+	andl	%eax, %ebx         C -n1 & d
+	negl	%eax               C n1
+
+	addl	%esi, %ebx         C nadj = n10 + (-n1 & d), ignoring overflow
+	addl	%edi, %eax         C n2+n1
+
+	mull	VAR_INVERSE        C m*(n2+n1)
+
+	C
+
+	C
+
+	C
+
+	addl	%ebx, %eax         C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx      C n2+1
+	movl	%ebp, %eax	   C d
+
+	C
+
+	adcl	%edx, %ebx         C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+	sbbl	$0, %ebx           C q1 if q1+1 overflowed
+
+	mull	%ebx
+
+	C
+
+	C
+
+	C
+
+	C
+
+	subl	%eax, %esi
+	movl	PARAM_XSIZE, %eax
+
+	sbbl	%edx, %edi	   C n - (q1+1)*d
+	movl	%esi, %edi	   C remainder -> n2
+	leal	(%ebp,%esi), %edx
+
+	cmovc(	%edx, %edi)	   C n - q1*d if underflow from using q1+1
+
+	sbbl	$0, %ebx	   C q
+
+	movl	%ebx, -8(%ecx)
+	subl	$8, %ecx
+
+
+
+	orl	%eax, %eax         C xsize
+	jnz	L(fraction_some)
+
+	movl	%edi, %eax
+L(fraction_done):
+	movl	VAR_NORM, %ecx
+L(zero_done):
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	shrl	%cl, %eax
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF, meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d.
+
+L(q1_ff):
+	C eax	(divisor)
+	C ebx	(q1+1 == 0)
+	C ecx
+	C edx
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+
+	movl	VAR_DST, %ecx
+	movl	VAR_DST_STOP, %edx
+	subl	$4, %ecx
+
+	movl	%ecx, VAR_DST
+	psrlq	%mm7, %mm0
+	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
+
+	movl	$-1, (%ecx)
+	movd	%mm0, %esi		C next n10
+
+	cmpl	%ecx, %edx
+	jne	L(integer_top)
+
+	jmp	L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C		       uops
+C		mul	5
+C		q1+1	1   (addl)
+C		mul	5
+C		sub	3   (negl/sbbl)
+C		addback	2   (cmov)
+C		       ---
+C		       16
+C
+C The loop in fact runs at about 17.5 cycles.  Using a sarl/andl/addl for
+C the addback was found to be a touch slower.
+
+
+	ALIGN(16)
+L(fraction_some):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi
+	C edi	carry
+	C ebp	divisor
+
+	movl	PARAM_DST, %esi
+	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
+	movl	%edi, %eax
+
+	subl	$8, %ecx		C &dst[xsize]
+
+
+	ALIGN(16)
+L(fraction_top):
+	C eax	n2, then scratch
+	C ebx	scratch (nadj, q1)
+	C ecx	dst, decrementing
+	C edx	scratch
+	C esi	dst stop point
+	C edi	n2
+	C ebp	divisor
+
+	mull	VAR_INVERSE	C m*n2
+
+	movl	%ebp, %eax	C d
+	subl	$4, %ecx	C dst
+	leal	1(%edi), %ebx
+
+	C
+
+	C
+
+	C
+
+	addl	%edx, %ebx	C 1 + high(n2<<32 + m*n2) = q1+1
+
+	mull	%ebx		C (q1+1)*d
+
+	C
+
+	C
+
+	C
+
+	C
+
+	negl	%eax		C low of n - (q1+1)*d
+
+	sbbl	%edx, %edi	C high of n - (q1+1)*d, caring only about carry
+	leal	(%ebp,%eax), %edx
+
+	cmovc(	%edx, %eax)	C n - q1*d if underflow from using q1+1
+
+	sbbl	$0, %ebx	C q
+	movl	%eax, %edi	C remainder->n2
+	cmpl	%esi, %ecx
+
+	movl	%ebx, (%ecx)	C previous q
+	jne	L(fraction_top)
+
+
+	jmp	L(fraction_done)
+
+EPILOGUE()
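
The normalized inverse set up before L(start_preinv) is the usual
division-by-invariant-integers value: floor((B^2 - 1)/d) - B for a
normalized d (high bit set), which the divl above computes as
floor((B*(B-d) - 1)/d).  A one-line C check of the same value with 64-bit
arithmetic, B = 2^32 (sketch only):

    typedef unsigned int mp_limb_t;

    /* Precomputed inverse for a normalized 32-bit divisor d:
       floor (((B - d) * B - 1) / d) == floor ((B*B - 1) / d) - B. */
    static mp_limb_t
    ref_invert_limb (mp_limb_t d)      /* d with its top bit set */
    {
      unsigned long long B = 1ULL << 32;
      return (mp_limb_t) (((B - d) * B - 1) / d);
    }
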
diff --git a/third_party/gmp/mpn/x86/p6/mmx/gmp-mparam.h b/third_party/gmp/mpn/x86/p6/mmx/gmp-mparam.h
new file mode 100644
index 0000000..ef29061
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+   value in mpn/x86/p6/gmp-mparam.h.  The latter is used as a hard limit in
+   mpn/x86/p6/sqr_basecase.asm.  */
+
+
+/* 800 MHz P6 model 8 */
+/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */
+
+#define MOD_1_1P_METHOD                      2
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        30
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     14
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           62
+
+#define DIV_1_VS_MUL_1_PERCENT             168
+
+#define MUL_TOOM22_THRESHOLD                22
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               254
+#define MUL_TOOM8H_THRESHOLD               381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     100
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30	/* WRONG value, see comment above */
+#define SQR_TOOM3_THRESHOLD                 83
+#define SQR_TOOM4_THRESHOLD                196
+#define SQR_TOOM6_THRESHOLD                214
+#define SQR_TOOM8_THRESHOLD                381
+
+#define MULMID_TOOM42_THRESHOLD             56
+
+#define MULMOD_BNM1_THRESHOLD               16
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    476, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95, 9}, {     55,10}, {     31, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    167,10}, {     95, 9}, {    199,10}, \
+    {    111,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511,10}, {    143, 9}, {    287, 8}, {    575,10}, \
+    {    159,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543, 8}, {   1087,10}, \
+    {    287, 9}, {    575,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415, 9}, {    831,11}, \
+    {    223,10}, {    447,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    703,10}, {   1407,11}, {    735,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    959,10}, {   1919,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1727,12}, {    959,11}, \
+    {   1919,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2559,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3327,13}, {   1919,12}, {   3839,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD                 7040
+
+#define SQR_FFT_MODF_THRESHOLD             376  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    376, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     24, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255, 9}, {    135,10}, {     79, 9}, {    167,10}, \
+    {     95, 9}, {    191, 8}, {    383,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511, 9}, \
+    {    271,10}, {    143, 9}, {    287, 8}, {    575, 9}, \
+    {    303, 8}, {    607,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415, 9}, {    831,11}, {    223,10}, \
+    {    479,12}, {    127,11}, {    255,10}, {    543, 9}, \
+    {   1087,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    703,10}, \
+    {   1407,11}, {    735,12}, {    383,11}, {    831,12}, \
+    {    447,11}, {    959,10}, {   1919,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,13}, {    895,12}, \
+    {   1919,11}, {   3839,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3839,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 177
+#define SQRLO_SQR_THRESHOLD               8937
+
+#define DC_DIV_QR_THRESHOLD                 80
+#define DC_DIVAPPR_Q_THRESHOLD             240
+#define DC_BDIV_QR_THRESHOLD                76
+#define DC_BDIV_Q_THRESHOLD                166
+
+#define INV_MULMOD_BNM1_THRESHOLD           42
+#define INV_NEWTON_THRESHOLD               262
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              272
+#define REDC_1_TO_REDC_N_THRESHOLD          72
+
+#define MU_DIV_QR_THRESHOLD               1499
+#define MU_DIVAPPR_Q_THRESHOLD            1470
+#define MUPI_DIV_QR_THRESHOLD              124
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1341
+
+#define POWM_SEC_TABLE  1,16,96,416,1259
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        27
+#define SET_STR_DC_THRESHOLD               270
+#define SET_STR_PRECOMPUTE_THRESHOLD      1084
+
+#define FAC_DSC_THRESHOLD                  194
+#define FAC_ODD_THRESHOLD                   25
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     124
+#define HGCD_APPR_THRESHOLD                152
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   474
+#define GCDEXT_DC_THRESHOLD                321
+#define JACOBI_BASE_METHOD                   1
diff --git a/third_party/gmp/mpn/x86/p6/mmx/lshift.asm b/third_party/gmp/mpn/x86/p6/mmx/lshift.asm
new file mode 100644
index 0000000..febd1c0
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/lshift.asm
@@ -0,0 +1,38 @@
+dnl  Intel Pentium-II mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The P55 code runs well on P-II/III, but could probably stand some
+dnl  minor tweaks at some stage.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/third_party/gmp/mpn/x86/p6/mmx/popham.asm b/third_party/gmp/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000..fd340e4
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,39 @@
+dnl  Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl  Hamming distance.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx)
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/p6/mmx/rshift.asm b/third_party/gmp/mpn/x86/p6/mmx/rshift.asm
new file mode 100644
index 0000000..77aa190
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mmx/rshift.asm
@@ -0,0 +1,38 @@
+dnl  Intel Pentium-II mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The P55 code runs well on P-II/III, but could probably stand some
+dnl  minor tweaks at some stage.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/third_party/gmp/mpn/x86/p6/mod_34lsub1.asm b/third_party/gmp/mpn/x86/p6/mod_34lsub1.asm
new file mode 100644
index 0000000..b88ab5d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl  Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 2.0 cycles/limb
+
+C TODO
+C  Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
+C  with the current carry handling scheme.
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Limbs are handled in groups of three, with carry bits propagating from
+C 0mod3 into 1mod3 into 2mod3, and from there into a separate carries
+C total so the carry flag isn't kept live across the loop control.
+C Avoiding decl lets us get to 2.0 c/l, as compared to the generic x86
+C code at 3.66.
+C
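+C The congruence exploited is 2^32 == 2^8 (mod 2^24-1), so limb i carries
+C weight 2^(8*(i mod 3)).  A loose C sketch of the reduction (the idea
+C only, not the register-level scheme; overflow ignored for huge sizes):
+C
+C	uint64_t acc = 0;
+C	for (mp_size_t i = 0; i < size; i++)
+C	  acc += (uint64_t) src[i] << (8 * (i % 3));
+C	/* acc == the input (mod 2^24-1); callers finish the reduction */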
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx		C size-2
+	movl	(%edx), %eax		C src[0]
+	ja	L(three_or_more)
+	jb	L(one)
+
+	C size==2
+
+	movl	4(%edx), %ecx		C src[1]
+
+	movl	%eax, %edx		C src[0]
+	shrl	$24, %eax		C src[0] high
+
+	andl	$0xFFFFFF, %edx		C src[0] low
+
+	addl	%edx, %eax
+	movl	%ecx, %edx		C src[1]
+	shrl	$16, %ecx		C src[1] high
+
+	andl	$0xFFFF, %edx
+	addl	%ecx, %eax
+
+	shll	$8, %edx		C src[1] low
+
+	addl	%edx, %eax
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax	src[0], initial acc 0mod3
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	movl	%ebx, SAVE_EBX
+	movl	4(%edx), %ebx		C src[1], initial 1mod3
+	subl	$3, %ecx		C size-5
+
+	movl	%esi, SAVE_ESI
+	movl	8(%edx), %esi		C src[2], initial 2mod3
+
+	pushl	%edi	FRAME_pushl()
+	movl	$0, %edi		C initial carries 0mod3
+	jng	L(done)			C if size < 6
+
+
+L(top):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx	counter, limbs
+	C edx	src
+	C esi	acc 2mod3
+	C edi	carries into 0mod3
+	C ebp
+
+	addl	12(%edx), %eax
+	adcl	16(%edx), %ebx
+	adcl	20(%edx), %esi
+	leal	12(%edx), %edx
+	adcl	$0, %edi
+
+	subl	$3, %ecx
+	jg	L(top)			C at least 3 more to process
+
+
+L(done):
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
+	cmpl	$-1, %ecx
+	jl	L(done_0)		C if -2, meaning 0 more limbs
+
+	C 1 or 2 more limbs
+	movl	$0, %ecx
+	je	L(done_1)		C if -1, meaning 1 more limb only
+	movl	16(%edx), %ecx
+L(done_1):
+	addl	12(%edx), %eax		C 0mod3
+	adcl	%ecx, %ebx		C 1mod3
+	adcl	$0, %esi		C 2mod3
+	adcl	$0, %edi		C carries 0mod3
+
+L(done_0):
+	C eax	acc 0mod3
+	C ebx	acc 1mod3
+	C ecx
+	C edx
+	C esi	acc 2mod3
+	C edi	carries 0mod3
+	C ebp
+
+	movl	%eax, %ecx		C 0mod3
+	shrl	$24, %eax		C 0mod3 high initial total
+
+	andl	$0xFFFFFF, %ecx		C 0mod3 low
+	movl	%edi, %edx		C carries
+	shrl	$24, %edi		C carries high
+
+	addl	%ecx, %eax		C add 0mod3 low
+	andl	$0xFFFFFF, %edx		C carries 0mod3 low
+	movl	%ebx, %ecx		C 1mod3
+
+	shrl	$16, %ebx		C 1mod3 high
+	addl	%edi, %eax		C add carries high
+	addl	%edx, %eax		C add carries 0mod3 low
+
+	andl	$0xFFFF, %ecx		C 1mod3 low mask
+	addl	%ebx, %eax		C add 1mod3 high
+	movl	SAVE_EBX, %ebx
+
+	shll	$8, %ecx		C 1mod3 low
+	movl	%esi, %edx		C 2mod3
+	popl	%edi	FRAME_popl()
+
+	shrl	$8, %esi		C 2mod3 high
+	andl	$0xFF, %edx		C 2mod3 low mask
+	addl	%ecx, %eax		C add 1mod3 low
+
+	shll	$16, %edx		C 2mod3 low
+	addl	%esi, %eax		C add 2mod3 high
+	movl	SAVE_ESI, %esi
+
+	addl	%edx, %eax		C add 2mod3 low
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/p6/mode1o.asm b/third_party/gmp/mpn/x86/p6/mode1o.asm
new file mode 100644
index 0000000..7083195
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mode1o.asm
@@ -0,0 +1,170 @@
+dnl  Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C It's not worth skipping a step at the end when high<divisor since the main
+C loop is only 10 cycles.
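+C
+C The divisor's inverse mod 2^32 is built below from an 8-bit table lookup
+C followed by two Newton steps, each of which doubles the number of
+C correct low bits.  In C the same steps look roughly like this (a sketch):
+C
+C	mp_limb_t inv = binvert_limb_table[(divisor >> 1) & 127]; /* 8 bits */
+C	inv = 2*inv - inv*inv*divisor;  /* 16 correct bits */
+C	inv = 2*inv - inv*inv*divisor;  /* 32 bits: divisor*inv == 1 mod 2^32 */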
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+dnl  Not enough room under modexact_1 to make these re-use the parameter
+dnl  space, unfortunately.
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+deflit(STACK_SPACE, 12)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+L(start_1c):
+	movl	PARAM_DIVISOR, %eax
+
+	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	shrl	%eax			C d/2
+	movl	%edi, SAVE_EDI
+
+	andl	$127, %eax
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edi)
+	movzbl	(%eax,%edi), %edi		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %edi	C inv 8 bits
+')
+
+	xorl	%edx, %edx		C initial extra carry
+	leal	(%edi,%edi), %eax	C 2*inv
+
+	imull	%edi, %edi		C inv*inv
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_SIZE, %ebx
+
+	imull	PARAM_DIVISOR, %edi	C inv*inv*d
+
+	subl	%edi, %eax		C inv = 2*inv - inv*inv*d
+	leal	(%eax,%eax), %edi	C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	negl	%ebx			C -size
+
+	subl	%eax, %edi		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	PARAM_DIVISOR, %eax
+	imull	%edi, %eax
+	cmpl	$1, %eax')
+
+
+C The dependent chain here is
+C
+C	subl	%edx, %eax       1
+C	imull	%edi, %eax       4
+C	mull	PARAM_DIVISOR    5
+C			       ----
+C	total			10
+C
+C and this is the measured speed.  No special scheduling is necessary;
+C out-of-order execution hides the load latency.
+
+L(top):
+	C eax	scratch (src limb)
+	C ebx	counter, limbs, negative
+	C ecx	carry bit, 0 or 1
+	C edx	carry limb, high of last product
+	C esi	&src[size]
+	C edi	inverse
+	C ebp
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ecx, %eax
+
+	sbbl	%ecx, %ecx
+	subl	%edx, %eax
+
+	sbbl	$0, %ecx
+
+	imull	%edi, %eax
+
+	negl	%ecx
+
+	mull	PARAM_DIVISOR
+
+	incl	%ebx
+	jnz	L(top)
+
+
+	movl	SAVE_ESI, %esi
+	leal	(%ecx,%edx), %eax
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	addl	$STACK_SPACE, %esp
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/p6/mul_basecase.asm b/third_party/gmp/mpn/x86/p6/mul_basecase.asm
new file mode 100644
index 0000000..d87bc12
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/mul_basecase.asm
@@ -0,0 +1,607 @@
+dnl  Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
+
+
+dnl  P6 UNROLL_COUNT cycles/product (approx)
+dnl           8           7
+dnl          16           6.5
+dnl          32           6.4
+dnl  Maximum possible with the current code is 32.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.
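+C
+C In C the overall structure is simply (a sketch, using the public mpn
+C entrypoints; mpn/generic/mul_basecase.c does just this):
+C
+C	wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C	for (i = 1; i < ysize; i++)
+C	  wp[xsize+i] = mpn_addmul_1 (wp+i, xp, xsize, yp[i]);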
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+
+	movl	(%eax), %eax		C yp[0]
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	mull	(%edx)
+
+	movl	PARAM_WP, %ecx
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_XSIZE')
+define(SAVE_ESI, `PARAM_YSIZE')
+
+	movl	%ebx, SAVE_EBX
+	cmpl	$1, PARAM_YSIZE
+	movl	%eax, %ecx		C yp[0]
+
+	movl	%esi, SAVE_ESI		C save esi
+	movl	PARAM_WP, %ebx
+	movl	%edx, %esi		C xp
+
+	movl	(%edx), %eax		C xp[0]
+	jne	L(two_by_two)
+
+
+	C two limbs by one limb
+	C
+	C eax	xp[0]
+	C ebx	wp
+	C ecx	yp[0]
+	C edx
+	C esi	xp
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+	movl	%edx, %esi		C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+
+	movl	%esi, 4(%ebx)
+	movl	SAVE_ESI, %esi
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	movl	SAVE_EBX, %ebx
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp[0]
+	C ebx	wp
+	C ecx	yp[0]
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+
+dnl  more parameter space re-use
+define(SAVE_EDI, `PARAM_WP')
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	movl	%edi, SAVE_EDI
+	movl	%edx, %edi	C carry, for wp[1]
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+	movl	4(%ecx), %ecx	C yp[1]
+
+	movl	%edi, 4(%ebx)
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+	movl	(%esi), %eax	C xp[0]
+
+	adcl	$0, %edx
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	movl	%esi, %eax
+
+	adcl	%edx, %edi
+	movl	SAVE_ESI, %esi
+
+	movl	%edi, 8(%ebx)
+
+	adcl	$0, %eax
+	movl	SAVE_EDI, %edi
+
+	movl	%eax, 12(%ebx)
+	movl	SAVE_EBX, %ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 loop running at
+C about 6.2 c/l.  Unrolling this doesn't seem worthwhile since it's only run
+C once (whereas the addmul_1 below is run ysize-1 times).  A call to
+C mpn_mul_1 would be slowed down by the parameter pushing and popping etc.,
+C and doesn't seem likely to be worthwhile on the typical sizes reaching
+C here from the Karatsuba code.
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+
+defframe(`SAVE_EBX',    -4)
+defframe(`SAVE_ESI',    -8)
+defframe(`SAVE_EDI',   -12)
+defframe(`SAVE_EBP',   -16)
+defframe(VAR_COUNTER,  -20)  dnl for use in the unroll case
+defframe(VAR_ADJUST,   -24)
+defframe(VAR_JMP,      -28)
+defframe(VAR_SWAP,     -32)
+defframe(VAR_XP_LOW,   -36)
+deflit(STACK_SPACE, 36)
+
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_WP, %edi
+
+	movl	%ebx, SAVE_EBX
+
+	movl	%ebp, SAVE_EBP
+	movl	%eax, %ebp
+
+	movl	%esi, SAVE_ESI
+	xorl	%ebx, %ebx
+	leal	(%edx,%ecx,4), %esi	C xp end
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+
+	movl	%ebx, (%edi)		C final carry
+	movl	PARAM_XSIZE, %ecx
+	decl	%edx
+
+	jz	L(done)			C if ysize==1
+
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+	C simple addmul looping
+	C
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	leal	4(%eax,%edx,4), %ebp	C yp end
+	negl	%ecx
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE	C -(ysize-1)
+	movl	(%esi,%ecx,4), %eax	C xp low limb
+	incl	%ecx
+
+	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
+	xorl	%ebx, %ebx		C initial carry
+
+	movl	%ebp, PARAM_YP
+	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
+	jmp	L(simple_outer_entry)
+
+
+L(simple_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	PARAM_YP, %edx
+
+	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
+	xorl	%ebx, %ebx		C carry
+
+	movl	%ebp, PARAM_YSIZE
+	addl	$4, %edi		C next position in wp
+
+	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
+
+	movl	-4(%esi,%ecx,4), %eax	C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner_top):
+	C eax	xp limb
+	C ebx	carry limb
+	C ecx	loop counter (negative)
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%eax, %ebx
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %edx
+
+	incl	%ecx
+	movl	%edx, %ebx
+	jnz	L(simple_inner_top)
+
+
+	C separate code for last limb so outer loop counter handling can be
+	C interleaved
+
+	mull	%ebp
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, 4(%edi)
+	jnz	L(simple_outer_top)
+
+
+L(done):
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1, see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp.  This is used to adjust xp and wp, and is rshifted to
+C give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
+C initial ebx and ecx on entry to the unrolling.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with the VAR_ADJUST value means it's only necessary to do one
+C fetch in the outer loop to take care of xp, wp and the inner loop counter.
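+C
+C The ebx/ecx swap driven by VAR_SWAP is the usual branch-free masked
+C swap, in C (a sketch, with VAR_SWAP as `mask'):
+C
+C	t = (carry_lo ^ carry_hi) & mask;   /* mask is 0 or ~0 */
+C	carry_lo ^= t;
+C	carry_hi ^= t;                      /* swapped iff mask is ~0 */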
+
+
+L(unroll):
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	movl	PARAM_XP, %esi
+
+	movl	4(%eax), %ebp		C multiplier (yp second limb)
+	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
+
+	movl	%eax, PARAM_YP
+	movl	PARAM_WP, %edi
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
+	decl	%ecx				C xsize-1
+
+	movl	(%esi), %eax		C xp low limb
+	andl	$-UNROLL_MASK-1, %ebx
+	negl	%ecx			C -(xsize-1)
+
+	negl	%ebx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ebx, VAR_ADJUST
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+	movl	%eax, VAR_XP_LOW
+	sarl	$UNROLL_LOG2, %ebx
+	negl	%edx
+
+	C 15 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_inner_entry) (%ecx,%edx,1), %ecx
+')
+
+	movl	%ecx, VAR_JMP
+	movl	%edx, %ecx
+	shll	$31, %edx
+
+	sarl	$31, %edx		C 0 or -1 as xsize odd or even
+	leal	4(%edi,%ecx,4), %edi	C wp and xp, adjust for unrolling,
+	leal	4(%esi,%ecx,4), %esi	C  and start at second limb
+
+	movl	%edx, VAR_SWAP
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%edx,1), %ecx
+	addl	$L(unroll_inner_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi	xp + offset
+	C edi	wp + offset
+	C ebp	ysize counter, negative
+
+	movl	VAR_ADJUST, %ebx
+	movl	PARAM_YP, %edx
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
+
+	leal	eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
+	leal	(%esi,%ebx,4), %esi
+	sarl	$UNROLL_LOG2, %ebx
+
+	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	movl	%ebx, VAR_COUNTER
+	movl	%edx, %ebx		C carry high
+	movl	%eax, %ecx		C carry low
+
+	xorl	%edx, %eax
+	movl	VAR_JMP, %edx
+
+	andl	VAR_SWAP, %eax
+
+	xorl	%eax, %ebx		C carries other way for odd index
+	xorl	%eax, %ecx
+
+	jmp	*%edx
+
+
+C -----------------------------------------------------------------------------
+
+L(unroll_inner_top):
+	C eax	xp limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	xp+8
+	C edi	wp
+	C ebp	yp multiplier limb
+	C
+	C VAR_COUNTER  loop counter, negative
+	C
+	C 15 bytes each limb
+
+	addl	$UNROLL_BYTES, %edi
+
+L(unroll_inner_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	adcl	%eax, %ebx		C new carry low
+	movl	%edx, %ecx
+	adcl	$0, %ecx		C new carry high
+
+	movl	disp1(%esi), %eax
+	mull	%ebp
+	addl	%ebx, disp1(%edi)
+	adcl	%eax, %ecx		C new carry low
+	movl	%edx, %ebx
+	adcl	$0, %ebx		C new carry high
+')
+
+
+	incl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+	jnz	L(unroll_inner_top)
+
+
+	C eax
+	C ebx	carry high
+	C ecx	carry low
+	C edx
+	C esi
+	C edi	wp, pointing at the second last limb
+	C ebp
+
+deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+deflit(`disp1', eval(disp0 + 4))
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%ecx, disp0(%edi)	C carry low
+
+	adcl	$0, %ebx
+	incl	%ebp
+
+	movl	%ebx, disp1(%edi)	C carry high
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/p6/p3mmx/popham.asm b/third_party/gmp/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000..db2f260
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,42 @@
+dnl  Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl  Hamming distance.
+
+dnl  Copyright 2000, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			     popcount	     hamdist
+C P3 generic			6.5		7
+C P3 model 9  (Banias)		?		?
+C P3 model 13 (Dothan)		5.75		6
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sqr_basecase.asm b/third_party/gmp/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000..8fc7fdf
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,649 @@
+dnl  Intel P6 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+C     product (measured on the speed difference between 20 and 40 limbs,
+C     which is the Karatsuba recursing range).
+
+
+dnl  These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl  a description.  The only difference here is that UNROLL_COUNT can go up
+dnl  to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache.  The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will at least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
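+C
+C In C the triangular scheme is roughly (a sketch; the generic version
+C is mpn/generic/sqr_basecase.c):
+C
+C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	for (i = 1; i < size-1; i++)
+C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-1-i, src[i]);
+C	mpn_lshift (dst+1, dst+1, 2*size-2, 1);   /* double the triangle */
+C	/* then add src[i]^2 at dst[2*i] for each i, propagating carries */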
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+	je	L(two_limbs)
+
+	movl	(%eax), %eax
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src limb
+	C ebx
+	C ecx	dst
+	C edx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+	subl	$STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+	movl	%esi, SAVE_ESI
+	movl	%eax, %esi
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	4(%esi), %eax
+
+	movl	%ebx, SAVE_EBX
+	movl	%edx, %ebx	C dst[1]
+
+	mull	%eax		C src[1]^2
+
+	movl	%edi, SAVE_EDI
+	movl	%eax, %edi	C dst[2]
+	movl	(%esi), %eax
+
+	movl	%ebp, SAVE_EBP
+	movl	%edx, %ebp	C dst[3]
+
+	mull	4(%esi)		C src[0]*src[1]
+
+	addl	%eax, %ebx
+	movl	SAVE_ESI, %esi
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%ebx, %eax
+	movl	SAVE_EBX, %ebx
+
+	adcl	%edi, %edx
+	movl	SAVE_EDI, %edi
+
+	adcl	$0, %ebp
+
+	movl	%eax, 4(%ecx)
+
+	movl	%ebp, 12(%ecx)
+	movl	SAVE_EBP, %ebp
+
+	movl	%edx, 8(%ecx)
+	addl	$FRAME, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx	size
+deflit(`FRAME',0)
+
+	pushl	%esi	defframe_pushl(`SAVE_ESI')
+	cmpl	$4, %edx
+
+	movl	PARAM_SRC, %esi
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%ebp	defframe_pushl(`SAVE_EBP')
+	pushl	%edi	defframe_pushl(`SAVE_EDI')
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	movl	4(%esi), %eax
+	xorl	%ebp, %ebp
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	%edx, 12(%ecx)
+	movl	8(%esi), %eax
+
+	pushl	%ebx	defframe_pushl(`SAVE_EBX')
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	%edx, 20(%ecx)
+
+	movl	(%esi), %eax
+
+	mull	4(%esi)		C src[0] * src[1]
+
+	movl	%eax, %ebx
+	movl	%edx, %edi
+
+	movl	(%esi), %eax
+
+	mull	8(%esi)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%edx, %ebp
+
+	adcl	$0, %ebp
+	movl	4(%esi), %eax
+
+	mull	8(%esi)		C src[1] * src[2]
+
+	xorl	%esi, %esi
+	addl	%eax, %ebp
+
+	C eax
+	C ebx	dst[1]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	zero, will be dst[5]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%ebx, %ebx
+
+	adcl	%edi, %edi
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+	movl	4(%ecx), %eax
+
+	adcl	$0, %esi
+	addl	%ebx, %eax
+
+	movl	%eax, 4(%ecx)
+	movl	8(%ecx), %eax
+
+	adcl	%edi, %eax
+	movl	12(%ecx), %ebx
+
+	adcl	%ebp, %ebx
+	movl	16(%ecx), %edi
+
+	movl	%eax, 8(%ecx)
+	movl	SAVE_EBP, %ebp
+
+	movl	%ebx, 12(%ecx)
+	movl	SAVE_EBX, %ebx
+
+	adcl	%edx, %edi
+	movl	20(%ecx), %eax
+
+	movl	%edi, 16(%ecx)
+	movl	SAVE_EDI, %edi
+
+	adcl	%esi, %eax	C no carry out of this
+	movl	SAVE_ESI, %esi
+
+	movl	%eax, 20(%ecx)
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP,    -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx
+	C edx	size
+	C esi	src
+	C edi
+	C ebp
+deflit(`FRAME',4)  dnl  %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+	subl	$STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+	movl	$1, %ecx
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+	subl	%edx, %ecx		C -(size-1)
+
+	movl	%ebp, SAVE_EBP
+	movl	$0, %ebx		C initial carry
+
+	leal	(%esi,%edx,4), %esi	C &src[size]
+	movl	%eax, %ebp		C multiplier
+
+	leal	-4(%edi,%edx,4), %edi	C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, limbs, negative, -(size-1) to -1
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size-1]
+	C ebp	multiplier
+
+	movl	%ebp, %eax
+
+	mull	(%esi,%ecx,4)
+
+	addl	%ebx, %eax
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	movl	%eax, 4(%edi,%ecx,4)
+
+	incl	%ecx
+	jnz	L(mul_1)
+
+
+	movl	%ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
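+C
+C The computed jump plays the same role as a Duff's device in C (a sketch
+C of the idea only; ADDMUL_CHUNK is illustrative):
+C
+C	switch (entry)              /* derived from the operand size */
+C	  {
+C	    do {
+C	      case 0: ADDMUL_CHUNK (0);
+C	      case 1: ADDMUL_CHUNK (1);
+C	      /* ...one case per unrolled chunk... */
+C	    } while (--counter);
+C	  }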
+
+dnl  This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl  With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl  unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl  that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+	C eax
+	C ebx	carry
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size-1]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+
+	subl	$4, %ecx
+	jz	L(corner)
+
+	movl	%ecx, %edx
+	negl	%ecx
+
+	shll	$4, %ecx
+ifelse(OFFSET,0,,`subl	$OFFSET, %esi')
+
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(here):
+',`
+	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+	negl	%edx
+
+ifelse(OFFSET,0,,`subl	$OFFSET, %edi')
+
+	C The calculated jump mustn't be before the start of the available
+	C code.  This is the limit that UNROLL_COUNT puts on the src operand
+	C size, but checked here using the jump address directly.
+
+	ASSERT(ae,
+	`movl_text_address( L(unroll_inner_start), %eax)
+	cmpl	%eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(unroll_outer_top):
+	C eax
+	C ebx	high limb to store
+	C ecx	VAR_JMP
+	C edx	VAR_COUNTER, limbs, negative
+	C esi	&src[size], constant
+	C edi	dst ptr, second highest limb of last addmul
+	C ebp
+
+	movl	-12+OFFSET(%esi,%edx,4), %ebp	C multiplier
+	movl	%edx, VAR_COUNTER
+
+	movl	-8+OFFSET(%esi,%edx,4), %eax	C first limb of multiplicand
+
+	mull	%ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+	testb	$1, %cl
+
+	movl	%edx, %ebx	C high carry
+	leal	4(%edi), %edi
+
+	movl	%ecx, %edx	C jump
+
+	movl	%eax, %ecx	C low carry
+	leal	CODE_BYTES_PER_LIMB(%edx), %edx
+
+	cmovX(	%ebx, %ecx)	C high carry reverse
+	cmovX(	%eax, %ebx)	C low carry reverse
+	movl	%edx, VAR_JMP
+	jmp	*%edx
+
+
+	C Must be on an even address here so the low bit of the jump address
+	C will indicate which way around ecx/ebx should start.
+
+	ALIGN(2)
+
+L(unroll_inner_start):
+	C eax	scratch
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	src pointer
+	C edi	dst pointer
+	C ebp	multiplier
+	C
+	C 15 code bytes each limb
+	C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+	deflit(`disp_src', eval(-i*4 + OFFSET))
+	deflit(`disp_dst', eval(disp_src))
+
+	m4_assert(`disp_src>=-128 && disp_src<128')
+	m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp(	movl,	disp_src,(%esi), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ebx, disp_dst,(%edi))
+	adcl	%eax, %ecx
+	movl	%edx, %ebx
+	adcl	$0, %ebx
+',`
+	dnl  this one comes out last
+Zdisp(	movl,	disp_src,(%esi), %eax)
+	mull	%ebp
+Zdisp(	addl,	%ecx, disp_dst,(%edi))
+	adcl	%eax, %ebx
+	movl	%edx, %ecx
+	adcl	$0, %ecx
+')
+')
+L(unroll_inner_end):
+
+	addl	%ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+	movl	VAR_COUNTER, %edx
+	adcl	$0, %ecx
+
+	movl	%ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+	movl	VAR_JMP, %ecx
+
+	incl	%edx
+	jnz	L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+	addl	$OFFSET, %esi
+	addl	$OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(corner):
+	C eax
+	C ebx
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[2*size-5]
+	C ebp
+
+	movl	-12(%esi), %eax
+
+	mull	-8(%esi)
+
+	addl	%eax, (%edi)
+	movl	-12(%esi), %eax
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+
+	mull	-4(%esi)
+
+	addl	%eax, %ebx
+	movl	-8(%esi), %eax
+
+	adcl	$0, %edx
+
+	addl	%ebx, 4(%edi)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+
+	mull	-4(%esi)
+
+	movl	PARAM_SIZE, %ecx
+	addl	%ebx, %eax
+
+	adcl	$0, %edx
+
+	movl	%eax, 8(%edi)
+
+	movl	%edx, 12(%edi)
+	movl	PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+
+	subl	$1, %ecx		C size-1
+	xorl	%eax, %eax		C ready for final adcl, and clear carry
+
+	movl	%ecx, %edx
+	movl	PARAM_SRC, %esi
+
+
+L(lshift):
+	C eax
+	C ebx
+	C ecx	counter, size-1 to 1
+	C edx	size-1 (for later use)
+	C esi	src (for later use)
+	C edi	dst, incrementing
+	C ebp
+
+	rcll	4(%edi)
+	rcll	8(%edi)
+
+	leal	8(%edi), %edi
+	decl	%ecx
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax
+
+	movl	%eax, 4(%edi)		C dst most significant limb
+	movl	(%esi), %eax		C src[0]
+
+	leal	4(%esi,%edx,4), %esi	C &src[size]
+	subl	%edx, %ecx		C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+	mull	%eax
+
+	movl	%eax, (%edi,%ecx,8)	C dst[0]
+
+
+L(diag):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, negative
+	C edx	carry
+	C esi	&src[size]
+	C edi	dst[2*size-2]
+	C ebp
+
+	movl	(%esi,%ecx,4), %eax
+	movl	%edx, %ebx
+
+	mull	%eax
+
+	addl	%ebx, 4(%edi,%ecx,8)
+	adcl	%eax, 8(%edi,%ecx,8)
+	adcl	$0, %edx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+
+	addl	%edx, 4(%edi)		C dst most significant limb
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+	addl	(%esp), %ecx
+	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+	addl	%edx, %ecx
+	ret_internal
+')
+
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/p6/sse2/addmul_1.asm b/third_party/gmp/mpn/x86/p6/sse2/addmul_1.asm
new file mode 100644
index 0000000..144b627
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/addmul_1.asm
@@ -0,0 +1,37 @@
+dnl  Intel P6/SSE2 mpn_addmul_1.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Write P6 specific SSE2 code.
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`x86/pentium4/sse2/addmul_1.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/gmp-mparam.h b/third_party/gmp/mpn/x86/p6/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a1e261b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/gmp-mparam.h
@@ -0,0 +1,200 @@
+/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+   value in mpn/x86/p6/gmp-mparam.h.  The latter is used as a hard limit in
+   mpn/x86/p6/sqr_basecase.asm.  */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD                 4
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                77
+#define MUL_TOOM44_THRESHOLD               169
+#define MUL_TOOM6H_THRESHOLD               246
+#define MUL_TOOM8H_THRESHOLD               381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     106
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                154
+#define SQR_TOOM6_THRESHOLD                222
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             690  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    565, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 5}, \
+    {    383, 4}, {    991, 5}, {    511, 6}, {    267, 7}, \
+    {    157, 8}, {     91, 9}, {     47, 8}, {    111, 9}, \
+    {     63, 8}, {    127, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    143, 9}, {    287,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,13}, {    383,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD                 7424
+
+#define SQR_FFT_MODF_THRESHOLD             565  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    472, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     27, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 9}, {     15, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     63, 4}, {   1023, 8}, {     67, 9}, \
+    {     39, 5}, {    639, 4}, {   1471, 6}, {    383, 7}, \
+    {    209, 8}, {    119, 9}, {     63, 7}, {    255, 8}, \
+    {    139, 9}, {     71, 8}, {    143, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159, 8}, {    319, 9}, \
+    {    167,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    543, 8}, \
+    {   1087,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    703,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    607, 9}, \
+    {   1215,11}, {    319,10}, {    671, 9}, {   1343,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    831,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,12}, \
+    {    319,11}, {    671,10}, {   1343,11}, {    735,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    959,13}, {    255,12}, {    511,11}, \
+    {   1087,12}, {    575,11}, {   1215,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,13}, \
+    {    639,12}, {   1471,13}, {    767,12}, {   1727,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD                 5760
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  31
+#define MULLO_MUL_N_THRESHOLD            13463
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 100
+#define SQRLO_SQR_THRESHOLD               9236
+
+#define DC_DIV_QR_THRESHOLD                 25
+#define DC_DIVAPPR_Q_THRESHOLD              55
+#define DC_BDIV_QR_THRESHOLD                60
+#define DC_BDIV_Q_THRESHOLD                132
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD                65
+#define INV_APPR_THRESHOLD                  65
+
+#define BINV_NEWTON_THRESHOLD              252
+#define REDC_1_TO_REDC_N_THRESHOLD          62
+
+#define MU_DIV_QR_THRESHOLD               1164
+#define MU_DIVAPPR_Q_THRESHOLD             748
+#define MUPI_DIV_QR_THRESHOLD               38
+#define MU_BDIV_QR_THRESHOLD              1360
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  2,23,258,879,2246
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               582
+#define SET_STR_PRECOMPUTE_THRESHOLD      1118
+
+#define FAC_DSC_THRESHOLD                  178
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD_THRESHOLD                      69
+#define HGCD_APPR_THRESHOLD                112
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   386
+#define GCDEXT_DC_THRESHOLD                303
+#define JACOBI_BASE_METHOD                   1
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mod_1_1.asm b/third_party/gmp/mpn/x86/p6/sse2/mod_1_1.asm
new file mode 100644
index 0000000..8b7b7ad
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl  Intel P6/SSE2 mpn_mod_1_1.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mod_1_4.asm b/third_party/gmp/mpn/x86/p6/sse2/mod_1_4.asm
new file mode 100644
index 0000000..49c96c6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl  Intel P6/SSE2 mpn_mod_1_4.
+
+dnl  Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mul_1.asm b/third_party/gmp/mpn/x86/p6/sse2/mul_1.asm
new file mode 100644
index 0000000..50e5b69
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mul_1.asm
@@ -0,0 +1,38 @@
+dnl  Intel P6/SSE2 mpn_mul_1.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Write P6 specific SSE2 code.  It should reach 3 c/l.
+C    The Pentium4 code runs at 4.2 c/l.
+
+MULFUNC_PROLOGUE(mpn_mul_1)
+include_mpn(`x86/pentium4/sse2/mul_1.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/mul_basecase.asm b/third_party/gmp/mpn/x86/p6/sse2/mul_basecase.asm
new file mode 100644
index 0000000..4687625
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/mul_basecase.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_mul_basecase.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86/pentium4/sse2/mul_basecase.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/popcount.asm b/third_party/gmp/mpn/x86/p6/sse2/popcount.asm
new file mode 100644
index 0000000..4c02b93
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_popcount -- population count.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/sqr_basecase.asm b/third_party/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..76b574b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/sqr_basecase.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_sqr_basecase.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86/pentium4/sse2/sqr_basecase.asm')
diff --git a/third_party/gmp/mpn/x86/p6/sse2/submul_1.asm b/third_party/gmp/mpn/x86/p6/sse2/submul_1.asm
new file mode 100644
index 0000000..69d940d
--- /dev/null
+++ b/third_party/gmp/mpn/x86/p6/sse2/submul_1.asm
@@ -0,0 +1,35 @@
+dnl  Intel P6/SSE2 mpn_submul_1.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_submul_1)
+include_mpn(`x86/pentium4/sse2/submul_1.asm')
diff --git a/third_party/gmp/mpn/x86/pentium/README b/third_party/gmp/mpn/x86/pentium/README
new file mode 100644
index 0000000..305936b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/README
@@ -0,0 +1,181 @@
+Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                   INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors.  The mmx subdirectory has additional code for Pentium with MMX
+(P55).
+
+
+STATUS
+
+                                cycles/limb
+
+	mpn_add_n/sub_n            2.375
+
+	mpn_mul_1                 12.0
+	mpn_add/submul_1          14.0
+
+	mpn_mul_basecase          14.2 cycles/crossproduct (approx)
+
+	mpn_sqr_basecase           8 cycles/crossproduct (approx)
+                                   or 15.5 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               5.375 normal (6.0 on P54)
+				   1.875 special shift by 1 bit
+
+	mpn_divrem_1              44.0
+	mpn_mod_1                 28.0
+	mpn_divexact_by3          15.0
+
+	mpn_copyi/copyd            1.0
+
+Pentium MMX gets the following improvements
+
+	mpn_l/rshift               1.75
+
+	mpn_mul_1                 12.0 normal, 7.0 for 16-bit multiplier
+
+
+mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb.  Due to loop
+overhead and other delays (cache refill?), they in practice run at the
+2.375 cycles/limb shown above.
+
+mpn_mul_1, mpn_addmul_1 and mpn_submul_1 all run 1 cycle faster than they
+should.  Intel documentation says a mul instruction takes 10 cycles, but it
+measures as 9, and the routines using it run accordingly.
+
+
+
+P55 MMX AND X87
+
+The cost of switching between MMX and x87 floating point on P55 is about 100
+cycles (fld1/por/emms for instance).  To avoid that cost the two aren't
+mixed, and currently that means using MMX and not x87.
+
+MMX offers a big speedup for lshift and rshift, and a nice speedup for
+16-bit multipliers in mpn_mul_1.  If fast code using x87 is found then
+perhaps the preference for MMX will be reversed.
+
+
+
+
+P54 SHLDL
+
+mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically.  The P55 runs them at the expected speed.
+
+It seems that on P54 a shldl or shrdl allows pairing in one following cycle,
+but not two.  For example, back to back repetitions of the following
+
+	shldl(	%cl, %eax, %ebx)
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+
+run at 5 cycles, as expected, but repetitions of the following run at 7
+cycles, whereas 6 would be expected (and is achieved on P55),
+
+	shldl(	%cl, %eax, %ebx)
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+	xorl	%edi, %edi
+	xorl	%ebp, %ebp
+
+Three xorls run at 7 cycles too, so it doesn't seem to be simply that
+pairing is inhibited only in the second following cycle (or something like
+that).
+
+Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a
+pattern of shift, 2 loads, shift, 2 stores, shift, etc.  A start has been
+made on something like that, but it's not yet complete.
+
+
+
+
+OTHER NOTES
+
+Prefetching Destinations
+
+    Pentium doesn't allocate cache lines on writes, unlike most other modern
+    processors.  Since the functions in the mpn class do array writes, we
+    have to handle allocating the destination cache lines by reading a word
+    from it in the loops, to achieve the best performance.
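+
+    The unrolled loops do this with a read at the top of each 8-limb
+    block, for example (from aors_n.asm)
+
+	movl	28(%edi), %eax		C fetch destination cache line
+	leal	32(%edi), %edi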
+
+Prefetching Sources
+
+    Prefetching of sources is pointless since there are no out-of-order
+    loads.  Any load instruction blocks until the line is brought to L1, so
+    it may as well be the load that wants the data which blocks.
+
+Data Cache Bank Clashes
+
+    Pairing of memory operations requires that the two issued operations
+    refer to different cache banks (ie. different addresses modulo 32
+    bytes).  The simplest way to ensure this is to read/write two words from
+    the same object.  If the two operations are on different objects, they
+    might or might not fall in the same cache bank.
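+
+    For instance, the copy loops read adjacent words of the source, which
+    always fall in different banks,
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx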
+
+PIC %eip Fetching
+
+    A simple call $+5 and popl can be used to get %eip; there's no need to
+    balance calls and returns since P5 doesn't have any return stack branch
+    prediction.
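+
+    The PIC code in bdiv_q_1.asm, for instance, does exactly this,
+
+	call	L(here)
+L(here):
+	popl	%ebp			C eip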
+
+Float Multiplies
+
+    fmul is pairable and can be issued every 2 cycles (with a 4 cycle
+    latency for data ready to use).  This is a lot better than integer mull
+    or imull at 9 cycles non-pairing.  Unfortunately the advantage is
+    quickly eaten away by needing to throw data through memory back to the
+    integer registers to adjust for fild and fist being signed, and to do
+    things like propagating carry bits.
+
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816.  This
+is mostly about P5; the parts about P6 aren't relevant.  Available on-line:
+
+        http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/pentium/aors_n.asm b/third_party/gmp/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000..01ebfb9
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,203 @@
+dnl  Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 2.375 cycles/limb
+
+
+ifdef(`OPERATION_add_n',`
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
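+C
+C The return value is the carry out of the top limb (the borrow, for
+C subtraction), 0 or 1; the _nc forms take a carry-in of 0 or 1.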
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%ebp
+	movl	PARAM_SIZE,%ecx
+
+	movl	(%ebp),%ebx
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		C zero carry flag
+	jz	L(endgo)
+
+	pushl	%edx
+FRAME_pushl()
+	movl	PARAM_CARRY,%eax
+	shrl	%eax			C shift bit 0 into carry
+	jmp	L(oop)
+
+L(endgo):
+deflit(`FRAME',16)
+	movl	PARAM_CARRY,%eax
+	shrl	%eax			C shift bit 0 into carry
+	jmp	L(end)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_n)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%ebp
+	movl	PARAM_SIZE,%ecx
+
+	movl	(%ebp),%ebx
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		C zero carry flag
+	jz	L(end)
+	pushl	%edx
+FRAME_pushl()
+
+	ALIGN(8)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	leal	32(%edi),%edi
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	4(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	8(%ebp),%ebx
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	12(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	16(%ebp),%ebx
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	20(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	24(%ebp),%ebx
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	28(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	32(%ebp),%ebx
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebp),%ebp
+	decl	%ecx
+	jnz	L(oop)
+
+	popl	%edx
+FRAME_popl()
+L(end):
+	decl	%edx			C test %edx w/o clobbering carry
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	M4_inst	%ebx,%eax
+	movl	4(%ebp),%ebx
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebp),%ebp
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax
+	M4_inst	%ebx,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/aorsmul_1.asm b/third_party/gmp/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000..d83cc45
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,144 @@
+dnl  Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+ifdef(`OPERATION_addmul_1', `
+      define(M4_inst,        addl)
+      define(M4_function_1,  mpn_addmul_1)
+      define(M4_function_1c, mpn_addmul_1c)
+
+',`ifdef(`OPERATION_submul_1', `
+      define(M4_inst,        subl)
+      define(M4_function_1,  mpn_submul_1)
+      define(M4_function_1c, mpn_submul_1c)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
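+C These compute {dst,size} += {src,size}*mult (or -= for submul),
+C returning the limb carried (or borrowed) out of the top; the _1c forms
+C take an initial carry-in limb.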
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(M4_function_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+	pushl	%esi		FRAME_pushl()
+
+L(start_1c):
+	movl	PARAM_SRC, %esi
+	movl	PARAM_SIZE, %eax
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	leal	-1(%eax), %ebx		C size-1
+
+	leal	(%esi,%eax,4), %esi
+	xorl	$-1, %ebx		C -size, and clear carry
+
+	leal	(%edi,%eax,4), %edi
+
+L(top):
+	C eax
+	C ebx	counter, negative
+	C ecx	carry
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp
+
+	adcl	$0, %ecx
+	movl	(%esi,%ebx,4), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%ecx, %eax
+	movl	(%edi,%ebx,4), %ecx
+
+	adcl	$0, %edx
+	M4_inst	%eax, %ecx
+
+	movl	%ecx, (%edi,%ebx,4)
+	incl	%ebx
+
+	movl	%edx, %ecx
+	jnz	L(top)
+
+
+	adcl	$0, %ecx
+	popl	%ebx
+
+	movl	%ecx, %eax
+	popl	%edi
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/bdiv_q_1.asm b/third_party/gmp/mpn/x86/pentium/bdiv_q_1.asm
new file mode 100644
index 0000000..c2c4f58
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/bdiv_q_1.asm
@@ -0,0 +1,266 @@
+dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 c/l slowdown for both odd and even.
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	$-1, %ecx
+	movl	PARAM_DIVISOR, %eax
+
+L(strip_twos):
+	ASSERT(nz, `orl %eax, %eax')
+	shrl	%eax
+	incl	%ecx			C shift count
+
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %edx	C d
+	andl	$127, %eax		C d/2, 7 bits
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %eax
+',`
+	call	L(here)
+L(here):
+	popl	%ebp			C eip
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+	C AGI
+	movl	binvert_limb_table@GOT(%ebp), %ebp
+	C AGI
+	movzbl	(%eax,%ebp), %eax
+')
+',`
+
+dnl non-PIC
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
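+C Newton's iteration for an inverse mod 2^32: if inv*d == 1 mod 2^k then
+C (2*inv - inv*inv*d)*d == 1 mod 2^(2k), so each pass below doubles the
+C number of correct low bits, taking the 8-bit table value to a full
+C 32-bit inverse in two passes.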
+	movl	%eax, %ebp		C inv
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	PARAM_SIZE, %ebx
+
+	movl	%eax, %ebp
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	%edx, PARAM_DIVISOR	C d without twos
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SHIFT, %ecx
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebx
+	movl	PARAM_INVERSE, %eax
+
+L(common):
+	pushl	%esi		FRAME_pushl()
+	push	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+	movl	%eax, VAR_INVERSE
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	leal	(%edi,%ebx,4), %edi	C dst end
+
+	negl	%ebx			C -size
+
+	xorl	%ebp, %ebp		C initial carry bit
+
+	orl	%ecx, %ecx		C shift
+	movl	(%esi,%ebx,4), %eax	C src low limb
+	jz	L(odd_entry)
+
+	xorl	%edx, %edx		C initial carry limb (for even, if one)
+	incl	%ebx
+	jz	L(one)
+
+	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
+	shrdl(	%cl, %edx, %eax)
+
+	jmp	L(even_entry)
+
+
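+C Exact division from the low limb up: each quotient limb is computed as
+C q = limb * inverse mod 2^32, and mull below gives the high half of q*d,
+C which must be subtracted (with any borrow) from the next source limb.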
+	ALIGN(8)
+L(odd_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ebp, %edx
+
+	subl	%edx, %eax
+
+	sbbl	%ebp, %ebp
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	twos
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	subl	%ebp, %edx		C carry bit
+	movl	-4(%esi,%ebx,4), %eax	C src limb
+
+	movl	(%esi,%ebx,4), %ebp	C and one above it
+
+	shrdl(	%cl, %ebp, %eax)
+
+	subl	%edx, %eax		C carry limb
+
+	sbbl	%ebp, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+
+	jnz	L(even_top)
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax		C src high limb
+	subl	%ebp, %edx
+
+L(one):
+	shrl	%cl, %eax
+
+	subl	%edx, %eax		C no carry if division is exact
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)		C dst high limb
+	nop				C protect against cache bank clash
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/com.asm b/third_party/gmp/mpn/x86/pentium/com.asm
new file mode 100644
index 0000000..b080545
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/com.asm
@@ -0,0 +1,181 @@
+dnl  Intel Pentium mpn_com -- mpn ones complement.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb
+
+
+NAILS_SUPPORT(0-31)
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This code is similar to mpn_copyi; basically there's just some "xorl
+C $GMP_NUMB_MASK"s inserted.
+C
+C Alternatives:
+C
+C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
+C are the same alignment mod 8, but it doesn't seem worth the trouble for
+C just that case (there'd need to be some plain integer available too for
+C the unaligned case).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	leal	(%eax,%ecx,4), %eax
+	xorl	$-1, %ecx		C -size-1
+
+	movl	PARAM_DST, %edx
+	addl	$8, %ecx		C -size+7
+
+	jns	L(end)
+
+	movl	(%edx), %esi		C fetch destination cache line
+	nop
+
+L(top):
+	C eax	&src[size]
+	C ebx
+	C ecx	counter, limbs, negative
+	C edx	dst, incrementing
+	C esi	scratch
+	C edi	scratch
+	C ebp
+
+	movl	28(%edx), %esi		C destination prefetch
+	addl	$32, %edx
+
+	movl	-28(%eax,%ecx,4), %esi
+	movl	-24(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -32(%edx)
+	movl	%edi, -28(%edx)
+
+	movl	-20(%eax,%ecx,4), %esi
+	movl	-16(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -24(%edx)
+	movl	%edi, -20(%edx)
+
+	movl	-12(%eax,%ecx,4), %esi
+	movl	-8(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -16(%edx)
+	movl	%edi, -12(%edx)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -8(%edx)
+	movl	%edi, -4(%edx)
+
+	addl	$8, %ecx
+	js	L(top)
+
+
+L(end):
+	C eax	&src[size]
+	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
+	C edx	dst, next location to store
+
+	subl	$4, %ecx
+	nop
+
+	jns	L(no4)
+
+	movl	-12(%eax,%ecx,4), %esi
+	movl	-8(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, (%edx)
+	movl	%edi, 4(%edx)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, 8(%edx)
+	movl	%edi, 12(%edx)
+
+	addl	$16, %edx
+	addl	$4, %ecx
+L(no4):
+
+	subl	$2, %ecx
+	nop
+
+	jns	L(no2)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, (%edx)
+	movl	%edi, 4(%edx)
+
+	addl	$8, %edx
+	addl	$2, %ecx
+L(no2):
+
+	popl	%edi
+	jnz	L(done)
+
+	movl	-4(%eax), %ecx
+
+	xorl	$GMP_NUMB_MASK, %ecx
+	popl	%esi
+
+	movl	%ecx, (%edx)
+	ret
+
+L(done):
+	popl	%esi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/copyd.asm b/third_party/gmp/mpn/x86/pentium/copyd.asm
new file mode 100644
index 0000000..72a543b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/copyd.asm
@@ -0,0 +1,146 @@
+dnl  Intel Pentium mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C See comments in copyi.asm.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	leal	-4(%eax,%ecx,4), %eax		C &src[size-1]
+	movl	PARAM_DST, %edx
+
+	subl	$7, %ecx			C size-7
+	jle	L(end)
+
+	movl	28-4(%edx,%ecx,4), %esi		C prefetch cache, dst[size-1]
+	nop
+
+L(top):
+	C eax	src, decrementing
+	C ebx
+	C ecx	counter, limbs
+	C edx	dst
+	C esi	scratch
+	C edi	scratch
+	C ebp
+
+	movl	28-32(%edx,%ecx,4), %esi	C prefetch dst cache line
+	subl	$8, %ecx
+
+	movl	(%eax), %esi			C read words pairwise
+	movl	-4(%eax), %edi
+	movl	%esi, 56(%edx,%ecx,4)		C store words pairwise
+	movl	%edi, 52(%edx,%ecx,4)
+
+	movl	-8(%eax), %esi
+	movl	-12(%eax), %edi
+	movl	%esi, 48(%edx,%ecx,4)
+	movl	%edi, 44(%edx,%ecx,4)
+
+	movl	-16(%eax), %esi
+	movl	-20(%eax), %edi
+	movl	%esi, 40(%edx,%ecx,4)
+	movl	%edi, 36(%edx,%ecx,4)
+
+	movl	-24(%eax), %esi
+	movl	-28(%eax), %edi
+	movl	%esi, 32(%edx,%ecx,4)
+	movl	%edi, 28(%edx,%ecx,4)
+
+	leal	-32(%eax), %eax
+	jg	L(top)
+
+
+L(end):
+	C ecx	-7 to 0, representing respectively 0 to 7 limbs remaining
+	C eax	src end
+	C edx	dst, next location to store
+
+	addl	$4, %ecx
+	jle	L(no4)
+
+	movl	(%eax), %esi
+	movl	-4(%eax), %edi
+	movl	%esi, 8(%edx,%ecx,4)
+	movl	%edi, 4(%edx,%ecx,4)
+
+	movl	-8(%eax), %esi
+	movl	-12(%eax), %edi
+	movl	%esi, (%edx,%ecx,4)
+	movl	%edi, -4(%edx,%ecx,4)
+
+	subl	$16, %eax
+	subl	$4, %ecx
+L(no4):
+
+	addl	$2, %ecx
+	jle	L(no2)
+
+	movl	(%eax), %esi
+	movl	-4(%eax), %edi
+	movl	%esi, (%edx,%ecx,4)
+	movl	%edi, -4(%edx,%ecx,4)
+
+	subl	$8, %eax
+	subl	$2, %ecx
+L(no2):
+
+	jnz	L(done)
+
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)	C risk of cache bank clash here
+
+L(done):
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/copyi.asm b/third_party/gmp/mpn/x86/pentium/copyi.asm
new file mode 100644
index 0000000..d983d6b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/copyi.asm
@@ -0,0 +1,164 @@
+dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Destination prefetching is done to avoid repeated write-throughs on lines
+C not already in L1.
+C
+C At least one of the src or dst pointer needs to be incremented rather than
+C using indexing, so that there's somewhere to put the loop control without
+C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
+C cycles.  Making it the src pointer that's incremented avoids an AGI on
+C the %ecx subtracts in the finishup code.
+C
+C The block of finishup code is almost as big as the main loop itself, which
+C is unfortunate, but it's faster that way than with say rep movsl, by about
+C 10 cycles for instance on P55.
+C
+C There's nothing to be gained from MMX on P55, since it can do only one
+C movq load (or store) per cycle, so the throughput would be the same as the
+C code here (and even then only if src and dst have the same alignment mod
+C 8).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_DST, %edx
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	leal	(%edx,%ecx,4), %edx	C &dst[size]
+	xorl	$-1, %ecx		C -size-1
+
+	movl	PARAM_SRC, %esi
+	addl	$8, %ecx		C -size+7
+
+	jns	L(end)
+
+	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
+	nop
+
+L(top):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, limbs, negative
+	C edx	&dst[size]
+	C esi	src, incrementing
+	C edi
+	C ebp
+
+	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
+	addl	$8, %ecx
+
+	movl	(%esi), %eax		C read words pairwise
+	movl	4(%esi), %ebx
+	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
+	movl	%ebx, -56(%edx,%ecx,4)
+
+	movl	8(%esi), %eax
+	movl	12(%esi), %ebx
+	movl	%eax, -52(%edx,%ecx,4)
+	movl	%ebx, -48(%edx,%ecx,4)
+
+	movl	16(%esi), %eax
+	movl	20(%esi), %ebx
+	movl	%eax, -44(%edx,%ecx,4)
+	movl	%ebx, -40(%edx,%ecx,4)
+
+	movl	24(%esi), %eax
+	movl	28(%esi), %ebx
+	movl	%eax, -36(%edx,%ecx,4)
+	movl	%ebx, -32(%edx,%ecx,4)
+
+	leal	32(%esi), %esi
+	js	L(top)
+
+
+L(end):
+	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
+	C esi	src end
+	C edx	dst, next location to store
+
+	subl	$4, %ecx
+	jns	L(no4)
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx
+	movl	%eax, -12(%edx,%ecx,4)
+	movl	%ebx, -8(%edx,%ecx,4)
+
+	movl	8(%esi), %eax
+	movl	12(%esi), %ebx
+	movl	%eax, -4(%edx,%ecx,4)
+	movl	%ebx, (%edx,%ecx,4)
+
+	addl	$16, %esi
+	addl	$4, %ecx
+L(no4):
+
+	subl	$2, %ecx
+	jns	L(no2)
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx
+	movl	%eax, -4(%edx,%ecx,4)
+	movl	%ebx, (%edx,%ecx,4)
+
+	addl	$8, %esi
+	addl	$2, %ecx
+L(no2):
+
+	jnz	L(done)
+
+	movl	(%esi), %eax
+	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here
+
+L(done):
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/dive_1.asm b/third_party/gmp/mpn/x86/pentium/dive_1.asm
new file mode 100644
index 0000000..21b5287
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/dive_1.asm
@@ -0,0 +1,264 @@
+dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C Plain divl is used for small sizes, since the inverse takes a while to
+C set up.  Multiplying works out faster for size>=3 when the divisor is odd,
+C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
+C size==3 for even are about the same speed for both divl or mul, but the
+C former is used since it will use up less code cache.
+C
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 c/l slowdown for both odd and even.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi		FRAME_pushl()
+	push	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	andl	$1, %eax
+
+	movl	PARAM_DST, %edi
+	addl	%ecx, %eax	C size if even, size+1 if odd
+
+	cmpl	$4, %eax
+	jae	L(mul_by_inverse)
+
+
+	xorl	%edx, %edx
+L(div_top):
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	%eax, -4(%edi,%ecx,4)
+	decl	%ecx
+
+	jnz	L(div_top)
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+
+L(mul_by_inverse):
+	movl	PARAM_DIVISOR, %eax
+	movl	$-1, %ecx
+
+L(strip_twos):
+	ASSERT(nz, `orl %eax, %eax')
+	shrl	%eax
+	incl	%ecx			C shift count
+
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %edx	C d
+	andl	$127, %eax		C d/2, 7 bits
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`dnl
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
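+C Two Newton passes (inv = 2*inv - inv*inv*d) double the number of correct
+C low bits each time, lifting the 8-bit table inverse to a full 32-bit one.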
+	movl	%eax, %ebp		C inv
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	PARAM_SIZE, %ebx
+
+	movl	%eax, %ebp
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	%edx, PARAM_DIVISOR	C d without twos
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	leal	(%edi,%ebx,4), %edi	C dst end
+
+	negl	%ebx			C -size
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	xorl	%ebp, %ebp		C initial carry bit
+
+	movl	(%esi,%ebx,4), %eax	C src low limb
+	orl	%ecx, %ecx		C shift
+
+	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
+	jz	L(odd_entry)
+
+	shrdl(	%cl, %edx, %eax)
+
+	incl	%ebx
+	jmp	L(even_entry)
+
+
+	ALIGN(8)
+L(odd_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ebp, %edx
+
+	subl	%edx, %eax
+
+	sbbl	%ebp, %ebp
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	twos
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	subl	%ebp, %edx		C carry bit
+	movl	-4(%esi,%ebx,4), %eax	C src limb
+
+	movl	(%esi,%ebx,4), %ebp	C and one above it
+
+	shrdl(	%cl, %ebp, %eax)
+
+	subl	%edx, %eax		C carry limb
+
+	sbbl	%ebp, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+
+	jnz	L(even_top)
+
+
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax		C src high limb
+	subl	%ebp, %edx
+
+	shrl	%cl, %eax
+
+	subl	%edx, %eax		C no carry if division is exact
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)		C dst high limb
+	nop				C protect against cache bank clash
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/gmp-mparam.h b/third_party/gmp/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000..befa6e2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,76 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 166MHz P54 */
+
+/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD             16
+#define MUL_TOOM33_THRESHOLD             90
+
+#define SQR_BASECASE_THRESHOLD            0  /* always */
+#define SQR_TOOM2_THRESHOLD              22
+#define SQR_TOOM3_THRESHOLD             122
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 52
+#define POWM_THRESHOLD                   77
+
+#define HGCD_THRESHOLD                  121
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                615
+#define JACOBI_BASE_METHOD                2
+
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  1  /* native */
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always (native) */
+
+#define GET_STR_DC_THRESHOLD             23
+#define GET_STR_PRECOMPUTE_THRESHOLD     33
+#define SET_STR_THRESHOLD              2788
+
+#define MUL_FFT_TABLE  { 432, 928, 1664, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          448
+#define MUL_FFT_THRESHOLD              3328
+
+#define SQR_FFT_TABLE  { 496, 928, 1920, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          512
+#define SQR_FFT_THRESHOLD              3328
diff --git a/third_party/gmp/mpn/x86/pentium/hamdist.asm b/third_party/gmp/mpn/x86/pentium/hamdist.asm
new file mode 100644
index 0000000..6c6c1a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/hamdist.asm
@@ -0,0 +1,154 @@
+dnl  Intel P5 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
+C
+C It might be possible to shave 1 cycle from the loop, and hence 2
+C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
+C would be 1, if the right schedule could be found (not found so far).
+C Wanting to avoid potential cache bank clashes makes it tricky.
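+C
+C The loop xors the operands a byte at a time and sums bit counts taken
+C from mpn_popcount's 256-entry byte table.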
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: referencing popcount.asm's table is incorrect as it hurts
+C incremental linking.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC1, 4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_hamdist)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi	FRAME_pushl()
+
+	shll	%ecx		C size in byte pairs
+	pushl	%edi	FRAME_pushl()
+
+ifdef(`PIC',`
+	pushl	%ebx	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+ifdef(`DARWIN',`
+	movl	PARAM_SRC1, %esi
+	movl	PARAM_SRC2, %edi
+	LEA(	TABLE_NAME, %ebp)
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+	xorl	%eax, %eax	C total
+',`
+	call	L(here)	FRAME_pushl()
+L(here):
+	movl	PARAM_SRC1, %esi
+	popl	%ebp	FRAME_popl()
+
+	movl	PARAM_SRC2, %edi
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+
+	movl	TABLE_NAME@GOT(%ebp), %ebp
+	xorl	%eax, %eax	C total
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+	movl	PARAM_SRC1, %esi
+	movl	PARAM_SRC2, %edi
+
+	xorl	%eax, %eax	C total
+	pushl	%ebx	FRAME_pushl()
+
+	xorl	%edx, %edx	C byte
+	xorl	%ebx, %ebx	C byte
+
+define(TABLE,`TABLE_NAME($1)')
+')
+
+
+	C The nop after the xorb seems necessary.  Although a movb might be
+	C expected to go down the V pipe in the second cycle of the xorb, it
+	C doesn't and costs an extra 2 cycles.
+L(top):
+	C eax	total
+	C ebx	byte
+	C ecx	counter, 2*size to 2
+	C edx	byte
+	C esi	src1
+	C edi	src2
+	C ebp	[PIC] table
+
+	addl	%ebx, %eax
+	movb	-1(%esi,%ecx,2), %bl
+
+	addl	%edx, %eax
+	movb	-1(%edi,%ecx,2), %dl
+
+	xorb	%dl, %bl
+	movb	-2(%esi,%ecx,2), %dl
+
+	xorb	-2(%edi,%ecx,2), %dl
+	nop
+
+	movb	TABLE(%ebx), %bl
+	decl	%ecx
+
+	movb	TABLE(%edx), %dl
+	jnz	L(top)
+
+
+ifdef(`PIC',`
+	popl	%ebp
+')
+	addl	%ebx, %eax
+	popl	%ebx
+
+	addl	%edx, %eax
+	popl	%edi
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/logops_n.asm b/third_party/gmp/mpn/x86/pentium/logops_n.asm
new file mode 100644
index 0000000..1877317
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/logops_n.asm
@@ -0,0 +1,176 @@
+dnl  Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 3.0 c/l  and, ior, xor
+C     3.5 c/l  andn, iorn, nand, nior, xnor
+
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4op',        `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
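+dnl  Columns: operation, complement the result (post), x86 instruction,
+dnl  complement the second (yp) operand (pre).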
+M4_choose_op( and_n,     , andl,    )
+M4_choose_op( andn_n,    , andl, yes)
+M4_choose_op( nand_n, yes, andl,    )
+M4_choose_op( ior_n,     ,  orl,    )
+M4_choose_op( iorn_n,    ,  orl, yes)
+M4_choose_op( nior_n, yes,  orl,    )
+M4_choose_op( xor_n,     , xorl,    )
+M4_choose_op( xnor_n, yes, xorl,    )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+NAILS_SUPPORT(0-31)
+
+
+C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C Nothing complicated here, just some care to avoid data cache bank clashes
+C and AGIs (address generation interlocks).
+C
+C We're one register short of being able to do a simple 4 loads, 2 ops, 2
+C stores.  Instead %ebp is juggled a bit and nops are introduced to keep the
+C pairings as intended.  An in-place operation would free up a register, for
+C a 0.5 c/l speedup, if that's worth bothering with.
+C
+C This code seems best for P55 too.  Data alignment is a big problem for MMX
+C and the pairing restrictions on movq and integer instructions make life
+C difficult.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_YP,  12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_XP, %ebx
+
+	movl	PARAM_YP, %esi
+	movl	PARAM_WP, %edi
+
+	shrl	%ecx
+	jnc	L(entry)
+
+	movl	(%ebx,%ecx,8), %eax	C risk of data cache bank clash here
+	movl	(%esi,%ecx,8), %edx
+
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(%edx)')
+
+	M4op	%edx, %eax
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+	orl	%ecx, %ecx
+
+	movl	%eax, (%edi,%ecx,8)
+	jz	L(done)
+
+	jmp	L(entry)
+
+
+L(top):
+	C eax
+	C ebx	xp
+	C ecx	counter, limb pairs, decrementing
+	C edx
+	C esi	yp
+	C edi	wp
+	C ebp
+
+	M4op	%ebp, %edx
+	nop
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+M4post(`xorl	$GMP_NUMB_MASK, %edx')
+
+	movl	%eax, 4(%edi,%ecx,8)
+	movl	%edx, (%edi,%ecx,8)
+
+L(entry):
+	movl	-4(%ebx,%ecx,8), %ebp
+	nop
+
+	movl	-4(%esi,%ecx,8), %eax
+	movl	-8(%esi,%ecx,8), %edx
+
+M4pre(`	xorl	$GMP_NUMB_MASK, %eax')
+M4pre(`	xorl	$GMP_NUMB_MASK, %edx')
+
+	M4op	%ebp, %eax
+	movl	-8(%ebx,%ecx,8), %ebp
+
+	decl	%ecx
+	jnz	L(top)
+
+
+	M4op	%ebp, %edx
+	nop
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+M4post(`xorl	$GMP_NUMB_MASK, %edx')
+
+	movl	%eax, 4(%edi,%ecx,8)
+	movl	%edx, (%edi,%ecx,8)
+
+
+L(done):
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/lshift.asm b/third_party/gmp/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000..2a31f36
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,243 @@
+dnl  Intel Pentium mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C P5,P54:    6.0
+C P55:       5.375
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
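+C
+C For reference, shldl( %cl, %edx, %eax) computes
+C %eax = (%eax << %cl) | (%edx >> (32-%cl)), ie. a double-width left
+C shift filling in bits from the next lower limb.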
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%esi),%eax
+	cmpl	%edi,%eax
+	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal	(%esi,%ebp,4),%eax
+	cmpl	%eax,%edi
+	jnc	L(special)		C jump if res_ptr >= s_ptr + size
+
+L(normal):
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	shldl(	%cl, %ebx, %edx)
+	shldl(	%cl, %eax, %ebx)
+	movl	%edx,-8(%edi)
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	shldl(	%cl, %edx, %eax)
+	shldl(	%cl, %ebx, %edx)
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl	%ebx,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shldl(	%cl,%eax,%edx)
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shll	%cl,%edx		C compute least significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C We loop from the least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
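+C
+C Conceptually the shift-by-1 done below is, per limb,
+C
+C	dst[i] = 2*src[i] + carry;	/* low 32 bits */
+C	carry = src[i] >> 31;
+C
+C implemented with the adcl chains.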
+
+L(special):
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	addl	%edx,%edx
+	incl	%ebp
+	decl	%ebp
+	jz	L(Lend)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	adcl	%ebx,%ebx
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebx,%ebx
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		C use leal not to clobber carry
+	leal	32(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebp
+	sbbl	%eax,%eax		C save carry in %eax
+	andl	$7,%ebp
+	jz	L(Lend2)
+	addl	%eax,%eax		C restore carry from eax
+L(Loop2):
+	movl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebx,(%edi)
+
+	leal	4(%esi),%esi		C use leal not to clobber carry
+	leal	4(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		C restore carry from eax
+L(L1):	movl	%edx,(%edi)		C store last limb
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/gmp-mparam.h b/third_party/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000..02a0def
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,163 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 233MHz P55 */
+
+#define MOD_1_NORM_THRESHOLD                 5
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        11
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     63
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           51
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD                53
+#define MUL_TOOM44_THRESHOLD               128
+#define MUL_TOOM6H_THRESHOLD               189
+#define MUL_TOOM8H_THRESHOLD               260
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      90
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 20
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                210
+#define SQR_TOOM8_THRESHOLD                375
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define MUL_FFT_MODF_THRESHOLD             364  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    364, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    167,10}, \
+    {     95, 9}, {    191, 8}, {    383,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    351,11}, \
+    {    191,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    287,10}, {    575,11}, \
+    {    351,12}, {    191,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    575,12}, {    319,11}, {    703,12}, \
+    {    383,11}, {    831,12}, {    447,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 90
+#define MUL_FFT_THRESHOLD                 3520
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     17, 7}, {      9, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     29, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     65, 8}, {     43, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    135,10}, \
+    {     79, 9}, {    159, 8}, {    319,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207,12}, \
+    {     63,11}, {    127,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    351,11}, {    191,10}, {    415,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    351,12}, {    191,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    575,12}, \
+    {    319,11}, {    703,12}, {    383,11}, {    767,12}, \
+    {    447,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 96
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  48
+#define MULLO_MUL_N_THRESHOLD             6633
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             170
+#define DC_BDIV_QR_THRESHOLD                43
+#define DC_BDIV_Q_THRESHOLD                110
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               177
+#define INV_APPR_THRESHOLD                 171
+
+#define BINV_NEWTON_THRESHOLD              194
+#define REDC_1_TO_REDC_N_THRESHOLD          50
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               90
+#define MU_BDIV_QR_THRESHOLD               942
+#define MU_BDIV_Q_THRESHOLD               1017
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                      92
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                221
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                18
+#define GET_STR_PRECOMPUTE_THRESHOLD        31
+#define SET_STR_DC_THRESHOLD               490
+#define SET_STR_PRECOMPUTE_THRESHOLD       994
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/hamdist.asm b/third_party/gmp/mpn/x86/pentium/mmx/hamdist.asm
new file mode 100644
index 0000000..72e3196
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/hamdist.asm
@@ -0,0 +1,40 @@
+dnl  Intel P55 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P55: hamdist 12.0 cycles/limb
+
+C For reference, this code runs at 11.5 cycles/limb for popcount, which is
+C slower than the plain integer mpn/x86/pentium/popcount.asm.
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/lshift.asm b/third_party/gmp/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000..04b0ddc
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,463 @@
+dnl  Intel P5 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right.  Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
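+C
+C As a sketch of the qword combining used below, each destination qword
+C is formed as
+C
+C	dst64 = (src64 << shift) | (next_lower_src64 >> (64-shift))
+C
+C with psllq, psrlq and por.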
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+	pushl	%ebx
+	pushl	%edi
+deflit(`FRAME',8)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edx
+
+	movl	PARAM_SRC, %ebx
+	movl	PARAM_SHIFT, %ecx
+
+	cmp	$UNROLL_THRESHOLD, %eax
+	jae	L(unroll)
+
+	movl	-4(%ebx,%eax,4), %edi	C src high limb
+	decl	%eax
+
+	jnz	L(simple)
+
+	shldl(	%cl, %edi, %eax)	C eax was decremented to zero
+
+	shll	%cl, %edi
+
+	movl	%edi, (%edx)		C dst low limb
+	popl	%edi			C risk of data cache bank clash
+
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx,%eax,4), %mm5	C src high limb
+
+	movd	%ecx, %mm6		C lshift
+	negl	%ecx
+
+	psllq	%mm6, %mm5
+	addl	$32, %ecx
+
+	movd	%ecx, %mm7
+	psrlq	$32, %mm5		C retval
+
+
+L(simple_top):
+	C eax	counter, limbs, negative
+	C ebx	src
+	C ecx
+	C edx	dst
+	C esi
+	C edi
+	C
+	C mm0	scratch
+	C mm5	return value
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	movd	%mm0, 4(%edx,%eax,4)
+	jnz	L(simple_top)
+
+
+	movd	(%ebx), %mm0
+
+	movd	%mm5, %eax
+	psllq	%mm6, %mm0
+
+	popl	%edi
+	popl	%ebx
+
+	movd	%mm0, (%edx)
+
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(unroll):
+	C eax	size
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	-4(%ebx,%eax,4), %mm5	C src high limb
+	leal	(%ebx,%eax,4), %edi
+
+	movd	%ecx, %mm6		C lshift
+	andl	$4, %edi
+
+	psllq	%mm6, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process high limb separately (marked xxx) to
+	C make it so.
+	C
+	C  source     -8(ebx,%eax,4)
+	C                  |
+	C  +-------+-------+-------+--
+	C  |               |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+	C
+	C  dest
+	C     -4(edx,%eax,4)
+	C          |
+	C  +-------+-------+--
+	C  |  xxx  |       |
+	C  +-------+-------+--
+
+	movq	-8(%ebx,%eax,4), %mm0	C unaligned load
+
+	psllq	%mm6, %mm0
+	decl	%eax
+
+	psrlq	$32, %mm0
+
+	C
+
+	movd	%mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+	movq	-8(%ebx,%eax,4), %mm1	C src high qword
+	leal	(%edx,%eax,4), %edi
+
+	andl	$4, %edi
+	psrlq	$32, %mm5		C return value
+
+	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
+	jz	L(start_dst_aligned)
+
+	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+	C is 32 bits extra.  High limb of dst (marked xxx) handled here
+	C separately.
+	C
+	C  source     -8(ebx,%eax,4)
+	C                  |
+	C  +-------+-------+--
+	C  |      mm1      |
+	C  +-------+-------+--
+	C                0mod8   4mod8
+	C
+	C  dest
+	C     -4(edx,%eax,4)
+	C          |
+	C  +-------+-------+-------+--
+	C  |  xxx  |               |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+
+	movq	%mm1, %mm0
+	addl	$32, %ecx		C new shift
+
+	psllq	%mm6, %mm0
+
+	movd	%ecx, %mm6
+	psrlq	$32, %mm0
+
+	C wasted cycle here waiting for %mm0
+
+	movd	%mm0, -4(%edx,%eax,4)
+	subl	$4, %edx
+L(start_dst_aligned):
+
+
+	psllq	%mm6, %mm1
+	negl	%ecx			C -shift
+
+	addl	$64, %ecx		C 64-shift
+	movq	%mm3, %mm2
+
+	movd	%ecx, %mm7
+	subl	$8, %eax		C size-8
+
+	psrlq	%mm7, %mm3
+
+	por	%mm1, %mm3		C mm3 ready to store
+	jc	L(finish)
+
+
+	C The comments in mpn_rshift apply here too.
+
+	ALIGN(8)
+L(unroll_loop):
+	C eax	counter, limbs
+	C ebx	src
+	C ecx
+	C edx	dst
+	C esi
+	C edi
+	C
+	C mm0
+	C mm1
+	C mm2	src qword from 16(%ebx,%eax,4)
+	C mm3	dst qword ready to store to 24(%edx,%eax,4)
+	C
+	C mm5	return value
+	C mm6	lshift
+	C mm7	rshift
+
+	movq	8(%ebx,%eax,4), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	movq	%mm3, 24(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	(%ebx,%eax,4), %mm3	C
+	psllq	%mm6, %mm1		C
+
+	movq	%mm0, 16(%edx,%eax,4)
+	movq	%mm3, %mm2		C
+
+	psrlq	%mm7, %mm3		C
+	subl	$4, %eax
+
+	por	%mm1, %mm3		C
+	jnc	L(unroll_loop)
+
+
+
+L(finish):
+	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining
+
+	testb	$2, %al
+
+	jz	L(finish_no_two)
+
+	movq	8(%ebx,%eax,4), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	movq	%mm3, 24(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	%mm1, %mm2
+	movq	%mm0, %mm3
+
+	subl	$2, %eax
+L(finish_no_two):
+
+
+	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
+	C
+	C mm2	src prev qword, from 16(%ebx,%eax,4)
+	C mm3	dst qword, for 24(%edx,%eax,4)
+
+	testb	$1, %al
+	movd	%mm5, %eax	C retval
+
+	popl	%edi
+	jz	L(finish_zero)
+
+
+	C One extra src limb, destination was aligned.
+	C
+	C                 source                  ebx
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C dest         edx+12           edx+4     edx
+	C --+---------------+---------------+-------+
+	C   |      mm3      |               |       |
+	C --+---------------+---------------+-------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C One extra src limb, destination was unaligned.
+	C
+	C                 source                  ebx
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C         dest         edx+12           edx+4
+	C         --+---------------+---------------+
+	C           |      mm3      |               |
+	C         --+---------------+---------------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at 4(%edx), and in the aligned case
+	C there's an extra limb of dst to be formed from that extra src limb
+	C left shifted.
+
+
+	movd	(%ebx), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm3, 12(%edx)
+	psllq	$32, %mm0
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm2, %mm0
+	psllq	%mm6, %mm1
+
+	movq	%mm0, 4(%edx)
+	psrlq	$32, %mm1
+
+	andl	$32, %ecx
+	popl	%ebx
+
+	jz	L(finish_one_unaligned)
+
+	movd	%mm1, (%edx)
+L(finish_one_unaligned):
+
+	emms
+
+	ret
+
+
+L(finish_zero):
+
+	C No extra src limbs, destination was aligned.
+	C
+	C                 source          ebx
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C dest          edx+8             edx
+	C --+---------------+---------------+
+	C   |      mm3      |               |
+	C --+---------------+---------------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C No extra src limbs, destination was unaligned.
+	C
+	C               source            ebx
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C         dest          edx+8   edx+4
+	C         --+---------------+-------+
+	C           |      mm3      |       |
+	C         --+---------------+-------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C The movd for the unaligned case writes the same data to 4(%edx)
+	C that the movq does for the aligned case.
+
+
+	movq	%mm3, 8(%edx)
+	andl	$32, %ecx
+
+	psllq	%mm6, %mm2
+	jz	L(finish_zero_unaligned)
+
+	movq	%mm2, (%edx)
+L(finish_zero_unaligned):
+
+	psrlq	$32, %mm2
+	popl	%ebx
+
+	movd	%mm5, %eax	C retval
+
+	movd	%mm2, 4(%edx)
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/mul_1.asm b/third_party/gmp/mpn/x86/pentium/mmx/mul_1.asm
new file mode 100644
index 0000000..4ced577
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,371 @@
+dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    cycles/limb
+C P5:   12.0   for 32-bit multiplier
+C        7.0   for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits some special case MMX code is used.  Small
+C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
+C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
+C size==8 end up being quite close.  If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's say size==8 and size==9 which are close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best.  An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy.  Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece.  But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	cmpl	$1, %ecx
+	jne	L(two_or_more)
+
+	C one limb only
+
+	movl	PARAM_MULTIPLIER, %eax
+	movl	PARAM_DST, %ecx
+
+	mull	(%edx)
+
+	movl	%eax, (%ecx)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	%edx, %esi		C src
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULTIPLIER, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	leal	(%esi,%ecx,4), %esi	C src end
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	negl	%ecx			C -size
+
+	pushl	%ebp		FRAME_pushl()
+	cmpl	$65536, %eax
+
+	jb	L(small)
+
+
+L(big):
+	xorl	%ebx, %ebx		C carry limb
+	sarl	%ecx			C -size/2
+
+	jnc	L(top)			C with carry flag clear
+
+
+	C size was odd, process one limb separately
+
+	mull	4(%esi,%ecx,8)		C m * src[0]
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	orl	%edx, %ebx		C carry limb, and clear carry flag
+
+
+L(top):
+	C eax
+	C ebx	carry
+	C ecx	counter, negative
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	(scratch carry)
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ebx, (%edi,%ecx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(top)
+
+
+	adcl	$0, %ebx
+	popl	%ebp
+
+	movl	%ebx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(small):
+	C Special case for 16-bit multiplier.
+	C
+	C eax	multiplier
+	C ebx
+	C ecx	-size
+	C edx	src
+	C esi	src end
+	C edi	dst end
+	C ebp	multiplier
+
+	C size<3 not supported here.  At size==3 we're already a couple of
+	C cycles faster, so there's no threshold as such, just use the MMX
+	C as soon as possible.
+
+	cmpl	$-3, %ecx
+	ja	L(big)
+
+	movd	%eax, %mm7		C m
+	pxor	%mm6, %mm6		C initial carry word
+
+	punpcklwd %mm7, %mm7		C m replicated 2 times
+	addl	$2, %ecx		C -size+2
+
+	punpckldq %mm7, %mm7		C m replicated 4 times
+	andl	$4, %edx		C test alignment, clear carry flag
+
+	movq	%mm7, %mm0		C m
+	jz	L(small_entry)
+
+
+	C Source is unaligned, process one limb separately.
+	C
+	C Plain integer code is used here, since it's smaller and is about
+	C the same 13 cycles as an mmx block would be.
+	C
+	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+	C the use of separate incl and orl.
+
+	mull	-8(%esi,%ecx,4)		C m * src[0]
+
+	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
+	incl	%ecx			C one limb processed
+
+	movd	%edx, %mm6		C initial carry
+
+	orl	%eax, %eax		C clear carry flag
+	jmp	L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions.  In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code.  Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
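+C
+C The sign adjustments implement the usual unsigned-from-signed identity
+C for 16-bit words, since pmulhw treats its operands as signed:
+C
+C	uhigh(x,m) = shigh(x,m) + (x>>15 ? m : 0) + (m>>15 ? x : 0)  mod 2^16
+C
+C hence "add x to h if m neg" and "add m to h if x neg" below.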
+
+L(small_top):
+	C eax	l.low, then l.high
+	C ebx	(h.low)
+	C ecx	counter, -size+2 to 0 or 1
+	C edx	(h.high)
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+	C
+	C %mm0	(high products)
+	C %mm1	(low products)
+	C %mm2	(adjust for m using x signs)
+	C %mm3	(adjust for x using m signs)
+	C %mm4
+	C %mm5
+	C %mm6	h.low, then carry
+	C %mm7	m replicated 4 times
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	movd	%mm0, %edx		C h.high
+	movq	%mm0, %mm6		C new c
+
+	adcl	%eax, %ebx
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.high
+	movq	%mm7, %mm0
+
+	adcl	%eax, %edx
+	movl	%ebx, -16(%edi,%ecx,4)
+
+	movl	%edx, -12(%edi,%ecx,4)
+	psrlq	$32, %mm6		C c
+
+L(small_entry):
+	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
+	movq	%mm7, %mm1
+
+	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
+	movq	%mm7, %mm3
+
+	movq	-8(%esi,%ecx,4), %mm2	C x
+	psraw	$15, %mm3		C m signs
+
+	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
+	psraw	$15, %mm2		C x signs
+
+	paddw	%mm3, %mm0		C add x to h if m neg
+	pand	%mm7, %mm2		C m selected by x signs
+
+	paddw	%mm2, %mm0		C add m to h if x neg
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.low
+	punpcklwd %mm0, %mm6		C c + h.low << 16
+
+	psrlq	$16, %mm0		C h.high
+	js	L(small_top)
+
+
+
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	adcl	%eax, %ebx
+	popl	%ebp		FRAME_popl()
+
+	movd	%mm0, %edx		C h.high
+	psrlq	$32, %mm0		C l.high
+
+	movd	%mm1, %eax		C l.high
+
+	adcl	%eax, %edx
+	movl	%ebx, -12(%edi,%ecx,4)
+
+	movd	%mm0, %eax		C c
+
+	adcl	$0, %eax
+	movl	%edx, -8(%edi,%ecx,4)
+
+	orl	%ecx, %ecx
+	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
+
+
+	C Size odd, one extra limb to process.
+	C Plain integer code is used here, since it's smaller and is about
+	C the same speed as another mmx block would be.
+
+	movl	%eax, %ecx
+	movl	PARAM_MULTIPLIER, %eax
+
+	mull	-4(%esi)
+
+	addl	%ecx, %eax
+
+	adcl	$0, %edx
+	movl	%eax, -4(%edi)
+
+	movl	%edx, %eax
+L(small_done):
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mmx/rshift.asm b/third_party/gmp/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000..e3b274b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,468 @@
+dnl  Intel P5 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left.  Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned.  Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
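+C
+C The 6 mmx instructions per 2 limbs are a load, a copy, two shifts, a
+C por and a store, forming each destination qword as
+C
+C	dst64 = (src64 >> shift) | (next_higher_src64 << (64-shift))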
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+	pushl	%ebx
+	pushl	%edi
+deflit(`FRAME',8)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edx
+
+	movl	PARAM_SRC, %ebx
+	movl	PARAM_SHIFT, %ecx
+
+	cmp	$UNROLL_THRESHOLD, %eax
+	jae	L(unroll)
+
+	decl	%eax
+	movl	(%ebx), %edi		C src low limb
+
+	jnz	L(simple)
+
+	shrdl(	%cl, %edi, %eax)	C eax was decremented to zero
+
+	shrl	%cl, %edi
+
+	movl	%edi, (%edx)		C dst low limb
+	popl	%edi			C risk of data cache bank clash
+
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(simple):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx), %mm5		C src[0]
+	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
+
+	movd	%ecx, %mm6		C rshift
+	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]
+
+	psllq	$32, %mm5
+	negl	%eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop.  Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+	C eax	counter, limbs, negative
+	C ebx	&src[size-1]
+	C ecx	return value
+	C edx	&dst[size-2]
+	C
+	C mm0	scratch
+	C mm5	return value
+	C mm6	shift
+
+	movq	(%ebx,%eax,4), %mm0
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+	movd	%mm0, (%edx,%eax,4)
+	jnz	L(simple_top)
+
+
+	movd	(%ebx), %mm0
+	psrlq	%mm6, %mm5		C return value
+
+	psrlq	%mm6, %mm0
+	popl	%edi
+
+	movd	%mm5, %eax
+	popl	%ebx
+
+	movd	%mm0, 4(%edx)
+
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(unroll):
+	C eax	size
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx), %mm5		C src[0]
+	movl	$4, %edi
+
+	movd	%ecx, %mm6		C rshift
+	testl	%edi, %ebx
+
+	psllq	$32, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process low limb separately (marked xxx) and
+	C step src and dst by one limb, making src aligned.
+	C
+	C source                  ebx
+	C --+-------+-------+-------+
+	C           |          xxx  |
+	C --+-------+-------+-------+
+	C         4mod8   0mod8   4mod8
+	C
+	C         dest            edx
+	C         --+-------+-------+
+	C           |       |  xxx  |
+	C         --+-------+-------+
+
+	movq	(%ebx), %mm0		C unaligned load
+
+	psrlq	%mm6, %mm0
+	addl	$4, %ebx
+
+	decl	%eax
+
+	movd	%mm0, (%edx)
+	addl	$4, %edx
+L(start_src_aligned):
+
+
+	movq	(%ebx), %mm1
+	testl	%edi, %edx
+
+	psrlq	%mm6, %mm5		C retval
+	jz	L(start_dst_aligned)
+
+	C dst isn't aligned, add 4 to make it so, and pretend the shift is
+	C 32 bits extra.  Low limb of dst (marked xxx) handled here
+	C separately.
+	C
+	C          source          ebx
+	C          --+-------+-------+
+	C            |      mm1      |
+	C          --+-------+-------+
+	C                  4mod8   0mod8
+	C
+	C  dest                    edx
+	C  --+-------+-------+-------+
+	C                    |  xxx  |
+	C  --+-------+-------+-------+
+	C          4mod8   0mod8   4mod8
+
+	movq	%mm1, %mm0
+	addl	$32, %ecx		C new shift
+
+	psrlq	%mm6, %mm0
+
+	movd	%ecx, %mm6
+
+	movd	%mm0, (%edx)
+	addl	$4, %edx
+L(start_dst_aligned):
+
+
+	movq	8(%ebx), %mm3
+	negl	%ecx
+
+	movq	%mm3, %mm2		C mm2 src qword
+	addl	$64, %ecx
+
+	movd	%ecx, %mm7
+	psrlq	%mm6, %mm1
+
+	leal	-12(%ebx,%eax,4), %ebx
+	leal	-20(%edx,%eax,4), %edx
+
+	psllq	%mm7, %mm3
+	subl	$7, %eax		C size-7
+
+	por	%mm1, %mm3		C mm3 ready to store
+	negl	%eax			C -(size-7)
+
+	jns	L(finish)
+
+
+	C This loop is the important bit, the rest is just support.  Careful
+	C instruction scheduling achieves the claimed 1.75 c/l.  The
+	C relevant parts of the pairing rules are:
+	C
+	C - mmx loads and stores execute only in the U pipe
+	C - only one mmx shift in a pair
+	C - wait one cycle before storing an mmx register result
+	C - the usual address generation interlock
+	C
+	C Two qword calculations are slightly interleaved.  The instructions
+	C marked "C" belong to the second qword, and the "C prev" one is for
+	C the second qword from the previous iteration.
+
+	ALIGN(8)
+L(unroll_loop):
+	C eax	counter, limbs, negative
+	C ebx	&src[size-12]
+	C ecx
+	C edx	&dst[size-12]
+	C esi
+	C edi
+	C
+	C mm0
+	C mm1
+	C mm2	src qword from -8(%ebx,%eax,4)
+	C mm3	dst qword ready to store to -8(%edx,%eax,4)
+	C
+	C mm5	return value
+	C mm6	rshift
+	C mm7	lshift
+
+	movq	(%ebx,%eax,4), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, -8(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	8(%ebx,%eax,4), %mm3	C
+	psrlq	%mm6, %mm1		C
+
+	movq	%mm0, (%edx,%eax,4)
+	movq	%mm3, %mm2		C
+
+	psllq	%mm7, %mm3		C
+	addl	$4, %eax
+
+	por	%mm1, %mm3		C
+	js	L(unroll_loop)
+
+
+L(finish):
+	C eax	0 to 3 representing respectively 3 to 0 limbs remaining
+
+	testb	$2, %al
+
+	jnz	L(finish_no_two)
+
+	movq	(%ebx,%eax,4), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, -8(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	%mm1, %mm2
+	movq	%mm0, %mm3
+
+	addl	$2, %eax
+L(finish_no_two):
+
+
+	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
+	C
+	C mm2	src prev qword, from -8(%ebx,%eax,4)
+	C mm3	dst qword, for -8(%edx,%eax,4)
+
+	testb	$1, %al
+	popl	%edi
+
+	movd	%mm5, %eax	C retval
+	jnz	L(finish_zero)
+
+
+	C One extra limb, destination was aligned.
+	C
+	C source                ebx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                                  edx
+	C +-------+---------------+---------------+--
+	C |       |               |      mm3      |
+	C +-------+---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C One extra limb, destination was unaligned.
+	C
+	C source                ebx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                          edx
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at 8(%edx), and in the aligned case
+	C there's a further extra limb of dst to be formed.
+
+
+	movd	8(%ebx), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, (%edx)
+	por	%mm2, %mm0
+
+	psrlq	%mm6, %mm1
+	andl	$32, %ecx
+
+	popl	%ebx
+	jz	L(finish_one_unaligned)
+
+	C dst was aligned, must store one extra limb
+	movd	%mm1, 16(%edx)
+L(finish_one_unaligned):
+
+	movq	%mm0, 8(%edx)
+
+	emms
+
+	ret
+
+
+L(finish_zero):
+
+	C No extra limbs, destination was aligned.
+	C
+	C source        ebx
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest                        edx+4
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C No extra limbs, destination was unaligned.
+	C
+	C source        ebx
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest                edx+4
+	C +-------+---------------+--
+	C |       |      mm3      |
+	C +-------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case writes the same data as the movq
+	C for the aligned case; it's just a choice of whether one or two
+	C limbs should be written.
+
+
+	movq	%mm3, 4(%edx)
+	psrlq	%mm6, %mm2
+
+	movd	%mm2, 12(%edx)
+	andl	$32, %ecx
+
+	popl	%ebx
+	jz	L(finish_zero_unaligned)
+
+	movq	%mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mod_34lsub1.asm b/third_party/gmp/mpn/x86/pentium/mod_34lsub1.asm
new file mode 100644
index 0000000..2d88223
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mod_34lsub1.asm
@@ -0,0 +1,192 @@
+dnl  Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
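+C
+C The folding uses 2^24 == 1, 2^32 == 2^8 and 2^64 == 2^16 modulo 2^24-1,
+C accumulating the limbs in three sums.  Ignoring the adcl carry
+C handling, a C sketch of the calculation below is
+C
+C	for (i = 0; i < size; i += 3)
+C	  { c0 += src[i]; c1 += src[i+1]; c2 += src[i+2]; }
+C	r = (c0 & 0xFFFFFF) + (c0 >> 24)
+C	  + ((c1 & 0xFFFF) << 8) + (c1 >> 16)
+C	  + ((c2 & 0xFF) << 16) + (c2 >> 8);
+C
+C giving r congruent to {src,size} modulo 2^24-1.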
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jne	L(one)
+
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+
+	shrl	$24, %edx
+	andl	$0xFFFFFF, %eax
+
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	shrl	$16, %ecx
+	andl	$0xFFFF, %edx
+
+	shll	$8, %edx
+	addl	%ecx, %eax
+
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	xorl	%esi, %esi		C 0mod3
+	xorl	%edi, %edi		C 1mod3
+
+	xorl	%ebp, %ebp		C 2mod3, and clear carry
+
+L(top):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, limbs
+	C edx	src
+	C esi	0mod3
+	C edi	1mod3
+	C ebp	2mod3
+
+	movl	(%edx), %eax
+	movl	4(%edx), %ebx
+
+	adcl	%eax, %esi
+	movl	8(%edx), %eax
+
+	adcl	%ebx, %edi
+	leal	12(%edx), %edx
+
+	adcl	%eax, %ebp
+	leal	-2(%ecx), %ecx
+
+	decl	%ecx
+	jg	L(top)
+
+
+	C ecx is -2, -1 or 0, representing 0, 1 or 2 more limbs, respectively
+
+	movl	$0xFFFFFFFF, %ebx	C mask
+	incl	%ecx
+
+	js	L(combine)		C 0 more
+
+	movl	(%edx), %eax
+	movl	$0xFFFFFF00, %ebx
+
+	adcl	%eax, %esi
+	decl	%ecx
+
+	js	L(combine)		C 1 more
+
+	movl	4(%edx), %eax
+	movl	$0xFFFF0000, %ebx
+
+	adcl	%eax, %edi
+
+
+
+L(combine):
+	C eax
+	C ebx	mask
+	C ecx
+	C edx
+	C esi	0mod3
+	C edi	1mod3
+	C ebp	2mod3
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%esi, %eax		C 0mod3
+
+	andl	%ebx, %ecx		C masked for position
+	andl	$0xFFFFFF, %eax		C 0mod3 low
+
+	shrl	$24, %esi		C 0mod3 high
+	subl	%ecx, %eax		C apply carry
+
+	addl	%esi, %eax		C apply 0mod3
+	movl	%edi, %ebx		C 1mod3
+
+	shrl	$16, %edi		C 1mod3 high
+	andl	$0x0000FFFF, %ebx
+
+	shll	$8, %ebx		C 1mod3 low
+	addl	%edi, %eax		C apply 1mod3 high
+
+	addl	%ebx, %eax		C apply 1mod3 low
+	movl	%ebp, %ebx		C 2mod3
+
+	shrl	$8, %ebp		C 2mod3 high
+	andl	$0xFF, %ebx
+
+	shll	$16, %ebx		C 2mod3 low
+	addl	%ebp, %eax		C apply 2mod3 high
+
+	addl	%ebx, %eax		C apply 2mod3 low
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mode1o.asm b/third_party/gmp/mpn/x86/pentium/mode1o.asm
new file mode 100644
index 0000000..a90abca
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mode1o.asm
@@ -0,0 +1,279 @@
+dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 23.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C There seems no way to pair up the two lone instructions in the main loop.
+C
+C The special case for size==1 saves about 20 cycles (non-PIC), making it
+C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
+C all sizes.
+C
+C Alternatives:
+C
+C Using mmx for the multiplies might be possible, with pmullw and pmulhw
+C having just 3 cycle latencies, but carry bit handling would probably be
+C complicated.
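+C
+C A sketch of the main loop, with inv = 1/d mod 2^32 (d odd, so the
+C inverse exists):
+C
+C	for (i = 0; i < size-1; i++)
+C	  {
+C	    s = src[i] - h;	/* generating a borrow c */
+C	    q = s * inv;	/* so q*d == s in the low limb */
+C	    h = high(q*d) + c;
+C	  }
+C
+C Each step cancels one low limb exactly, in the style of exact division.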
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+dnl  re-using parameter space
+define(VAR_INVERSE,`PARAM_SIZE')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	movl	PARAM_CARRY, %edx
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	xorl	%edx, %edx		C carry
+
+L(start_1c):
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+	shrl	%eax			C d/2
+	LEA(	binvert_limb_table, %ecx)
+	pushl	%ebx		FRAME_pushl()
+	movl	PARAM_SIZE, %ebx
+
+	andl	$127, %eax
+	subl	$2, %ebx
+
+	movb	(%eax,%ecx), %cl
+	jc	L(one_limb)
+',`
+	call	L(here)		FRAME_pushl()
+L(here):
+
+	shrl	%eax			C d/2
+	movl	(%esp), %ecx		C eip
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
+	movl	%ebx, (%esp)		C push ebx
+
+	andl	$127, %eax
+	movl	PARAM_SIZE, %ebx
+
+	movl	binvert_limb_table@GOT(%ecx), %ecx
+	subl	$2, %ebx
+
+	movb	(%eax,%ecx), %cl			C inv 8 bits
+	jc	L(one_limb)
+')
+',`
+dnl non-PIC
+	shrl	%eax			C d/2
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebx
+	andl	$127, %eax
+
+	subl	$2, %ebx
+	jc	L(one_limb)
+
+	movb	binvert_limb_table(%eax), %cl		C inv 8 bits
+')
+
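+C The table value inverts d to 8 bits; each Newton step
+C inv = 2*inv - inv*inv*d doubles the number of correct low bits, so two
+C steps give a full 32-bit inverse.
+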
+	movl	%ecx, %eax
+	addl	%ecx, %ecx		C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
+
+	movl	%ecx, %eax
+	addl	%ecx, %ecx		C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
+	pushl	%esi		FRAME_pushl()
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	%ecx, %eax
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax')
+
+	movl	PARAM_SRC, %esi
+	movl	%ecx, VAR_INVERSE
+
+	movl	(%esi), %eax		C src[0]
+	leal	4(%esi,%ebx,4), %esi	C &src[size-1]
+
+	xorl	$-1, %ebx		C -(size-1)
+	ASSERT(nz)
+	jmp	L(entry)
+
+
+C The use of VAR_INVERSE means only a store is needed for that value, rather
+C than a push and pop of say %edi.
+
+	ALIGN(16)
+L(top):
+	C eax	scratch, low product
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	scratch, high product
+	C esi	&src[size-1]
+	C edi
+	C ebp
+
+	mull	PARAM_DIVISOR		C h:dummy = q*d
+
+	movl	(%esi,%ebx,4), %eax	C src[i]
+	subl	%ecx, %edx		C h -= -c
+
+L(entry):
+	subl	%edx, %eax		C s = src[i] - h
+
+	sbbl	%ecx, %ecx		C new -c (0 or -1)
+
+	imull	VAR_INVERSE, %eax	C q = s*i
+
+	incl	%ebx
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi), %eax		C src high
+	subl	%ecx, %edx		C h -= -c
+
+	cmpl	PARAM_DIVISOR, %eax
+
+	jbe	L(skip_last)
+deflit(FRAME_LAST,FRAME)
+
+
+	subl	%edx, %eax		C s = src[i] - h
+	popl	%esi		FRAME_popl()
+
+	sbbl	%ecx, %ecx		C c (0 or -1)
+	popl	%ebx		FRAME_popl()
+
+	imull	VAR_INVERSE, %eax	C q = s*i
+
+	mull	PARAM_DIVISOR		C h:dummy = q*d
+
+	movl	%edx, %eax
+
+	subl	%ecx, %eax
+
+	ret
+
+
+C When high<divisor the last step can be skipped.
+
+L(skip_last):
+deflit(`FRAME',FRAME_LAST)
+	C eax	src high
+	C ebx
+	C ecx
+	C edx	r
+	C esi
+
+	subl	%eax, %edx	C r-s
+	popl	%esi		FRAME_popl()
+
+	sbbl	%eax, %eax	C -1 if underflow
+	movl	PARAM_DIVISOR, %ebx
+
+	andl	%ebx, %eax	C divisor if underflow
+	popl	%ebx		FRAME_popl()
+
+	addl	%edx, %eax	C addback if underflow
+
+	ret
+
+
+C Special case for size==1 using a division for r = c-a mod d.
+C Could look for a-c<d and save a division sometimes, but that doesn't seem
+C worth bothering about.
+
+L(one_limb):
+deflit(`FRAME',4)
+	C eax
+	C ebx	size-2 (==-1)
+	C ecx
+	C edx	carry
+	C esi	src end
+	C edi
+	C ebp
+
+	movl	%edx, %eax
+	movl	PARAM_SRC, %edx
+
+	movl	PARAM_DIVISOR, %ecx
+	popl	%ebx		FRAME_popl()
+
+	subl	(%edx), %eax		C c-a
+
+	sbbl	%edx, %edx
+	decl	%ecx			C d-1
+
+	andl	%ecx, %edx		C b*d+c-a if c<a, or c-a if c>=a
+
+	divl	PARAM_DIVISOR
+
+	movl	%edx, %eax
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/mul_1.asm b/third_party/gmp/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000..a0858af
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,177 @@
+dnl  Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 12.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       mp_limb_t multiplier, mp_limb_t carry);
+C
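+C Sketch of the operation: for each i,
+C
+C	t = src[i] * multiplier + carry;
+C	dst[i] = low32(t);  carry = high32(t);
+C
+C with the final carry returned; mpn_mul_1c starts from the given carry
+C instead of zero.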
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+	pushl	%esi		FRAME_pushl()
+
+L(start_1c):
+	movl	PARAM_SRC, %esi
+	movl	PARAM_SIZE, %eax
+
+	shrl	%eax
+	jnz	L(two_or_more)
+
+
+	C one limb only
+
+	movl	(%esi), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%eax, %ecx
+	movl	PARAM_DST, %eax
+
+	adcl	$0, %edx
+	popl	%esi
+
+	movl	%ecx, (%eax)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size/2
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	leal	-1(%eax), %ebx		C size/2-1
+
+	notl	%ebx			C -size, preserve carry
+
+	leal	(%esi,%eax,8), %esi	C src end
+	leal	(%edi,%eax,8), %edi	C dst end
+
+	pushl	%ebp		FRAME_pushl()
+	jnc	L(top)
+
+
+	C size was odd, process one limb separately
+
+	movl	(%esi,%ebx,8), %eax
+	addl	$4, %esi
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%ecx, %eax
+	movl	%edx, %ecx
+
+	movl	%eax, (%edi,%ebx,8)
+	leal	4(%edi), %edi
+
+
+L(top):
+	C eax
+	C ebx	counter, negative
+	C ecx	carry
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp
+
+	adcl	$0, %ecx
+	movl	(%esi,%ebx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ecx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ebx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ecx, (%edi,%ebx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ebx,8)
+	incl	%ebx
+
+	movl	%edx, %ecx
+	jnz	L(top)
+
+
+	adcl	$0, %ecx
+	popl	%ebp
+
+	movl	%ecx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mul_2.asm b/third_party/gmp/mpn/x86/pentium/mul_2.asm
new file mode 100644
index 0000000..4c7beb5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mul_2.asm
@@ -0,0 +1,150 @@
+dnl  Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 24.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_srcptr mult);
+C
+C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1,
+C but has the advantage of making just one pass over the operands.
+C
+C There aren't enough registers to use PARAM_MULT directly, so the multiplier
+C limbs are transferred to local variables on the stack.
+
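The cost comparison above is easy to see in C: a reference mul_2 is exactly
a mul_1 pass by mult[0] followed by an addmul_1 pass by mult[1] shifted up
one limb (a sketch, not GMP's generic code):

    #include <stdint.h>
    #include <stddef.h>

    /* dst[0..n] = src[0..n-1] * {mult[0],mult[1]}, returning the top limb
       of the (n+2)-limb product. */
    static uint32_t mul_2_ref(uint32_t *dst, const uint32_t *src, size_t n,
                              const uint32_t mult[2])
    {
        uint64_t c = 0;
        for (size_t i = 0; i < n; i++) {          /* mul_1 by mult[0] */
            c += (uint64_t)src[i] * mult[0];
            dst[i] = (uint32_t)c;
            c >>= 32;
        }
        dst[n] = (uint32_t)c;

        c = 0;
        for (size_t i = 0; i < n; i++) {          /* addmul_1 by mult[1] */
            c += (uint64_t)src[i] * mult[1] + dst[i + 1];
            dst[i + 1] = (uint32_t)c;
            c >>= 32;
        }
        return (uint32_t)c;
    }
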
+defframe(PARAM_MULT, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,   8)
+defframe(PARAM_DST,   4)
+
+dnl  re-use parameter space
+define(VAR_MULT_LOW, `PARAM_SRC')
+define(VAR_MULT_HIGH,`PARAM_DST')
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_2)
+deflit(`FRAME',0)
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULT, %eax
+	movl	PARAM_SIZE, %ecx
+
+	movl	4(%eax), %edx		C mult high
+	movl	(%eax), %eax		C mult low
+
+	movl	%eax, VAR_MULT_LOW
+	movl	%edx, VAR_MULT_HIGH
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+	mull	(%esi)			C src[0] * mult[0]
+
+	movl	%eax, %ebp		C in case src==dst
+	movl	(%esi), %eax		C src[0]
+
+	movl	%ebp, (%edi)		C dst[0]
+	movl	%edx, %ebx		C initial low carry
+
+	xorl	%ebp, %ebp		C initial high carry
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	mull	VAR_MULT_HIGH		C src[0] * mult[1]
+
+	subl	$2, %ecx		C size-2
+	js	L(done)
+
+	leal	8(%esi,%ecx,4), %esi	C &src[size]
+	xorl	$-1, %ecx		C -(size-1)
+
+
+
+L(top):
+	C eax	low prod
+	C ebx	low carry
+	C ecx	counter, negative
+	C edx	high prod
+	C esi	src end
+	C edi	dst end
+	C ebp	high carry (0 or -1)
+
+	andl	$1, %ebp		C 1 or 0
+	addl	%eax, %ebx
+
+	adcl	%edx, %ebp
+	ASSERT(nc)
+	movl	(%esi,%ecx,4), %eax
+
+	mull	VAR_MULT_LOW
+
+	addl	%eax, %ebx		C low carry
+	movl	(%esi,%ecx,4), %eax
+
+	adcl	%ebp, %edx		C high carry
+	movl	%ebx, (%edi,%ecx,4)
+
+	sbbl	%ebp, %ebp		C new high carry, -1 or 0
+	movl	%edx, %ebx		C new low carry
+
+	mull	VAR_MULT_HIGH
+
+	incl	%ecx
+	jnz	L(top)
+
+
+L(done):
+	andl	$1, %ebp		C 1 or 0
+	addl	%ebx, %eax
+
+	adcl	%ebp, %edx
+	ASSERT(nc)
+	movl	%eax, (%edi)		C store carry low
+
+	movl	%edx, %eax		C return carry high
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/mul_basecase.asm b/third_party/gmp/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000..e1d0f05
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,142 @@
+dnl  Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+
+dnl  Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.2 cycles/crossproduct (approx)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+
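The routine below is one mul_1 row followed by ysize-1 addmul_1 rows, which
in portable C reads as follows (illustrative sketch; requires
xsize >= ysize >= 1, as mpn_mul_basecase does):

    #include <stdint.h>
    #include <stddef.h>

    static void mul_basecase_ref(uint32_t *wp,
                                 const uint32_t *xp, size_t xsize,
                                 const uint32_t *yp, size_t ysize)
    {
        uint64_t c = 0;
        for (size_t j = 0; j < xsize; j++) {   /* first row: wp = xp*yp[0] */
            c += (uint64_t)xp[j] * yp[0];
            wp[j] = (uint32_t)c;
            c >>= 32;
        }
        wp[xsize] = (uint32_t)c;

        for (size_t i = 1; i < ysize; i++) {   /* later rows: wp[i..] += xp*yp[i] */
            c = 0;
            for (size_t j = 0; j < xsize; j++) {
                c += (uint64_t)xp[j] * yp[i] + wp[i + j];
                wp[i + j] = (uint32_t)c;
                c >>= 32;
            }
            wp[i + xsize] = (uint32_t)c;
        }
    }
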
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP,    16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP,    8)
+defframe(PARAM_WP,    4)
+
+defframe(VAR_COUNTER, -4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+	pushl	%eax			C dummy push for allocating stack slot
+	pushl	%esi
+	pushl	%ebp
+	pushl	%edi
+deflit(`FRAME',16)
+
+	movl	PARAM_XP,%esi
+	movl	PARAM_WP,%edi
+	movl	PARAM_YP,%ebp
+
+	movl	(%esi),%eax		C load xp[0]
+	mull	(%ebp)			C multiply by yp[0]
+	movl	%eax,(%edi)		C store to wp[0]
+	movl	PARAM_XSIZE,%ecx	C xsize
+	decl	%ecx			C If xsize = 1, ysize = 1 too
+	jz	L(done)
+
+	movl	PARAM_XSIZE,%eax
+	pushl	%ebx
+FRAME_pushl()
+	movl	%edx,%ebx
+	leal	(%esi,%eax,4),%esi	C make xp point at end
+	leal	(%edi,%eax,4),%edi	C offset wp by xsize
+	negl	%ecx			C negate j size/index for inner loop
+	xorl	%eax,%eax		C clear carry
+
+	ALIGN(8)
+L(oop1):	adcl	$0,%ebx
+	movl	(%esi,%ecx,4),%eax	C load next limb at xp[j]
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	%eax,(%edi,%ecx,4)
+	incl	%ecx
+	movl	%edx,%ebx
+	jnz	L(oop1)
+
+	adcl	$0,%ebx
+	movl	PARAM_YSIZE,%eax
+	movl	%ebx,(%edi)		C most significant limb of product
+	addl	$4,%edi			C increment wp
+	decl	%eax
+	jz	L(skip)
+	movl	%eax,VAR_COUNTER	C set index i to ysize
+
+L(outer):
+	addl	$4,%ebp			C make ebp point to next y limb
+	movl	PARAM_XSIZE,%ecx
+	negl	%ecx
+	xorl	%ebx,%ebx
+
+	C code at 0x61 here, close enough to aligned
+L(oop2):
+	adcl	$0,%ebx
+	movl	(%esi,%ecx,4),%eax
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	(%edi,%ecx,4),%ebx
+	adcl	$0,%edx
+	addl	%eax,%ebx
+	movl	%ebx,(%edi,%ecx,4)
+	incl	%ecx
+	movl	%edx,%ebx
+	jnz	L(oop2)
+
+	adcl	$0,%ebx
+
+	movl	%ebx,(%edi)
+	addl	$4,%edi
+	movl	VAR_COUNTER,%eax
+	decl	%eax
+	movl	%eax,VAR_COUNTER
+	jnz	L(outer)
+
+L(skip):
+	popl	%ebx
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$4,%esp
+	ret
+
+L(done):
+	movl	%edx,4(%edi)	C store to wp[1]
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	popl	%eax		C dummy pop for deallocating stack slot
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium/popcount.asm b/third_party/gmp/mpn/x86/pentium/popcount.asm
new file mode 100644
index 0000000..0e82144
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/popcount.asm
@@ -0,0 +1,146 @@
+dnl  Intel P5 mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 8.0 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C An arithmetic approach has been found to be slower than the table lookup,
+C due to needing too many instructions.
+
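A C rendition of the table method used below; the 256-entry table holds the
bit count of each byte value (a sketch with illustrative names):

    #include <stdint.h>
    #include <stddef.h>

    static unsigned char byte_count[256];   /* byte_count[b] = popcount(b) */

    static void init_byte_count(void)
    {
        for (int b = 0; b < 256; b++)
            for (int bit = 0; bit < 8; bit++)
                byte_count[b] += (b >> bit) & 1;
    }

    static unsigned long popcount_ref(const uint32_t *src, size_t size)
    {
        unsigned long total = 0;
        for (size_t i = 0; i < size; i++) {
            uint32_t w = src[i];
            total += byte_count[w & 0xff]
                   + byte_count[(w >> 8) & 0xff]
                   + byte_count[(w >> 16) & 0xff]
                   + byte_count[w >> 24];
        }
        return total;
    }
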
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: exporting the table to hamdist is incorrect as it hurts incremental
+C linking.
+
+	RODATA
+	ALIGN(8)
+	GLOBL	TABLE_NAME
+TABLE_NAME:
+forloop(i,0,255,
+`	.byte	m4_popcount(i)
+')
+
+defframe(PARAM_SIZE,8)
+defframe(PARAM_SRC, 4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_popcount)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi	FRAME_pushl()
+
+ifdef(`PIC',`
+	pushl	%ebx	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+ifdef(`DARWIN',`
+	shll	%ecx		C size in byte pairs
+	LEA(	TABLE_NAME, %ebp)
+	movl	PARAM_SRC, %esi
+	xorl	%eax, %eax	C total
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+',`
+	call	L(here)
+L(here):
+	popl	%ebp
+	shll	%ecx		C size in byte pairs
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+	movl	PARAM_SRC, %esi
+
+	xorl	%eax, %eax	C total
+	xorl	%ebx, %ebx	C byte
+
+	movl	TABLE_NAME@GOT(%ebp), %ebp
+	xorl	%edx, %edx	C byte
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+	shll	%ecx		C size in byte pairs
+	movl	PARAM_SRC, %esi
+
+	pushl	%ebx	FRAME_pushl()
+	xorl	%eax, %eax	C total
+
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+
+define(TABLE,`TABLE_NAME`'($1)')
+')
+
+
+	ALIGN(8)	C necessary on P55 for claimed speed
+L(top):
+	C eax	total
+	C ebx	byte
+	C ecx	counter, 2*size to 2
+	C edx	byte
+	C esi	src
+	C edi
+	C ebp	[PIC] table
+
+	addl	%ebx, %eax
+	movb	-1(%esi,%ecx,2), %bl
+
+	addl	%edx, %eax
+	movb	-2(%esi,%ecx,2), %dl
+
+	movb	TABLE(%ebx), %bl
+	decl	%ecx
+
+	movb	TABLE(%edx), %dl
+	jnz	L(top)
+
+
+ifdef(`PIC',`
+	popl	%ebp
+')
+	addl	%ebx, %eax
+	popl	%ebx
+
+	addl	%edx, %eax
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium/rshift.asm b/third_party/gmp/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000..2105c4c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,243 @@
+dnl  Intel Pentium mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C P5,P54:    6.0
+C P55:       5.375
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%edi),%eax
+	cmpl	%esi,%eax
+	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
+	leal	(%edi,%ebp,4),%eax
+	cmpl	%eax,%esi
+	jnc	L(special)		C jump if s_ptr >= res_ptr + size
+
+L(normal):
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	shrdl(	%cl, %ebx, %edx)
+	shrdl(	%cl, %eax, %ebx)
+	movl	%edx,8(%edi)
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	shrdl(	%cl, %edx, %eax)
+	shrdl(	%cl, %ebx, %edx)
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shrdl(	%cl,%eax,%edx)		C compute result limb
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shrl	%cl,%edx		C compute most significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	shrl	%edx
+	incl	%ebp
+	decl	%ebp
+	jz	L(Lend)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl	-28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	%eax
+	movl	%ebx,(%edi)
+	rcrl	%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	rcrl	%ebx
+	movl	%edx,-8(%edi)
+	rcrl	%eax
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	rcrl	%edx
+	movl	%eax,-16(%edi)
+	rcrl	%ebx
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	%eax
+	movl	%ebx,-24(%edi)
+	rcrl	%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		C use leal not to clobber carry
+	leal	-32(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebp
+	sbbl	%eax,%eax		C save carry in %eax
+	andl	$7,%ebp
+	jz	L(Lend2)
+	addl	%eax,%eax		C restore carry from eax
+L(Loop2):
+	movl	%edx,%ebx
+	movl	(%esi),%edx
+	rcrl	%edx
+	movl	%ebx,(%edi)
+
+	leal	-4(%esi),%esi		C use leal not to clobber carry
+	leal	-4(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		C restore carry from eax
+L(L1):	movl	%edx,(%edi)		C store last limb
+
+	movl	$0,%eax
+	rcrl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
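For reference, the shift-by-1 special case above works downwards from the
most significant limb so the rcrl chain can pass each ejected bit into the
next lower limb.  A scalar sketch (mpn_rshift returns the bits shifted out,
placed at the high end of a limb):

    #include <stdint.h>
    #include <stddef.h>

    static uint32_t rshift1_ref(uint32_t *dst, const uint32_t *src, size_t n)
    {
        uint32_t carry = 0;                 /* bit entering from above */
        for (size_t i = n; i-- > 0; ) {
            uint32_t w = src[i];
            dst[i] = (w >> 1) | (carry << 31);
            carry = w & 1;                  /* bit passed downwards */
        }
        return carry << 31;                 /* shifted-out bit, top-aligned */
    }
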
diff --git a/third_party/gmp/mpn/x86/pentium/sqr_basecase.asm b/third_party/gmp/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000..b11d767
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,528 @@
+dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+C product at around 20x20 limbs.
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
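The phases that follow (cross products, a 1-bit left shift to double them,
then the diagonal squares) use the identity
src^2 = 2*sum(i<j) src[i]*src[j]*B^(i+j) + sum src[i]^2*B^(2i), and
correspond to this C sketch (illustrative; mpn/generic/sqr_basecase.c
differs in detail):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static void sqr_basecase_ref(uint32_t *dst, const uint32_t *src, size_t n)
    {
        memset(dst, 0, 2 * n * sizeof *dst);

        for (size_t i = 0; i < n; i++) {          /* cross products, i < j */
            uint64_t c = 0;
            for (size_t j = i + 1; j < n; j++) {
                c += (uint64_t)src[i] * src[j] + dst[i + j];
                dst[i + j] = (uint32_t)c;
                c >>= 32;
            }
            dst[i + n] = (uint32_t)c;
        }

        uint32_t cy = 0;                          /* double: shift left 1 bit */
        for (size_t i = 0; i < 2 * n; i++) {
            uint32_t w = dst[i];
            dst[i] = (w << 1) | cy;
            cy = w >> 31;
        }

        uint64_t c = 0;                           /* add diagonal squares */
        for (size_t i = 0; i < n; i++) {
            uint64_t sq = (uint64_t)src[i] * src[i];
            c += (uint64_t)dst[2*i] + (uint32_t)sq;
            dst[2*i] = (uint32_t)c;  c >>= 32;
            c += (uint64_t)dst[2*i + 1] + (uint32_t)(sq >> 32);
            dst[2*i + 1] = (uint32_t)c;  c >>= 32;
        }
    }
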
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+
+	movl	(%eax), %eax
+	ja	L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	ret
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	pushl	%esi
+	pushl	%ebx
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	%edx, %esi	C dst[1]
+
+	movl	4(%ebx), %eax
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+	popl	%ebx
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+	popl	%esi
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	cmpl	$4, %edx
+	pushl	%ebx
+deflit(`FRAME',4)
+
+	movl	PARAM_SRC, %ebx
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	movl	4(%ebx), %eax
+	xorl	%ebp, %ebp
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	%edx, 12(%ecx)
+
+	movl	8(%ebx), %eax
+	pushl	%esi		C risk of cache bank clash
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	%edx, 20(%ecx)
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	movl	(%ebx), %eax
+
+	mull	8(%ebx)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%edx, %ebp
+
+	adcl	$0, %ebp
+	movl	4(%ebx), %eax
+
+	mull	8(%ebx)		C src[1] * src[2]
+
+	xorl	%ebx, %ebx
+	addl	%eax, %ebp
+
+	C eax
+	C ebx	zero, will be dst[5]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%esi, %esi
+
+	adcl	%edi, %edi
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+	movl	4(%ecx), %eax
+
+	adcl	$0, %ebx
+	addl	%esi, %eax
+
+	movl	%eax, 4(%ecx)
+	movl	8(%ecx), %eax
+
+	adcl	%edi, %eax
+	movl	12(%ecx), %esi
+
+	adcl	%ebp, %esi
+	movl	16(%ecx), %edi
+
+	movl	%eax, 8(%ecx)
+	movl	%esi, 12(%ecx)
+
+	adcl	%edx, %edi
+	popl	%esi
+
+	movl	20(%ecx), %eax
+	movl	%edi, 16(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	adcl	%ebx, %eax	C no carry out of this
+	popl	%ebx
+
+	movl	%eax, 20(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(four_or_more):
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+	C esi
+	C edi
+	C ebp
+	C
+	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+	pushl	%edi
+FRAME_pushl()
+	pushl	%esi
+FRAME_pushl()
+
+	pushl	%ebp
+FRAME_pushl()
+	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
+
+	leal	(%ebx,%edx,4), %esi	C src end
+	movl	%ebx, %ebp		C src
+
+	negl	%edx			C -size
+	xorl	%ebx, %ebx		C clear carry limb and carry flag
+
+	leal	1(%edx), %ecx		C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	src
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(mul1)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last two products, which are the end corner of the product
+	C triangle, are handled separately to save looping overhead.  These
+	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+	C If size is 4 then it's only these that need to be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	adcl	$0, %ebx
+	movl	PARAM_SIZE, %edx
+
+	movl	%ebx, (%edi)
+	subl	$4, %edx
+
+	negl	%edx
+	jz	L(corner)
+
+
+L(outer):
+	C ebx	previous carry limb to store
+	C edx	outer loop counter (negative)
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+
+	pushl	%edx			C new outer loop counter
+	leal	-2(%edx), %ecx
+
+	movl	%ebx, (%edi)
+	addl	$4, %edi
+
+	addl	$4, %ebp
+	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	&src[j]
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%ebx, %eax
+	movl	(%edi,%ecx,4), %ebx
+
+	adcl	$0, %edx
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(inner)
+
+
+	adcl	$0, %ebx
+	popl	%edx		C outer loop counter
+
+	incl	%edx
+	jnz	L(outer)
+
+
+	movl	%ebx, (%edi)
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+
+	movl	-8(%esi), %eax
+	movl	-4(%edi), %ebx		C risk of data cache bank clash here
+
+	mull	-12(%esi)		C src[size-2]*src[size-3]
+
+	addl	%eax, %ebx
+	movl	%edx, %ecx
+
+	adcl	$0, %ecx
+	movl	-4(%esi), %eax
+
+	mull	-12(%esi)		C src[size-1]*src[size-3]
+
+	addl	%ecx, %eax
+	movl	(%edi), %ecx
+
+	adcl	$0, %edx
+	movl	%ebx, -4(%edi)
+
+	addl	%eax, %ecx
+	movl	%edx, %ebx
+
+	adcl	$0, %ebx
+	movl	-4(%esi), %eax
+
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+
+	movl	%ecx, (%edi)
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	movl	PARAM_SIZE, %eax
+
+	negl	%eax
+	movl	%ebx, 4(%edi)
+
+	addl	$1, %eax		C -(size-1) and clear carry
+	movl	%edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	movl	12(%edi,%eax,8), %ebx
+
+	rcll	%ebx
+	movl	16(%edi,%eax,8), %ecx
+
+	rcll	%ecx
+	movl	%ebx, 12(%edi,%eax,8)
+
+	movl	%ecx, 16(%edi,%eax,8)
+	incl	%eax
+
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	PARAM_SRC, %esi
+
+	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
+	movl	%eax, 12(%edi)		C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx
+
+	mull	%eax
+
+	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
+	movl	%edx, %ebx
+
+	addl	$1, %ecx		C size-1 and clear carry
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, negative
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %ebx
+
+	mull	%eax
+
+	movl	16-4(%edi,%ecx,8), %ebp
+
+	addl	%ebp, %ebx
+	movl	16(%edi,%ecx,8), %ebp
+
+	adcl	%eax, %ebp
+	movl	%ebx, 16-4(%edi,%ecx,8)
+
+	movl	%ebp, 16(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(diag)
+
+
+	adcl	$0, %edx
+	movl	16-4(%edi), %eax	C dst most significant limb
+
+	addl	%eax, %edx
+	popl	%ebp
+
+	movl	%edx, 16-4(%edi)
+	popl	%esi		C risk of cache bank clash
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/README b/third_party/gmp/mpn/x86/pentium4/README
new file mode 100644
index 0000000..90f752e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/README
@@ -0,0 +1,124 @@
+Copyright 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+                   INTEL PENTIUM-4 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium-4.
+
+The mmx subdirectory has routines using MMX instructions; the sse2
+subdirectory has routines using SSE2 instructions.  All P4s have both, but
+the separate directories let configure omit that code if the assembler
+doesn't support it.
+
+
+STATUS
+
+                                cycles/limb
+
+	mpn_add_n/sub_n            4 normal, 6 in-place
+
+	mpn_mul_1                  4 normal, 6 in-place
+	mpn_addmul_1               6
+	mpn_submul_1               7
+
+	mpn_mul_basecase           6 cycles/crossproduct (approx)
+
+	mpn_sqr_basecase           3.5 cycles/crossproduct (approx)
+                                   or 7.0 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               1.75
+
+
+
+The shifts ought to be able to go at 1.5 c/l, but not much effort has been
+applied to them yet.
+
+In-place operations, and all addmul, submul, mul_basecase and sqr_basecase
+calls, suffer from pipeline anomalies associated with write combining and
+movd reads and writes to the same or nearby locations.  The movq
+instructions do not trigger the same hardware problems.  Unfortunately,
+using movq and splitting/combining seems to require too many extra
+instructions to help.  Perhaps future chip steppings will be better.
+
+
+
+NOTES
+
+The Pentium-4 pipeline, "NetBurst", provides quite a number of surprises.
+Many traditional x86 instructions run very slowly, requiring use of
+alternative instructions for acceptable performance.
+
+adcl and sbbl are quite slow at 8 cycles for reg->reg.  paddq of 32-bits
+within a 64-bit mmx register seems better, though the combination
+paddq/psrlq when propagating a carry is still a 4 cycle latency.
+
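In scalar terms the paddq/psrlq idiom keeps a 33-bit accumulator in a
64-bit register; a sketch of the idea only, not of any particular routine:

    #include <stdint.h>
    #include <stddef.h>

    /* dst = a + b without adcl: paddq sums two limbs plus the previous
       carry in a 64-bit accumulator, psrlq $32 exposes the carry bit. */
    static uint32_t add_n_idea(uint32_t *dst, const uint32_t *a,
                               const uint32_t *b, size_t n)
    {
        uint64_t acc = 0;                  /* plays the role of an mmx reg */
        for (size_t i = 0; i < n; i++) {
            acc += (uint64_t)a[i] + b[i];  /* paddq, paddq */
            dst[i] = (uint32_t)acc;        /* movd low word to memory */
            acc >>= 32;                    /* psrlq $32: carry is 0 or 1 */
        }
        return (uint32_t)acc;
    }
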
+incl and decl should be avoided; instead use add $1 and sub $1.  Apparently
+the carry flag is not separately renamed, so incl and decl depend on all
+previous flags-setting instructions.
+
+shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest
+integer instructions (addl, subl, orl, andl, and some more).  shldl and
+shrdl seem to have 13 and 15 cycles latency, respectively.  Bizarre.
+
+movq mmx -> mmx does have 6 cycle latency, as noted in the documentation.
+pxor/por or similar combination at 2 cycles latency can be used instead.
+The movq however executes in the float unit, thereby saving MMX execution
+resources.  With the right juggling, data moves shouldn't be on a dependent
+chain.
+
+L1 is write-through, but the write-combining sounds like it does enough to
+not require explicit destination prefetching.
+
+xmm registers so far haven't found a use, but not much effort has been
+expended.  A configure test for whether the operating system knows
+fxsave/fxrestor will be needed if they're used.
+
+
+
+REFERENCES
+
+Intel Pentium-4 processor manuals,
+
+	http://developer.intel.com/design/pentium4/manuals
+
+"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001,
+order number 248966.  Available on-line:
+
+	http://developer.intel.com/design/pentium4/manuals/248966.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/third_party/gmp/mpn/x86/pentium4/copyd.asm b/third_party/gmp/mpn/x86/pentium4/copyd.asm
new file mode 100644
index 0000000..82af81c
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/copyd.asm
@@ -0,0 +1,71 @@
+dnl  Pentium-4 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its
+dnl  startup time seems to be about 165 cycles.  It then needs 2.6 c/l.
+dnl  We therefore use an open-coded 2 c/l copying loop.
+
+dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
+dnl  speeds, at least for large blocks.
+
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	movl	%ebx, PARAM_SIZE
+	addl	$-1, %ecx
+	js	L(end)
+
+L(loop):
+	movl	(%eax,%ecx,4), %ebx
+	movl	%ebx, (%edx,%ecx,4)
+	addl	$-1, %ecx
+
+	jns	L(loop)
+L(end):
+	movl	PARAM_SIZE, %ebx
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/copyi.asm b/third_party/gmp/mpn/x86/pentium4/copyi.asm
new file mode 100644
index 0000000..b614887
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/copyi.asm
@@ -0,0 +1,93 @@
+dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1999-2001 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  The rep/movsl is very slow for small blocks on pentium4.  Its startup
+dnl  time seems to be about 110 cycles.  It then copies at a rate of one
+dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
+dnl  loop for smaller sizes.
+
+dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
+dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
+dnl  speeds, at least for large blocks.
+
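The resulting structure is just a size test around two copy strategies
(a sketch; 150 limbs is the threshold the code below uses, and memcpy
stands in for the rep movsl path, ignoring overlap considerations):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    static void copyi_ref(uint32_t *dst, const uint32_t *src, size_t n)
    {
        if (n > 150)
            memcpy(dst, src, n * sizeof *dst);   /* bulk path */
        else
            for (size_t i = 0; i < n; i++)       /* open-coded loop */
                dst[i] = src[i];
    }
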
+include(`../config.m4')
+
+
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	cmpl	$150, %ecx
+	jg	L(replmovs)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %edx
+	movl	%ebx, PARAM_SIZE
+	testl	%ecx, %ecx
+	jz	L(end)
+
+L(loop):
+	movl	(%eax), %ebx
+	leal	4(%eax), %eax
+	addl	$-1, %ecx
+	movl	%ebx, (%edx)
+	leal	4(%edx), %edx
+
+	jnz	L(loop)
+
+L(end):
+	movl	PARAM_SIZE, %ebx
+	ret
+
+L(replmovs):
+	cld	C better safe than sorry, see mpn/x86/README
+
+	movl	%esi, %eax
+	movl	PARAM_SRC, %esi
+	movl	%edi, %edx
+	movl	PARAM_DST, %edi
+
+	rep
+	movsl
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/mmx/lshift.asm b/third_party/gmp/mpn/x86/pentium4/mmx/lshift.asm
new file mode 100644
index 0000000..b5eca66
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/mmx/lshift.asm
@@ -0,0 +1,39 @@
+dnl  Intel Pentium-4 mpn_lshift -- left shift.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4 Willamette, Northwood: 1.75 cycles/limb
+C P4 Prescott:		    2.0 cycles/limb
+
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/third_party/gmp/mpn/x86/pentium4/mmx/popham.asm b/third_party/gmp/mpn/x86/pentium4/mmx/popham.asm
new file mode 100644
index 0000000..9563cb5
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/mmx/popham.asm
@@ -0,0 +1,203 @@
+dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
+dnl  hamming distance.
+
+dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			     popcount	     hamdist
+C P3 model 9  (Banias)		?		?
+C P3 model 13 (Dothan)		6		6
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	8		9
+C P4 model 3  (Prescott)	8		9
+C P4 model 4  (Nocona)
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
+C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
+C and using them saves fiddling about with alignment testing on entry.
+C
+C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
+C might be possible, but 8 c/l relying on out-of-order execution is already
+C quite reasonable.
+
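The reduction in the loop below is the classic mask-and-shift popcount; on
one 64-bit word in C it reads as follows (same pair/nibble/byte steps; the
asm sums the bytes with psadbw where this sketch uses a multiply, and
hamdist first xors the two operands, as the HAM lines do):

    #include <stdint.h>

    static unsigned popcount64(uint64_t x)
    {
        x -= (x >> 1) & 0x5555555555555555ull;                /* bit pairs   */
        x  = (x & 0x3333333333333333ull)
           + ((x >> 2) & 0x3333333333333333ull);              /* nibbles     */
        x  = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0full;          /* byte counts */
        return (unsigned)((x * 0x0101010101010101ull) >> 56); /* sum bytes   */
    }
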
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
+')')')
+
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2,  8)
+defframe(PARAM_SRC,   4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE,  8)
+defframe(PARAM_SRC,   4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+	dnl  non-PIC
+	RODATA
+	ALIGN(8)
+L(rodata_AAAAAAAAAAAAAAAA):
+	.long	0xAAAAAAAA
+	.long	0xAAAAAAAA
+L(rodata_3333333333333333):
+	.long	0x33333333
+	.long	0x33333333
+L(rodata_0F0F0F0F0F0F0F0F):
+	.long	0x0F0F0F0F
+	.long	0x0F0F0F0F
+')
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %eax
+
+ifdef(`PIC',`
+	movl	$0xAAAAAAAA, %edx
+	movd	%edx, %mm7
+	punpckldq %mm7, %mm7
+
+	movl	$0x33333333, %edx
+	movd	%edx, %mm6
+	punpckldq %mm6, %mm6
+
+	movl	$0x0F0F0F0F, %edx
+	movd	%edx, %mm5
+	punpckldq %mm5, %mm5
+
+HAM(`	movl	PARAM_SRC2, %edx')
+
+',`
+	dnl non-PIC
+HAM(`	movl	PARAM_SRC2, %edx')
+	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
+	movq	L(rodata_3333333333333333), %mm6
+	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
+')
+
+	pxor	%mm4, %mm4		C zero
+	pxor	%mm0, %mm0		C total
+
+	subl	$1, %ecx
+	ja	L(top)
+
+L(last):
+	movd	(%eax,%ecx,4), %mm1		C src high limb
+HAM(`	movd	(%edx,%ecx,4), %mm2
+	pxor	%mm2, %mm1
+')
+	jmp	L(loaded)
+
+
+L(top):
+	C eax	src
+	C ebx
+	C ecx	counter, size-1 to 2 or 1, inclusive
+	C edx	[hamdist] src2
+	C
+	C mm0	total (low dword)
+	C mm1	(scratch)
+	C mm2	(scratch)
+	C mm3
+	C mm4	0x0000000000000000
+	C mm5	0x0F0F0F0F0F0F0F0F
+	C mm6	0x3333333333333333
+	C mm7	0xAAAAAAAAAAAAAAAA
+
+	movd	(%eax), %mm1
+	movd	4(%eax), %mm2
+	punpckldq %mm2, %mm1
+	addl	$8, %eax
+
+HAM(`	movd	(%edx), %mm2
+	movd	4(%edx), %mm3
+	punpckldq %mm3, %mm2
+	pxor	%mm2, %mm1
+	addl	$8, %edx
+')
+
+L(loaded):
+	movq	%mm7, %mm2
+	pand	%mm1, %mm2
+	psrlq	$1, %mm2
+	psubd	%mm2, %mm1	C bit pairs
+
+	movq	%mm6, %mm2
+	pand	%mm1, %mm2
+	psrlq	$2, %mm1
+	pand	%mm6, %mm1
+	paddd	%mm2, %mm1	C nibbles
+
+	movq	%mm5, %mm2
+	pand	%mm1, %mm2
+	psrlq	$4, %mm1
+	pand	%mm5, %mm1
+	paddd	%mm2, %mm1	C bytes
+
+	psadbw(	%mm4, %mm1)
+	paddd	%mm1, %mm0	C to total
+
+	subl	$2, %ecx
+	jg	L(top)
+
+	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
+	jz	L(last)
+
+
+	movd	%mm0, %eax
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/mmx/rshift.asm b/third_party/gmp/mpn/x86/pentium4/mmx/rshift.asm
new file mode 100644
index 0000000..3ac0094
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/mmx/rshift.asm
@@ -0,0 +1,39 @@
+dnl  Intel Pentium-4 mpn_rshift -- right shift.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4 Willamette, Northwood: 1.75 cycles/limb
+C P4 Prescott:		    2.0 cycles/limb
+
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/add_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/add_n.asm
new file mode 100644
index 0000000..8e2380e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/add_n.asm
@@ -0,0 +1,101 @@
+dnl  Intel Pentium-4 mpn_add_n -- mpn addition.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C					cycles/limb
+C			     dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4	     6		6
+C P4 model 3-4 (Prescott)	4.25	     7.5	7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_add_nc)
+deflit(`FRAME',0)
+	movd	PARAM_CARRY, %mm0
+	jmp	L(start_nc)
+EPILOGUE()
+
+	ALIGN(8)
+PROLOGUE(mpn_add_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+L(start_nc):
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):
+	C eax	src1 end
+	C ebx	src2 end
+	C ecx	counter, limbs, negative
+	C edx	dst end
+	C mm0	carry bit
+
+	movd	(%eax,%ecx,4), %mm1
+	movd	(%ebx,%ecx,4), %mm2
+	paddq	%mm2, %mm1
+
+	paddq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$32, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
new file mode 100644
index 0000000..93b63b2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/addlsh1_n.asm
@@ -0,0 +1,108 @@
+dnl  Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
+
+dnl  Copyright 2001-2004, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C					cycles/limb
+C			     dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4.25	     6		6
+C P4 model 3-4 (Prescott)	5	     8.5	8.5
+
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best.  Not sure why, but %ecx,4 with src1 and/or
+C src2 is a slowdown.
+C
+C The dependent chain is simply the paddq of x+2*y to the previous carry,
+C then psrlq to get the new carry.  That makes 4 c/l the target speed, which
+C is almost achieved for separate src/dst but when src==dst the write
+C combining anomalies slow it down.
+
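The dependent chain described above computes, per limb (scalar sketch; the
return value is the 2-bit carry out of x + 2*y):

    #include <stdint.h>
    #include <stddef.h>

    static uint32_t addlsh1_n_ref(uint32_t *dst, const uint32_t *x,
                                  const uint32_t *y, size_t n)
    {
        uint64_t acc = 0;
        for (size_t i = 0; i < n; i++) {
            acc += (uint64_t)x[i] + ((uint64_t)y[i] << 1); /* psllq; paddq */
            dst[i] = (uint32_t)acc;
            acc >>= 32;                                    /* carry, 0..2 */
        }
        return (uint32_t)acc;
    }
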
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+
+	mov	PARAM_SRC2, %ebx
+	pxor	%mm0, %mm0		C initial carry
+
+	mov	PARAM_DST, %edx
+
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):
+	C eax	src1 end
+	C ebx	src2 end
+	C ecx	counter, limbs, negative
+	C edx	dst end
+	C mm0	carry
+
+	movd	(%ebx), %mm2
+	movd	(%eax), %mm1
+	psrlq	$32, %mm0
+	lea	4(%eax), %eax
+	lea	4(%ebx), %ebx
+
+	psllq	$1, %mm2
+	paddq	%mm2, %mm1
+
+	paddq	%mm1, %mm0
+
+	movd	%mm0, (%edx,%ecx,4)
+	add	$1, %ecx
+	jnz	L(top)
+
+
+	psrlq	$32, %mm0
+	mov	SAVE_EBX, %ebx
+	movd	%mm0, %eax
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/addmul_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
new file mode 100644
index 0000000..7810207
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -0,0 +1,189 @@
+dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		5.24
+C P6 model 13  (Dothan)		5.24
+C P4 model 0-1 (Willamette)	5
+C P4 model 2   (Northwood)	5
+C P4 model 3-4 (Prescott)	5
+
+C TODO:
+C  * Tweak eax/edx offsets in loop as to save some lea's
+C  * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C n		sp + 12
+C v0		sp + 16
+
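The recurrence that the software-pipelined loop below implements is simply
(a sketch; mpn_addmul_1c seeds carry from its fifth argument):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[0..n-1] += up[0..n-1] * v0, returning the final carry limb. */
    static uint32_t addmul_1_ref(uint32_t *rp, const uint32_t *up,
                                 size_t n, uint32_t v0, uint32_t carry)
    {
        for (size_t i = 0; i < n; i++) {
            uint64_t p = (uint64_t)up[i] * v0 + rp[i] + carry;
            rp[i] = (uint32_t)p;
            carry = (uint32_t)(p >> 32);
        }
        return carry;
    }
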
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+	pxor	%mm6, %mm6
+L(ent):	mov	4(%esp), %edx
+	mov	8(%esp), %eax
+	mov	12(%esp), %ecx
+	movd	16(%esp), %mm7
+	cmp	$4, %ecx
+	jnc	L(big)
+
+L(lp0):	movd	(%eax), %mm0
+	lea	4(%eax), %eax
+	movd	(%edx), %mm4
+	lea	4(%edx), %edx
+	pmuludq	%mm7, %mm0
+	paddq	%mm0, %mm4
+	paddq	%mm4, %mm6
+	movd	%mm6, -4(%edx)
+	psrlq	$32, %mm6
+	dec	%ecx
+	jnz	L(lp0)
+	movd	%mm6, %eax
+	emms
+	ret
+
+L(big):	and	$3, %ecx
+	je	L(0)
+	cmp	$2, %ecx
+	jc	L(1)
+	je	L(2)
+	jmp	L(3)			C FIXME: one case should fall through
+
+L(0):	movd	(%eax), %mm3
+	sub	12(%esp), %ecx		C loop count
+	lea	-16(%eax), %eax
+	lea	-12(%edx), %edx
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	movd	12(%edx), %mm5
+	pmuludq	%mm7, %mm0
+	movd	24(%eax), %mm1
+	paddq	%mm3, %mm5
+	movd	16(%edx), %mm4
+	jmp	L(00)
+
+L(1):	movd	(%eax), %mm2
+	sub	12(%esp), %ecx
+	lea	-12(%eax), %eax
+	lea	-8(%edx), %edx
+	movd	8(%edx), %mm4
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	paddq	%mm2, %mm4
+	movd	12(%edx), %mm5
+	jmp	L(01)
+
+L(2):	movd	(%eax), %mm1
+	sub	12(%esp), %ecx
+	lea	-8(%eax), %eax
+	lea	-4(%edx), %edx
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+	movd	4(%edx), %mm5
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	paddq	%mm1, %mm5
+	movd	8(%edx), %mm4
+	jmp	L(10)
+
+L(3):	movd	(%eax), %mm0
+	sub	12(%esp), %ecx
+	lea	-4(%eax), %eax
+	pmuludq	%mm7, %mm0
+	movd	8(%eax), %mm1
+	movd	(%edx), %mm4
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+	paddq	%mm0, %mm4
+	movd	4(%edx), %mm5
+
+	ALIGN(16)
+L(top):	pmuludq	%mm7, %mm2
+	paddq	%mm4, %mm6
+	movd	16(%eax), %mm3
+	paddq	%mm1, %mm5
+	movd	8(%edx), %mm4
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+L(10):	pmuludq	%mm7, %mm3
+	paddq	%mm5, %mm6
+	movd	20(%eax), %mm0
+	paddq	%mm2, %mm4
+	movd	12(%edx), %mm5
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+L(01):	pmuludq	%mm7, %mm0
+	paddq	%mm4, %mm6
+	movd	24(%eax), %mm1
+	paddq	%mm3, %mm5
+	movd	16(%edx), %mm4
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+L(00):	pmuludq	%mm7, %mm1
+	paddq	%mm5, %mm6
+	movd	28(%eax), %mm2
+	paddq	%mm0, %mm4
+	movd	20(%edx), %mm5
+	movd	%mm6, 12(%edx)
+	psrlq	$32, %mm6
+	lea	16(%eax), %eax
+	lea	16(%edx), %edx
+	add	$4, %ecx
+	jnz	L(top)
+
+L(end):	pmuludq	%mm7, %mm2
+	paddq	%mm4, %mm6
+	paddq	%mm1, %mm5
+	movd	8(%edx), %mm4
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm5, %mm6
+	paddq	%mm2, %mm4
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm4, %mm6
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+	movd	%mm6, %eax
+	emms
+	ret
+EPILOGUE()
+PROLOGUE(mpn_addmul_1c)
+	movd	20(%esp), %mm6
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..354300e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,141 @@
+dnl  Intel Atom  mpn_bdiv_dbm1c.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C			    cycles/limb
+C P5				 -
+C P6 model 0-8,10-12		 -
+C P6 model 9  (Banias)		 9.75
+C P6 model 13 (Dothan)
+C P4 model 0  (Willamette)
+C P4 model 1  (?)
+C P4 model 2  (Northwood)	 8.25
+C P4 model 3  (Prescott)
+C P4 model 4  (Nocona)
+C Intel Atom			 8
+C AMD K6			 -
+C AMD K7			 -
+C AMD K8
+C AMD K10
+
+C TODO: This code was optimised for atom-32.  Consider moving it back to the
+C	atom dir (atom currently grabs this code), and writing a 4-way
+C	version (7 c/l).
+
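For readers who don't speak MMX-flavoured m4, the loop in this file computes the same thing as GMP's portable fallback. A hedged C rendering (mirroring mpn/generic/bdiv_dbm1c.c, simplified to 32-bit limbs with no nails; it does not reproduce this file's scheduling):

```c
#include <stdint.h>

/* Sketch of mpn_bdiv_dbm1c semantics: h -= ap[i]*bd, limb by limb, with
   borrows rippling from the low product half into the high one. */
static uint32_t bdiv_dbm1c_ref(uint32_t *qp, const uint32_t *ap, long n,
                               uint32_t bd, uint32_t h)
{
    for (long i = 0; i < n; i++) {
        uint64_t p  = (uint64_t)ap[i] * bd;   /* full product p1:p0 */
        uint32_t p0 = (uint32_t)p;
        uint32_t p1 = (uint32_t)(p >> 32);
        uint32_t cy = h < p0;                 /* borrow out of the low half */
        h = h - p0;
        qp[i] = h;
        h = h - p1 - cy;                      /* fold borrow into high half */
    }
    return h;
}
```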
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL,  16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_RP,`PARAM_MUL')
+define(SAVE_UP,`PARAM_SIZE')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n',  `%ecx')
+define(`reg', `%edx')
+define(`cy', `%eax')	C contains the return value
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_bdiv_dbm1c)
+	mov	PARAM_SIZE, n		C size
+	mov	up, SAVE_UP
+	mov	PARAM_SRC, up
+	movd	PARAM_MUL, %mm7
+	mov	rp, SAVE_RP
+	mov	PARAM_DST, rp
+
+	movd	(up), %mm0
+	pmuludq	%mm7, %mm0
+	shr	n
+	mov	PARAM_CARRY, cy
+	jz	L(eq1)
+
+	movd	4(up), %mm1
+	jc	L(odd)
+
+	lea	4(up), up
+	pmuludq	%mm7, %mm1
+	movd	%mm0, reg
+	psrlq	$32, %mm0
+	sub	reg, cy
+	movd	%mm0, reg
+	movq	%mm1, %mm0
+	dec	n
+	mov	cy, (rp)
+	lea	4(rp), rp
+	jz	L(end)
+
+C	ALIGN(16)
+L(top):	movd	4(up), %mm1
+	sbb	reg, cy
+L(odd):	movd	%mm0, reg
+	psrlq	$32, %mm0
+	pmuludq	%mm7, %mm1
+	sub	reg, cy
+	lea	8(up), up
+	movd	%mm0, reg
+	movd	(up), %mm0
+	mov	cy, (rp)
+	sbb	reg, cy
+	movd	%mm1, reg
+	psrlq	$32, %mm1
+	sub	reg, cy
+	movd	%mm1, reg
+	pmuludq	%mm7, %mm0
+	dec	n
+	mov	cy, 4(rp)
+	lea	8(rp), rp
+	jnz	L(top)
+
+L(end):	sbb	reg, cy
+
+L(eq1):	movd	%mm0, reg
+	psrlq	$32, %mm0
+	mov	SAVE_UP, up
+	sub	reg, cy
+	movd	%mm0, reg
+	emms
+	mov	cy, (rp)
+	sbb	reg, cy
+
+	mov	SAVE_RP, rp
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
new file mode 100644
index 0000000..d5008f4
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/bdiv_q_1.asm
@@ -0,0 +1,234 @@
+dnl  Intel Pentium-4 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	movl	PARAM_DIVISOR, %ecx
+
+	movd	%ecx, %mm6
+	movl	PARAM_SHIFT, %ecx
+
+	movd	%ecx, %mm7		C shift
+
+	C
+
+	movl	PARAM_INVERSE, %ecx
+	movd	%ecx, %mm5		C inv
+
+	movl	PARAM_DST, %ecx
+	pxor	%mm1, %mm1		C initial carry limb
+	pxor	%mm0, %mm0		C initial carry bit
+
+	subl	$1, %edx
+	jz	L(done)
+
+	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+
+C The dependent chain here is as follows.
+C
+C					latency
+C	psubq	 s = (src-cbit) - climb	   2
+C	pmuludq	 q = s*inverse		   8
+C	pmuludq	 prod = q*divisor	   8
+C	psrlq	 climb = high(prod)	   2
+C					  --
+C					  20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
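In C terms, each trip through L(top) below does roughly the following. This is a sketch assuming 32-bit limbs, with `dinv` standing in for the precomputed PARAM_INVERSE; the PARAM_SHIFT normalization for even divisors is folded away for brevity (the asm handles it with %mm7/%mm4):

```c
#include <stdint.h>

/* Exact division {src,n} / d, d odd; dinv = d^-1 mod 2^32.  Only the
   carry structure is shown; unsigned wraparound matches the psubq. */
static void bdiv_q_1_ref(uint32_t *dst, const uint32_t *src, long n,
                         uint32_t d, uint32_t dinv)
{
    uint32_t cbit = 0, climb = 0;
    for (long i = 0; i < n; i++) {
        uint64_t t = (uint64_t)src[i] - cbit - climb;  /* may wrap: fine */
        cbit  = (uint32_t)(t >> 63);                   /* borrow, psrlq $63 */
        uint32_t q = (uint32_t)t * dinv;               /* exact quotient limb */
        climb = (uint32_t)(((uint64_t)q * d) >> 32);   /* high half of q*d */
        dst[i] = q;
    }
}
```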
+
+L(top):
+	C eax	src, incrementing
+	C ebx
+	C ecx	dst, incrementing
+	C edx	counter, size-1 iterations
+	C
+	C mm0	carry bit
+	C mm1	carry limb
+	C mm4	0x00000000FFFFFFFF
+	C mm5	inverse
+	C mm6	divisor
+	C mm7	shift
+
+	movd	(%eax), %mm2
+	movd	4(%eax), %mm3
+	addl	$4, %eax
+	punpckldq %mm3, %mm2
+
+	psrlq	%mm7, %mm2
+	pand	%mm4, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+	movq	%mm2, %mm0
+	psrlq	$63, %mm0		C new cbit
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+	addl	$4, %ecx
+
+	movq	%mm6, %mm1
+	pmuludq	%mm2, %mm1		C q*divisor
+	psrlq	$32, %mm1		C new climb
+
+L(entry):
+	subl	$1, %edx
+	jnz	L(top)
+
+L(done):
+	movd	(%eax), %mm2
+	psrlq	%mm7, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+
+	emms
+	ret
+
+EPILOGUE()
+
+	ALIGN(16)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_DIVISOR, %ecx
+
+	C eax	src
+	C ebx
+	C ecx	divisor
+	C edx	size-1
+
+	movl	%ecx, %eax
+	bsfl	%ecx, %ecx		C trailing twos
+
+	shrl	%cl, %eax		C d = divisor without twos
+	movd	%eax, %mm6
+	movd	%ecx, %mm7		C shift
+
+	shrl	%eax			C d/2
+
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ecx)
+	movzbl	(%eax,%ecx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	C
+
+	movd	%eax, %mm5		C inv
+
+	movd	%eax, %mm0		C inv
+
+	pmuludq	%mm5, %mm5		C inv*inv
+
+	C
+
+	pmuludq	%mm6, %mm5		C inv*inv*d
+	paddd	%mm0, %mm0		C 2*inv
+
+	C
+
+	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
+	pxor	%mm5, %mm5
+
+	paddd	%mm0, %mm5
+	pmuludq	%mm0, %mm0		C inv*inv
+
+	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+
+	C
+
+	pmuludq	%mm6, %mm0		C inv*inv*d
+	paddd	%mm5, %mm5		C 2*inv
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %ecx
+	pxor	%mm1, %mm1		C initial carry limb
+
+	C
+
+	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	movq	%mm6, %mm0
+	pmuludq	%mm5, %mm0
+	movd	%mm0, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	pxor	%mm0, %mm0		C initial carry bit
+	jmp	L(entry)
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
new file mode 100644
index 0000000..b3f3474
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_add_n.asm
@@ -0,0 +1,95 @@
+dnl  Intel Pentium-4 mpn_cnd_add_n -- conditional mpn addition.
+
+dnl  Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P6 model 0-8,10-12		 -
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 4.67
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 5
+C P4 model 3-4 (Prescott)	 5.25
+
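The condition is applied as a mask, never as a branch, so the instruction stream and memory traffic are independent of `cnd`; that is the point of this primitive for side-channel-sensitive callers. A hedged C equivalent (32-bit limbs; mpn_cnd_add_n is internal to GMP):

```c
#include <stdint.h>

/* rp[] = up[] + (cnd ? vp[] : 0), returning the carry; no branch on cnd. */
static uint32_t cnd_add_n_ref(uint32_t cnd, uint32_t *rp,
                              const uint32_t *up, const uint32_t *vp, long n)
{
    uint32_t mask = 0u - (cnd != 0);    /* the neg/sbb pair below */
    uint64_t acc = 0;                   /* carry accumulator, as in %mm0 */
    for (long i = 0; i < n; i++) {
        acc += (uint64_t)up[i] + (vp[i] & mask);
        rp[i] = (uint32_t)acc;
        acc >>= 32;
    }
    return (uint32_t)acc;               /* carry out */
}
```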
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST,  8)
+defframe(PARAM_CND,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_cnd_add_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+
+	mov	PARAM_CND, %eax
+	neg	%eax
+	sbb	%eax, %eax
+	movd	%eax, cnd
+
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):	movd	(%ebx,%ecx,4), %mm2
+	movd	(%eax,%ecx,4), %mm1
+	pand	cnd, %mm2
+	paddq	%mm2, %mm1
+
+	paddq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$32, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
new file mode 100644
index 0000000..339a23e
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/cnd_sub_n.asm
@@ -0,0 +1,114 @@
+dnl  Intel Pentium-4 mpn_cnd_sub_n -- conditional mpn subtraction.
+
+dnl  Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P6 model 0-8,10-12		 -
+C P6 model 9   (Banias)		 ?
+C P6 model 13  (Dothan)		 4.67
+C P4 model 0-1 (Willamette)	 ?
+C P4 model 2   (Northwood)	 5
+C P4 model 3-4 (Prescott)	 5.25
+
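This is the subtraction counterpart of cnd_add_n above, two-way unrolled so the borrow alternates between %mm0 and %mm1. A hypothetical caller sketch (mpn_cnd_sub_n is internal; the signature is assumed from gmp-impl.h):

```c
#include <gmp.h>

/* Hypothetical use: reduce r by the modulus iff cnd is nonzero, with
   timing independent of cnd.  Declared in gmp-impl.h, not gmp.h. */
static void reduce_once(mp_ptr rp, mp_srcptr modulus, mp_size_t n,
                        mp_limb_t cnd)
{
    mpn_cnd_sub_n (cnd, rp, rp, modulus, n);   /* rp -= cnd ? m : 0 */
}
```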
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST,  8)
+defframe(PARAM_CND,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_cnd_sub_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+
+	mov	PARAM_CND, %eax
+	neg	%eax
+	sbb	%eax, %eax
+	movd	%eax, cnd
+
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):	movd	(%ebx,%ecx,4), %mm2
+	movd	(%eax,%ecx,4), %mm1
+	pand	cnd, %mm2
+	psubq	%mm2, %mm1
+
+	psubq	%mm0, %mm1
+	movd	%mm1, (%edx,%ecx,4)
+
+	psrlq	$63, %mm1
+
+	add	$1, %ecx
+	jz	L(done_mm1)
+
+	movd	(%ebx,%ecx,4), %mm2
+	movd	(%eax,%ecx,4), %mm0
+	pand	cnd, %mm2
+	psubq	%mm2, %mm0
+
+	psubq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$63, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+L(done_mm1):
+	movd	%mm1, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/dive_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/dive_1.asm
new file mode 100644
index 0000000..0ceef5b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/dive_1.asm
@@ -0,0 +1,216 @@
+dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
+C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
+C that might be hidden by out-of-order execution, whereas divl is around 60.
+C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
+C faster.
+
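The inverse construction below (an 8-bit table lookup followed by two Newton doublings carried out in MMX registers) is GMP's standard binvert_limb recipe; each step doubles the number of correct low bits. In C, for 32-bit limbs:

```c
#include <stdint.h>

extern const unsigned char binvert_limb_table[128];  /* inverses mod 2^8 */

/* inv = d^-1 mod 2^32 for odd d: 8 -> 16 -> 32 correct bits. */
static uint32_t binvert_limb32(uint32_t d)
{
    uint32_t inv = binvert_limb_table[(d >> 1) & 0x7F];  /*  8 bits */
    inv = 2 * inv - inv * inv * d;                       /* 16 bits */
    inv = 2 * inv - inv * inv * d;                       /* 32 bits */
    return inv;
}
```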
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	movl	PARAM_DIVISOR, %ecx
+	subl	$1, %edx
+	jnz	L(two_or_more)
+
+	movl	(%eax), %eax
+	xorl	%edx, %edx
+
+	divl	%ecx
+	movl	PARAM_DST, %ecx
+
+	movl	%eax, (%ecx)
+	ret
+
+
+L(two_or_more):
+	C eax	src
+	C ebx
+	C ecx	divisor
+	C edx	size-1
+
+	movl	%ecx, %eax
+	bsfl	%ecx, %ecx		C trailing twos
+
+	shrl	%cl, %eax		C d = divisor without twos
+	movd	%eax, %mm6
+	movd	%ecx, %mm7		C shift
+
+	shrl	%eax			C d/2
+
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %ecx)
+	movzbl	(%eax,%ecx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	C
+
+	movd	%eax, %mm5		C inv
+
+	movd	%eax, %mm0		C inv
+
+	pmuludq	%mm5, %mm5		C inv*inv
+
+	C
+
+	pmuludq	%mm6, %mm5		C inv*inv*d
+	paddd	%mm0, %mm0		C 2*inv
+
+	C
+
+	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
+	pxor	%mm5, %mm5
+
+	paddd	%mm0, %mm5
+	pmuludq	%mm0, %mm0		C inv*inv
+
+	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+
+	C
+
+	pmuludq	%mm6, %mm0		C inv*inv*d
+	paddd	%mm5, %mm5		C 2*inv
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_DST, %ecx
+	pxor	%mm1, %mm1		C initial carry limb
+
+	C
+
+	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	movq	%mm6, %mm0
+	pmuludq	%mm5, %mm0
+	movd	%mm0, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	pxor	%mm0, %mm0		C initial carry bit
+
+
+C The dependent chain here is as follows.
+C
+C					latency
+C	psubq	 s = (src-cbit) - climb	   2
+C	pmuludq	 q = s*inverse		   8
+C	pmuludq	 prod = q*divisor	   8
+C	psrlq	 climb = high(prod)	   2
+C					  --
+C					  20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+	C eax	src, incrementing
+	C ebx
+	C ecx	dst, incrementing
+	C edx	counter, size-1 iterations
+	C
+	C mm0	carry bit
+	C mm1	carry limb
+	C mm4	0x00000000FFFFFFFF
+	C mm5	inverse
+	C mm6	divisor
+	C mm7	shift
+
+	movd	(%eax), %mm2
+	movd	4(%eax), %mm3
+	addl	$4, %eax
+	punpckldq %mm3, %mm2
+
+	psrlq	%mm7, %mm2
+	pand	%mm4, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+	movq	%mm2, %mm0
+	psrlq	$63, %mm0		C new cbit
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+	addl	$4, %ecx
+
+	movq	%mm6, %mm1
+	pmuludq	%mm2, %mm1		C q*divisor
+	psrlq	$32, %mm1		C new climb
+
+	subl	$1, %edx
+	jnz	L(top)
+
+
+L(done):
+	movd	(%eax), %mm2
+	psrlq	%mm7, %mm2		C src
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+
+	pmuludq	%mm5, %mm2		C s*inverse
+	movd	%mm2, (%ecx)		C q
+
+	emms
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
new file mode 100644
index 0000000..0146fab
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -0,0 +1,645 @@
+dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C "m" is written for what is m' in the paper, and "d" for d_norm, which
+C won't cause any confusion since it's only the normalized divisor that's of
+C any use in the code.  "b" is written for 2^N, the size of a limb, N being
+C 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
+C us have just a psubq on the dependent chain.
+C
+C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
+C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is stored.  The
+C latter is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Enhancements:
+C
+C The loop measures 32 cycles, but the dependent chain would suggest it
+C could be done with 30.  Not sure where to start looking for the extras.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case.  It doesn't seem worth checking for this though, since it
+C probably occurs infrequently.
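For reference, here is the quotient step that the integer loop below implements, as a hedged C sketch with 32-bit limbs. Unsigned wraparound in C matches the paddq/psubq wrapping in the asm; n2 is the running remainder, n10 the next normalized source limb, d the normalized divisor, m the precomputed inverse:

```c
#include <stdint.h>

/* One Granlund-Montgomery division step, as scheduled below. */
static uint32_t div_step(uint32_t *n2, uint32_t n10, uint32_t d, uint32_t m)
{
    uint32_t n1   = n10 >> 31;                      /* top bit of n10    */
    uint32_t nadj = n10 + ((0u - n1) & d);          /* ignore overflow   */
    uint32_t s    = *n2 + n1;                       /* n2+n1             */
    uint64_t q1x  = (((uint64_t)*n2 << 32) | nadj)  /* n2:nadj           */
                  + (uint64_t)m * s;                /*  + m*(n2+n1)      */
    uint32_t q1   = (uint32_t)(q1x >> 32);
    uint64_t dr   = (((uint64_t)*n2 << 32) | n10)   /* n                 */
                  - d - (uint64_t)q1 * d;           /* n - d - q1*d      */
    uint32_t mask = (uint32_t)(dr >> 32);           /* 0 or 0xFFFFFFFF   */
    *n2 = (uint32_t)dr + (d & mask);                /* addback if needed */
    return q1 + 1 + mask;                           /* q1+1, or q1       */
}
```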
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The inverse takes about 80-90 cycles to calculate, but after that the
+dnl  multiply is 32 c/l versus division at about 58 c/l.
+dnl
+dnl  At 4 limbs the div is a touch faster than the mul (and of course
+dnl  simpler), so start the mul from 5 limbs.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+define(SAVE_EDI,`PARAM_DIVISOR')
+define(SAVE_EBX,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	xorl	%edx, %edx		C carry if can't skip a div
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	movd	PARAM_PREINV_INVERSE, %mm4
+
+	movd	PARAM_PREINV_SHIFT, %mm7  C l
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+	movd	%edx, %mm0		C carry
+
+	movd	%edx, %mm1		C carry
+	movl	$0, %edx
+
+	movd	%ebp, %mm5		C d
+	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
+					C (the latter in case src==dst)
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+
+	movl	%edx, (%edi,%ecx,4)	C dst high limb
+	sbbl	$0, %ecx		C skip one division if high<divisor
+	movl	$32, %eax
+
+	subl	PARAM_PREINV_SHIFT, %eax
+	psllq	%mm7, %mm5		C d normalized
+	leal	(%edi,%ecx,4), %edi	C &dst[xsize+size-1]
+	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
+
+	movd	%eax, %mm6		C 32-l
+	jmp	L(start_preinv)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %edx
+
+	movl	PARAM_SIZE, %ecx
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	xorl	%edx, %edx		C initial carry (if can't skip a div)
+
+	movl	%esi, SAVE_ESI
+	movl	PARAM_SRC, %esi
+
+	movl	%ebp, SAVE_EBP
+	movl	PARAM_DIVISOR, %ebp
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	PARAM_XSIZE, %ebx
+	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
+
+	orl	%ecx, %ecx		C size
+	jz	L(no_skip_div)		C if size==0
+	movl	-4(%esi,%ecx,4), %eax	C src high limb
+
+	cmpl	%ebp, %eax		C high cmp divisor
+
+	cmovnc(	%eax, %edx)		C 0 if skip div, src high if not
+	movl	%edx, (%edi,%ecx,4)	C dst high limb
+
+	movl	$0, %edx
+	cmovc(	%eax, %edx)		C high is carry if high<divisor
+
+	sbbl	$0, %ecx		C size-1 if high<divisor
+L(no_skip_div):
+
+
+L(start_1c):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	src
+	C edi	&dst[xsize-1]
+	C ebp	divisor
+
+	leal	(%ebx,%ecx), %eax	C size+xsize
+	leal	-4(%esi,%ecx,4), %esi	C &src[size-1]
+	leal	(%edi,%ecx,4), %edi	C &dst[size+xsize-1]
+
+	cmpl	$MUL_THRESHOLD, %eax
+	jae	L(mul_by_inverse)
+
+
+	orl	%ecx, %ecx
+	jz	L(divide_no_integer)	C if size==0
+
+L(divide_integer):
+	C eax	scratch (quotient)
+	C ebx	xsize
+	C ecx	counter
+	C edx	carry
+	C esi	src, decrementing
+	C edi	dst, decrementing
+	C ebp	divisor
+
+	movl	(%esi), %eax
+	subl	$4, %esi
+
+	divl	%ebp
+
+	movl	%eax, (%edi)
+	subl	$4, %edi
+
+	subl	$1, %ecx
+	jnz	L(divide_integer)
+
+
+L(divide_no_integer):
+	orl	%ebx, %ebx
+	jnz	L(divide_fraction)	C if xsize!=0
+
+L(divide_done):
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_EBP, %ebp
+	movl	%edx, %eax
+	ret
+
+
+L(divide_fraction):
+	C eax	scratch (quotient)
+	C ebx	counter
+	C ecx
+	C edx	carry
+	C esi
+	C edi	dst, decrementing
+	C ebp	divisor
+
+	movl	$0, %eax
+
+	divl	%ebp
+
+	movl	%eax, (%edi)
+	subl	$4, %edi
+
+	subl	$1, %ebx
+	jnz	L(divide_fraction)
+
+	jmp	L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+	C eax
+	C ebx	xsize
+	C ecx	size
+	C edx	carry
+	C esi	&src[size-1]
+	C edi	&dst[size+xsize-1]
+	C ebp	divisor
+
+	bsrl	%ebp, %eax		C 31-l
+	movd	%edx, %mm0		C carry
+	movd	%edx, %mm1		C carry
+	movl	%ecx, %edx		C size
+	movl	$31, %ecx
+
+	C
+
+	xorl	%eax, %ecx		C l = leading zeros on d
+	addl	$1, %eax
+
+	shll	%cl, %ebp		C d normalized
+	movd	%ecx, %mm7		C l
+	movl	%edx, %ecx		C size
+
+	movd	%eax, %mm6		C 32-l
+	movl	$-1, %edx
+	movl	$-1, %eax
+
+	C
+
+	subl	%ebp, %edx		C (b-d)-1 so  edx:eax = b*(b-d)-1
+
+	divl	%ebp			C floor (b*(b-d)-1 / d)
+	movd	%ebp, %mm5		C d
+
+	C
+
+	movd	%eax, %mm4		C m
+
+
+L(start_preinv):
+	C eax	inverse
+	C ebx	xsize
+	C ecx	size
+	C edx
+	C esi	&src[size-1]
+	C edi	&dst[size+xsize-1]
+	C ebp
+	C
+	C mm0	carry
+	C mm1	carry
+	C mm2
+	C mm4	m
+	C mm5	d
+	C mm6	32-l
+	C mm7	l
+
+	psllq	%mm7, %mm0		C n2 = carry << l, for size==0
+
+	subl	$1, %ecx
+	jb	L(integer_none)
+
+	movd	(%esi), %mm0		C src high limb
+	punpckldq %mm1, %mm0
+	psrlq	%mm6, %mm0		C n2 = high (carry:srchigh << l)
+	jz	L(integer_last)
+
+
+C The dependent chain here consists of
+C
+C	2   paddd    n1+n2
+C	8   pmuludq  m*(n1+n2)
+C	2   paddq    n2:nadj + m*(n1+n2)
+C	2   psrlq    q1
+C	8   pmuludq  d*q1
+C	2   psubq    (n-d)-q1*d
+C	2   psrlq    high n-(q1+1)*d mask
+C	2   pand     d masked
+C	2   paddd    n2+d addback
+C	--
+C	30
+C
+C But it seems to run at 32 cycles, so presumably there's something else
+C going on.
+
+	ALIGN(16)
+L(integer_top):
+	C eax
+	C ebx
+	C ecx	counter, size-1 to 0
+	C edx
+	C esi	src, decrementing
+	C edi	dst, decrementing
+	C
+	C mm0	n2
+	C mm4	m
+	C mm5	d
+	C mm6	32-l
+	C mm7	l
+
+	ASSERT(b,`C n2<d
+	 movd	%mm0, %eax
+	 movd	%mm5, %edx
+	 cmpl	%edx, %eax')
+
+	movd	-4(%esi), %mm1		C next src limbs
+	movd	(%esi), %mm2
+	leal	-4(%esi), %esi
+
+	punpckldq %mm2, %mm1
+	psrlq	%mm6, %mm1		C n10
+
+	movq	%mm1, %mm2		C n10
+	movq	%mm1, %mm3		C n10
+	psrad	$31, %mm1		C -n1
+	pand	%mm5, %mm1		C -n1 & d
+	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
+
+	psrld	$31, %mm2		C n1
+	paddd	%mm0, %mm2		C n2+n1
+	punpckldq %mm0, %mm1		C n2:nadj
+
+	pmuludq	%mm4, %mm2		C m*(n2+n1)
+
+	C
+
+	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
+	pxor	%mm2, %mm2		C break dependency, saves 4 cycles
+	pcmpeqd	%mm2, %mm2		C FF...FF
+	psrlq	$63, %mm2		C 1
+
+	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
+
+	paddd	%mm1, %mm2		C q1+1
+	pmuludq	%mm5, %mm1		C q1*d
+
+	punpckldq %mm0, %mm3		C n = n2:n10
+	pxor	%mm0, %mm0
+
+	psubq	%mm5, %mm3		C n - d
+
+	C
+
+	psubq	%mm1, %mm3		C n - (q1+1)*d
+
+	por	%mm3, %mm0		C copy remainder -> new n2
+	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
+
+	ASSERT(be,`C 0 or -1
+	 movd	%mm3, %eax
+	 addl	$1, %eax
+	 cmpl	$1, %eax')
+
+	paddd	%mm3, %mm2		C q
+	pand	%mm5, %mm3		C mask & d
+
+	paddd	%mm3, %mm0		C addback if necessary
+	movd	%mm2, (%edi)
+	leal	-4(%edi), %edi
+
+	subl	$1, %ecx
+	ja	L(integer_top)
+
+
+L(integer_last):
+	C eax
+	C ebx	xsize
+	C ecx
+	C edx
+	C esi	&src[0]
+	C edi	&dst[xsize]
+	C
+	C mm0	n2
+	C mm4	m
+	C mm5	d
+	C mm6
+	C mm7	l
+
+	ASSERT(b,`C n2<d
+	 movd	%mm0, %eax
+	 movd	%mm5, %edx
+	 cmpl	%edx, %eax')
+
+	movd	(%esi), %mm1		C src[0]
+	psllq	%mm7, %mm1		C n10
+
+	movq	%mm1, %mm2		C n10
+	movq	%mm1, %mm3		C n10
+	psrad	$31, %mm1		C -n1
+	pand	%mm5, %mm1		C -n1 & d
+	paddd	%mm2, %mm1		C nadj = n10+(-n1&d), ignore overflow
+
+	psrld	$31, %mm2		C n1
+	paddd	%mm0, %mm2		C n2+n1
+	punpckldq %mm0, %mm1		C n2:nadj
+
+	pmuludq	%mm4, %mm2		C m*(n2+n1)
+
+	C
+
+	paddq	%mm2, %mm1		C n2:nadj + m*(n2+n1)
+	pcmpeqd	%mm2, %mm2		C FF...FF
+	psrlq	$63, %mm2		C 1
+
+	psrlq	$32, %mm1		C q1 = high(n2:nadj + m*(n2+n1))
+	paddd	%mm1, %mm2		C q1+1
+
+	pmuludq	%mm5, %mm1		C q1*d
+	punpckldq %mm0, %mm3		C n
+	psubq	%mm5, %mm3		C n - d
+	pxor	%mm0, %mm0
+
+	C
+
+	psubq	%mm1, %mm3		C n - (q1+1)*d
+
+	por	%mm3, %mm0		C remainder -> n2
+	psrlq	$32, %mm3		C high n - (q1+1)*d, 0 or -1
+
+	ASSERT(be,`C 0 or -1
+	 movd	%mm3, %eax
+	 addl	$1, %eax
+	 cmpl	$1, %eax')
+
+	paddd	%mm3, %mm2		C q
+	pand	%mm5, %mm3		C mask & d
+
+	paddd	%mm3, %mm0		C addback if necessary
+	movd	%mm2, (%edi)
+	leal	-4(%edi), %edi
+
+
+L(integer_none):
+	C eax
+	C ebx	xsize
+
+	orl	%ebx, %ebx
+	jnz	L(fraction_some)	C if xsize!=0
+
+
+L(fraction_done):
+	movl	SAVE_EBP, %ebp
+	psrld	%mm7, %mm0		C remainder
+
+	movl	SAVE_EDI, %edi
+	movd	%mm0, %eax
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+	emms
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+
+L(fraction_some):
+	C eax
+	C ebx	xsize
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[xsize-1]
+	C ebp
+
+
+L(fraction_top):
+	C eax
+	C ebx	counter, xsize iterations
+	C ecx
+	C edx
+	C esi	src, decrementing
+	C edi	dst, decrementing
+	C
+	C mm0	n2
+	C mm4	m
+	C mm5	d
+	C mm6	32-l
+	C mm7	l
+
+	ASSERT(b,`C n2<d
+	 movd	%mm0, %eax
+	 movd	%mm5, %edx
+	 cmpl	%edx, %eax')
+
+	movq	%mm0, %mm1		C n2
+	pmuludq	%mm4, %mm0		C m*n2
+
+	pcmpeqd	%mm2, %mm2
+	psrlq	$63, %mm2
+
+	C
+
+	psrlq	$32, %mm0		C high(m*n2)
+
+	paddd	%mm1, %mm0		C q1 = high(n2:0 + m*n2)
+
+	paddd	%mm0, %mm2		C q1+1
+	pmuludq	%mm5, %mm0		C q1*d
+
+	psllq	$32, %mm1		C n = n2:0
+	psubq	%mm5, %mm1		C n - d
+
+	C
+
+	psubq	%mm0, %mm1		C r = n - (q1+1)*d
+	pxor	%mm0, %mm0
+
+	por	%mm1, %mm0		C r -> n2
+	psrlq	$32, %mm1		C high n - (q1+1)*d, 0 or -1
+
+	ASSERT(be,`C 0 or -1
+	 movd	%mm1, %eax
+	 addl	$1, %eax
+	 cmpl	$1, %eax')
+
+	paddd	%mm1, %mm2		C q
+	pand	%mm5, %mm1		C mask & d
+
+	paddd	%mm1, %mm0		C addback if necessary
+	movd	%mm2, (%edi)
+	leal	-4(%edi), %edi
+
+	subl	$1, %ebx
+	jne	L(fraction_top)
+
+
+	jmp	L(fraction_done)
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h b/third_party/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a047a51
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -0,0 +1,213 @@
+/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2600 MHz P4 Northwood */
+/* FFT tuning limit = 23,700,309 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */
+
+#define MOD_1_NORM_THRESHOLD                 5
+#define MOD_1_UNNORM_THRESHOLD              14
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 2  /* 4.36% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD             16
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           21
+
+#define DIV_1_VS_MUL_1_PERCENT             358
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD               101
+#define MUL_TOOM44_THRESHOLD               284
+#define MUL_TOOM6H_THRESHOLD               406
+#define MUL_TOOM8H_THRESHOLD               592
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     101
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     191
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     189
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     195
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     151
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 51
+#define SQR_TOOM3_THRESHOLD                163
+#define SQR_TOOM4_THRESHOLD                254
+#define SQR_TOOM6_THRESHOLD                614
+#define SQR_TOOM8_THRESHOLD                842
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               19
+#define SQRMOD_BNM1_THRESHOLD               23
+
+#define MUL_FFT_MODF_THRESHOLD             824  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    824, 5}, {     29, 6}, {     15, 5}, {     33, 6}, \
+    {     17, 5}, {     36, 6}, {     19, 5}, {     39, 6}, \
+    {     29, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     48, 7}, {     29, 8}, \
+    {     15, 7}, {     37, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     99, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {    103,11}, \
+    {     31,10}, {     63, 9}, {    143,10}, {     79, 9}, \
+    {    167,10}, {     95, 9}, {    191,10}, {    111,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    159,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    271,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    335,11}, {    191,10}, {    383, 9}, {    799,10}, \
+    {    415,11}, {    223,12}, {    127,11}, {    255,10}, \
+    {    527,11}, {    287,10}, {    607, 9}, {   1215,11}, \
+    {    319,10}, {    671,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,10}, {    863,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1119, 9}, {   2239,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471, 9}, {   2943,12}, \
+    {    383,11}, {    799,10}, {   1599,11}, {    863,12}, \
+    {    447,11}, {    927,10}, {   1855,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1119,12}, {    575,11}, \
+    {   1215,10}, {   2431,11}, {   1247,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,10}, {   2943,13}, \
+    {    383,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,10}, {   3455,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,10}, {   4479,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1343,11}, \
+    {   2687,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2495,11}, {   4991,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1535,12}, {   3135,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3967,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,12}, \
+    {   4991,14}, {   1279,13}, {   2687,12}, {   5503,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 167
+#define MUL_FFT_THRESHOLD                 7808
+
+#define SQR_FFT_MODF_THRESHOLD             560  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    560, 5}, {     33, 6}, {     17, 5}, {     35, 6}, \
+    {     33, 7}, {     17, 6}, {     36, 7}, {     19, 6}, \
+    {     39, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 7}, {     55, 8}, \
+    {     31, 7}, {     63, 8}, {     43, 9}, {     23, 8}, \
+    {     55, 9}, {     31, 8}, {     67, 9}, {     39, 8}, \
+    {     79, 9}, {     47, 8}, {     95, 9}, {     55,10}, \
+    {     31, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    135,10}, {     79, 9}, \
+    {    159,10}, {    111,11}, {     63,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127, 9}, {    511, 8}, {   1023, 9}, \
+    {    527,11}, {    159,10}, {    319, 9}, {    639,10}, \
+    {    351,11}, {    191,10}, {    431,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543,11}, {    287,10}, \
+    {    607, 9}, {   1215,11}, {    319,10}, {    639,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1119,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    927,10}, {   1855,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1055,10}, {   2111,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    895,11}, {   1855,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2239,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,11}, {   3455,13}, \
+    {    895,12}, {   1983,14}, {    511,13}, {   1023,12}, \
+    {   2239,13}, {   1151,12}, {   2495,11}, {   4991,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,12}, {   4991,14}, {   1279,13}, \
+    {   2687,12}, {   5503,13}, {   8192,14}, {  16384,15}, \
+    {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 149
+#define SQR_FFT_THRESHOLD                 4800
+
+#define MULLO_BASECASE_THRESHOLD            12
+#define MULLO_DC_THRESHOLD                  44
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD            13
+#define SQRLO_DC_THRESHOLD                  42
+#define SQRLO_SQR_THRESHOLD               9449
+
+#define DC_DIV_QR_THRESHOLD                 38
+#define DC_DIVAPPR_Q_THRESHOLD             105
+#define DC_BDIV_QR_THRESHOLD                52
+#define DC_BDIV_Q_THRESHOLD                 83
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               158
+#define INV_APPR_THRESHOLD                 118
+
+#define BINV_NEWTON_THRESHOLD              342
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               2130
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD               60
+#define MU_BDIV_QR_THRESHOLD              1652
+#define MU_BDIV_Q_THRESHOLD               2089
+
+#define POWM_SEC_TABLE  1,22,96,446,723,1378
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               298
+#define SET_STR_PRECOMPUTE_THRESHOLD       960
+
+#define FAC_DSC_THRESHOLD                  212
+#define FAC_ODD_THRESHOLD                   71
+
+#define MATRIX22_STRASSEN_THRESHOLD         26
+#define HGCD2_DIV1_METHOD                    3  /* 0.68% faster than 1 */
+#define HGCD_THRESHOLD                      80
+#define HGCD_APPR_THRESHOLD                138
+#define HGCD_REDUCE_THRESHOLD             4455
+#define GCD_DC_THRESHOLD                   365
+#define GCDEXT_DC_THRESHOLD                245
+#define JACOBI_BASE_METHOD                   4  /* 23.41% faster than 1 */
+
+/* Tuneup completed successfully, took 63807 seconds */
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ee88bab
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_1.asm
@@ -0,0 +1,166 @@
+dnl  x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimize.  The present code was written quite straightforwardly.
+C  * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
+C  * Write a cps function that uses sse2 insns.
+
+C                           cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)     16
+C P4 model 3-4 (Prescott)      18
+
+C INPUT PARAMETERS
+C ap		sp + 4
+C n		sp + 8
+C b		sp + 12
+C cps		sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`ap',     `%edx')
+define(`n',      `%eax')
+
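The loop keeps a 64-bit value r congruent mod b to the limbs consumed so far, using the residues B1modb = 2^32 mod b and B2modb = 2^64 mod b precomputed by the cps routine. A hedged sketch of the invariant, with the normalization details that keep the 64-bit sums from overflowing omitted (see mpn/generic/mod_1_1.c for the real bounds):

```c
#include <stdint.h>

/* r stays congruent mod b to the part of {ap,n} processed so far. */
static uint64_t mod_1_1_core(const uint32_t *ap, long n,
                             uint32_t B1, uint32_t B2)
{
    uint64_t r = (uint64_t)ap[n - 1] * B1 + ap[n - 2];
    for (long i = n - 3; i >= 0; i--)
        r = (uint64_t)(uint32_t)r * B1   /* low word:  rl * (2^32 mod b) */
          + (r >> 32) * B2               /* high word: rh * (2^64 mod b) */
          + ap[i];
    return r;   /* the post-loop code folds this below b and unshifts */
}
```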
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+	push	%ebx
+	mov	8(%esp), ap
+	mov	12(%esp), n
+	mov	20(%esp), %ecx
+	movd	8(%ecx), B1modb
+	movd	12(%ecx), B2modb
+
+	lea	-4(ap,n,4), ap
+
+C FIXME: See comment in generic/mod_1_1.c.
+	movd	(ap), %mm7
+	movd	-4(ap), %mm4
+	pmuludq B1modb, %mm7
+	paddq	%mm4, %mm7
+	add	$-2, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	movq	%mm7, %mm6
+	psrlq	$32, %mm7		C rh
+	movd	-8(ap), %mm0
+	add	$-4, ap
+	pmuludq	B2modb, %mm7
+	pmuludq	B1modb, %mm6
+	add	$-1, n
+	paddq	%mm0, %mm7
+	paddq	%mm6, %mm7
+	jnz	L(top)
+
+L(end):	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+	pand	%mm7, %mm4		C rl
+	psrlq	$32, %mm7		C rh
+	pmuludq	B1modb, %mm7		C rh,cl
+	paddq	%mm4, %mm7		C rh,rl
+	movd	4(%ecx), %mm4		C cnt
+	psllq	%mm4, %mm7		C rh,rl normalized
+	movq	%mm7, %mm2		C rl in low half
+	psrlq	$32, %mm7		C rh
+	movd	(%ecx), %mm1		C bi
+	pmuludq	%mm7, %mm1		C qh,ql
+	paddq	%mm2, %mm1		C qh-1,ql
+	movd	%mm1, %ecx		C ql
+	psrlq	$32, %mm1		C qh-1
+	movd	16(%esp), %mm3		C b
+	pmuludq	%mm1, %mm3		C (qh-1) * b
+	psubq	%mm3, %mm2		C r in low half (could use psubd)
+	movd	%mm2, %eax		C r
+	mov	16(%esp), %ebx
+	sub	%ebx, %eax		C r
+	cmp	%eax, %ecx
+	lea	(%eax,%ebx), %edx
+	cmovc(	%edx, %eax)
+	movd	%mm4, %ecx		C cnt
+	cmp	%ebx, %eax
+	jae	L(fix)
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+
+L(fix):	sub	%ebx, %eax
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+C CAUTION: This is the same code as in k7/mod_1_1.asm
+	push	%ebp
+	mov	12(%esp), %ebp
+	push	%esi
+	bsr	%ebp, %ecx
+	push	%ebx
+	xor	$31, %ecx
+	mov	16(%esp), %esi
+	sal	%cl, %ebp
+	mov	%ebp, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebp
+	mov	%eax, (%esi)		C store bi
+	mov	%ecx, 4(%esi)		C store cnt
+	xor	%ebx, %ebx
+	sub	%ebp, %ebx
+	mov	$1, %edx
+	shld	%cl, %eax, %edx
+	imul	%edx, %ebx
+	mul	%ebx
+	add	%ebx, %edx
+	not	%edx
+	imul	%ebp, %edx
+	add	%edx, %ebp
+	cmp	%edx, %eax
+	cmovc(	%ebp, %edx)
+	shr	%cl, %ebx
+	mov	%ebx, 8(%esi)		C store B1modb
+	shr	%cl, %edx
+	mov	%edx, 12(%esi)		C store B2modb
+	pop	%ebx
+	pop	%esi
+	pop	%ebp
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
new file mode 100644
index 0000000..eb2edb6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mod_1_4.asm
@@ -0,0 +1,269 @@
+dnl  x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimize.  The present code was written quite straightforwardly.
+C  * Optimize post-loop reduction code.
+C  * Write a cps function that uses sse2 insns.
+
+C			    cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		3.4
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4
+C P4 model 3-4 (Prescott)	4.5
+
+C INPUT PARAMETERS
+C ap		sp + 4
+C n		sp + 8
+C b		sp + 12
+C cps		sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`B3modb', `%mm3')
+define(`B4modb', `%mm4')
+define(`B5modb', `%mm5')
+define(`ap',     `%edx')
+define(`n',      `%eax')
+
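This is the 4-way analogue of mod_1_1: the residues B1..B5 = 2^32, 2^64, ..., 2^160 mod b let four source limbs plus both halves of the running value fold into one 64-bit sum per iteration. Sketch of the steady-state step, with the same overflow caveats as mod_1_1 (the cps routine's bounds on b keep the sum safe):

```c
#include <stdint.h>

/* One unrolled iteration of mpn_mod_1s_4p: absorb ap[0..3] into r.
   B[k] = 2^(32*k) mod b, precomputed; B[0] is unused here. */
static uint64_t mod_1_4_step(uint64_t r, const uint32_t *ap,
                             const uint32_t B[6])
{
    return (uint64_t)ap[0]
         + (uint64_t)ap[1] * B[1]
         + (uint64_t)ap[2] * B[2]
         + (uint64_t)ap[3] * B[3]
         + (uint64_t)(uint32_t)r * B[4]   /* rl * (2^128 mod b) */
         + (r >> 32) * B[5];              /* rh * (2^160 mod b) */
}
```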
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	push	%ebx
+	mov	8(%esp), ap
+	mov	12(%esp), n
+	mov	20(%esp), %ecx
+
+	movd	8(%ecx), B1modb
+	movd	12(%ecx), B2modb
+	movd	16(%ecx), B3modb
+	movd	20(%ecx), B4modb
+	movd	24(%ecx), B5modb
+
+	mov	n, %ebx
+	lea	-4(ap,n,4), ap
+	and	$3, %ebx
+	je	L(b0)
+	cmp	$2, %ebx
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	movd	-4(ap), %mm7
+	pmuludq	B1modb, %mm7
+	movd	-8(ap), %mm6
+	paddq	%mm6, %mm7
+	movd	(ap), %mm6
+	pmuludq	B2modb, %mm6
+	paddq	%mm6, %mm7
+	lea	-24(ap), ap
+	add	$-3, n
+	jz	L(end)
+	jmp	L(top)
+
+L(b0):	movd	-8(ap), %mm7
+	pmuludq	B1modb, %mm7
+	movd	-12(ap), %mm6
+	paddq	%mm6, %mm7
+	movd	-4(ap), %mm6
+	pmuludq	B2modb, %mm6
+	paddq	%mm6, %mm7
+	movd	(ap), %mm6
+	pmuludq	B3modb, %mm6
+	paddq	%mm6, %mm7
+	lea	-28(ap), ap
+	add	$-4, n
+	jz	L(end)
+	jmp	L(top)
+
+L(b1):	movd	(ap), %mm7
+	lea	-16(ap), ap
+	dec	n
+	jz	L(x)
+	jmp	L(top)
+
+L(b2):	movd	-4(ap), %mm7		C rl
+	punpckldq (ap), %mm7		C rh
+	lea	-20(ap), ap
+	add	$-2, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	movd	4(ap), %mm0
+	pmuludq	B1modb, %mm0
+	movd	0(ap), %mm6
+	paddq	%mm6, %mm0
+
+	movd	8(ap), %mm6
+	pmuludq	B2modb, %mm6
+	paddq	%mm6, %mm0
+
+	movd	12(ap), %mm6
+	pmuludq	B3modb, %mm6
+	paddq	%mm6, %mm0
+
+	movq	%mm7, %mm6
+	psrlq	$32, %mm7		C rh
+	pmuludq	B5modb, %mm7
+	pmuludq	B4modb, %mm6
+
+	paddq	%mm0, %mm7
+	paddq	%mm6, %mm7
+
+	add	$-16, ap
+	add	$-4, n
+	jnz	L(top)
+
+L(end):	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+	pand	%mm7, %mm4		C rl
+	psrlq	$32, %mm7		C rh
+	pmuludq	B1modb, %mm7		C rh,cl
+	paddq	%mm4, %mm7		C rh,rl
+L(x):	movd	4(%ecx), %mm4		C cnt
+	psllq	%mm4, %mm7		C rh,rl normalized
+	movq	%mm7, %mm2		C rl in low half
+	psrlq	$32, %mm7		C rh
+	movd	(%ecx), %mm1		C bi
+	pmuludq	%mm7, %mm1		C qh,ql
+	paddq	%mm2, %mm1		C qh-1,ql
+	movd	%mm1, %ecx		C ql
+	psrlq	$32, %mm1		C qh-1
+	movd	16(%esp), %mm3		C b
+	pmuludq	%mm1, %mm3		C (qh-1) * b
+	psubq	%mm3, %mm2		C r in low half (could use psubd)
+	movd	%mm2, %eax		C r
+	mov	16(%esp), %ebx
+	sub	%ebx, %eax		C r
+	cmp	%eax, %ecx
+	lea	(%eax,%ebx), %edx
+	cmovc(	%edx, %eax)
+	movd	%mm4, %ecx		C cnt
+	cmp	%ebx, %eax
+	jae	L(fix)
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+
+L(fix):	sub	%ebx, %eax
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in k7/mod_1_4.asm
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
+	mov	24(%esp), %ebx
+	bsr	%ebx, %ecx
+	xor	$31, %ecx
+	sal	%cl, %ebx		C b << cnt
+	mov	%ebx, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebx
+	xor	%edi, %edi
+	sub	%ebx, %edi
+	mov	$1, %esi
+	mov	%eax, (%ebp)		C store bi
+	mov	%ecx, 4(%ebp)		C store cnt
+	shld	%cl, %eax, %esi
+	imul	%edi, %esi
+	mov	%eax, %edi
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 8(%ebp)		C store B1modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 12(%ebp)		C store B2modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 16(%ebp)		C store B3modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 20(%ebp)		C store B4modb
+
+	not	%edx
+	imul	%ebx, %edx
+	add	%edx, %ebx
+	cmp	%edx, %eax
+	cmovnc(	%edx, %ebx)
+
+	shr	%cl, %ebx
+	mov	%ebx, 24(%ebp)		C store B5modb
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
new file mode 100644
index 0000000..31e25b7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mod_34lsub1.asm
@@ -0,0 +1,175 @@
+dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl  Copyright 2000-2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C Pentium4: 1.0 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Enhancements:
+C
+C There might be a couple of cycles to save by using plain integer code for
+C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
+C about 46 (inclusive of some function call overheads).
+
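The underlying identity: 2^24 ≡ 1 (mod 2^24-1), and a 32-bit limb spans 24+8 bits, so limb i carries weight 2^(8*(i mod 3)). Limbs can therefore be bucketed into three 64-bit accumulators and recombined at 8-bit offsets, exactly as the combine code does. A hedged C rendering, modulo the small-size special cases:

```c
#include <stdint.h>

/* Returns a value congruent to {src,n} mod 2^24-1 (not fully reduced;
   mpn_mod_34lsub1's contract allows that, the caller folds further). */
static uint64_t mod_34lsub1_ref(const uint32_t *src, long n)
{
    uint64_t s[3] = { 0, 0, 0 };
    for (long i = 0; i < n; i++)
        s[i % 3] += src[i];              /* limb weight 2^(8*(i%3)) */

    /* each accumulator's high half carries one more factor of 2^8 */
    uint64_t r = ((s[0] & 0xFFFFFFFF) + (s[2] >> 32))          /* 2^0  */
               + (((s[1] & 0xFFFFFFFF) + (s[0] >> 32)) << 8)   /* 2^8  */
               + (((s[2] & 0xFFFFFFFF) + (s[1] >> 32)) << 16); /* 2^16 */
    return (r & 0xFFFFFF) + (r >> 24);   /* one fold at 24 bits */
}
```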
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+define(SAVE_ESI, `PARAM_SIZE')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+	movl	(%edx), %eax
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+	jne	L(one)
+
+	movl	4(%edx), %edx
+	movl	%eax, %ecx
+	shrl	$24, %eax		C src[0] high
+
+	andl	$0x00FFFFFF, %ecx	C src[0] low
+	addl	%ecx, %eax
+
+	movl	%edx, %ecx
+	shll	$8, %edx
+
+	shrl	$16, %ecx		C src[1] low
+	addl	%ecx, %eax
+
+	andl	$0x00FFFF00, %edx	C src[1] high
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	pxor	%mm0, %mm0
+	pxor	%mm1, %mm1
+	pxor	%mm2, %mm2
+
+	pcmpeqd	%mm7, %mm7
+	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits
+
+	pcmpeqd	%mm6, %mm6
+	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits
+
+L(top):
+	C eax
+	C ebx
+	C ecx	counter, size-2 to 0, -1 or -2
+	C edx	src, incrementing
+	C
+	C mm0	sum 0mod3
+	C mm1	sum 1mod3
+	C mm2	sum 2mod3
+	C mm3
+	C mm4
+	C mm5
+	C mm6	0x0000000000FFFFFF
+	C mm7	0x00000000FFFFFFFF
+
+	movd	(%edx), %mm3
+	paddq	%mm3, %mm0
+
+	movd	4(%edx), %mm3
+	paddq	%mm3, %mm1
+
+	movd	8(%edx), %mm3
+	paddq	%mm3, %mm2
+
+	addl	$12, %edx
+	subl	$3, %ecx
+	ja	L(top)
+
+
+	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+	addl	$1, %ecx
+	js	L(combine)		C 0 more
+
+	movd	(%edx), %mm3
+	paddq	%mm3, %mm0
+
+	jz	L(combine)		C 1 more
+
+	movd	4(%edx), %mm3
+	paddq	%mm3, %mm1
+
+L(combine):
+	movq	%mm7, %mm3		C low halves
+	pand	%mm0, %mm3
+
+	movq	%mm7, %mm4
+	pand	%mm1, %mm4
+
+	movq	%mm7, %mm5
+	pand	%mm2, %mm5
+
+	psrlq	$32, %mm0		C high halves
+	psrlq	$32, %mm1
+	psrlq	$32, %mm2
+
+	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
+	paddq	%mm1, %mm5
+	paddq	%mm2, %mm3
+
+	psllq	$8, %mm4		C combine at respective offsets
+	psllq	$16, %mm5
+	paddq	%mm4, %mm3
+	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits
+
+	pand	%mm3, %mm6		C fold at 24 bits
+	psrlq	$24, %mm3
+
+	paddq	%mm6, %mm3
+	movd	%mm3, %eax
+
+	ASSERT(z,	C nothing left in high dword
+	`psrlq	$32, %mm3
+	movd	%mm3, %ecx
+	orl	%ecx, %ecx')
+
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mode1o.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mode1o.asm
new file mode 100644
index 0000000..aa9ef31
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mode1o.asm
@@ -0,0 +1,175 @@
+dnl  Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movd	PARAM_CARRY, %mm1
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	pxor	%mm1, %mm1		C carry limb
+L(start_1c):
+	movl	PARAM_DIVISOR, %eax
+
+	movd	PARAM_DIVISOR, %mm7
+
+	shrl	%eax
+
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	C
+
+	movd	%eax, %mm6		C inv
+
+	movd	%eax, %mm0		C inv
+
+	pmuludq	%mm6, %mm6		C inv*inv
+
+	C
+
+	pmuludq	%mm7, %mm6		C inv*inv*d
+	paddd	%mm0, %mm0		C 2*inv
+
+	C
+
+	psubd	%mm6, %mm0		C inv = 2*inv - inv*inv*d
+	pxor	%mm6, %mm6
+
+	paddd	%mm0, %mm6
+	pmuludq	%mm0, %mm0		C inv*inv
+
+	C
+
+	pmuludq	%mm7, %mm0		C inv*inv*d
+	paddd	%mm6, %mm6		C 2*inv
+
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	C
+
+	psubd	%mm0, %mm6		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	movd	%mm6, %eax
+	imul	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
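+
+C The sequence above is the standard limb-inverse computation; as a rough
+C C model (binvert_limb_table gives an inverse of odd d good to 8 bits,
+C and each Newton step doubles the number of correct low bits):
+C
+C	  uint32_t inv = binvert_limb_table[(d >> 1) & 0x7F];	/* 8 bits  */
+C	  inv = 2*inv - inv*inv*d;				/* 16 bits */
+C	  inv = 2*inv - inv*inv*d;				/* 32 bits */
+C	  /* now d*inv == 1 mod 2^32, which the ASSERT above checks */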
+
+	pxor	%mm0, %mm0		C carry bit
+
+
+C The dependent chain here is as follows.
+C
+C					latency
+C	psubq	 s = (src-cbit) - climb	   2
+C	pmuludq	 q = s*inverse		   8
+C	pmuludq	 prod = q*divisor	   8
+C	psrlq	 climb = high(prod)	   2
+C					  --
+C					  20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
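+C In plain C the loop below is the usual exact-remainder recurrence (a
+C rough model, assuming 32-bit limbs, with d the divisor and inv the
+C inverse computed above; the borrow comes from bit 63 of the 64-bit
+C psubq result):
+C
+C	  uint32_t climb = carry, cbit = 0;	/* carry = 0 for plain entry */
+C	  for (long i = 0; i < size; i++)
+C	    {
+C	      uint64_t s = (uint64_t) src[i] - cbit - climb;
+C	      cbit = (uint32_t) (s >> 63);		/* borrow out */
+C	      uint32_t q = (uint32_t) s * inv;		/* s * inverse mod 2^32 */
+C	      climb = (uint32_t) (((uint64_t) q * d) >> 32);
+C	    }
+C	  return climb + cbit;
+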
+L(top):
+	C eax	src, incrementing
+	C ebx
+	C ecx	counter, limbs
+	C edx
+	C
+	C mm0	carry bit
+	C mm1	carry limb
+	C mm6	inverse
+	C mm7	divisor
+
+	movd	(%eax), %mm2
+	addl	$4, %eax
+
+	psubq	%mm0, %mm2		C src - cbit
+
+	psubq	%mm1, %mm2		C src - cbit - climb
+	movq	%mm2, %mm0
+	psrlq	$63, %mm0		C new cbit
+
+	pmuludq	%mm6, %mm2		C s*inverse
+
+	movq	%mm7, %mm1
+	pmuludq	%mm2, %mm1		C q*divisor
+	psrlq	$32, %mm1		C new climb
+
+	subl	$1, %ecx
+	jnz	L(top)
+
+
+L(done):
+	paddq	%mm1, %mm0
+	movd	%mm0, %eax
+	emms
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mul_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mul_1.asm
new file mode 100644
index 0000000..6347b8b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mul_1.asm
@@ -0,0 +1,164 @@
+dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C                           cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		4.17
+C P6 model 13  (Dothan)		4.17
+C P4 model 0-1 (Willamette)	4
+C P4 model 2   (Northwood)	4
+C P4 model 3-4 (Prescott)	4.55
+
+C TODO:
+C  * Tweak eax/edx offsets in loop so as to save some lea's
+C  * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C n		sp + 12
+C v0		sp + 16
+
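+C Functionally (a rough C model, assuming 32-bit limbs):
+C
+C	uint32_t ref_mul_1 (uint32_t *rp, const uint32_t *up,
+C	                    long n, uint32_t v0)
+C	{
+C	  uint64_t c = 0;		/* mpn_mul_1c starts c at the carry-in */
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      c += (uint64_t) up[i] * v0;
+C	      rp[i] = (uint32_t) c;
+C	      c >>= 32;
+C	    }
+C	  return (uint32_t) c;		/* high limb out */
+C	}
+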
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+	pxor	%mm6, %mm6
+L(ent):	mov	4(%esp), %edx
+	mov	8(%esp), %eax
+	mov	12(%esp), %ecx
+	movd	16(%esp), %mm7
+	cmp	$4, %ecx
+	jnc	L(big)
+
+L(lp0):	movd	(%eax), %mm0
+	lea	4(%eax), %eax
+	lea	4(%edx), %edx
+	pmuludq	%mm7, %mm0
+	paddq	%mm0, %mm6
+	movd	%mm6, -4(%edx)
+	psrlq	$32, %mm6
+	dec	%ecx
+	jnz	L(lp0)
+	movd	%mm6, %eax
+	emms
+	ret
+
+L(big):	and	$3, %ecx
+	je	L(0)
+	cmp	$2, %ecx
+	jc	L(1)
+	je	L(2)
+	jmp	L(3)			C FIXME: one case should fall through
+
+L(0):	movd	(%eax), %mm3
+	sub	12(%esp), %ecx		C loop count
+	lea	-16(%eax), %eax
+	lea	-12(%edx), %edx
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	pmuludq	%mm7, %mm0
+	movd	24(%eax), %mm1
+	jmp	L(00)
+
+L(1):	movd	(%eax), %mm2
+	sub	12(%esp), %ecx
+	lea	-12(%eax), %eax
+	lea	-8(%edx), %edx
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	pmuludq	%mm7, %mm3
+	movd	20(%eax), %mm0
+	jmp	L(01)
+
+L(2):	movd	(%eax), %mm1
+	sub	12(%esp), %ecx
+	lea	-8(%eax), %eax
+	lea	-4(%edx), %edx
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+	pmuludq	%mm7, %mm2
+	movd	16(%eax), %mm3
+	jmp	L(10)
+
+L(3):	movd	(%eax), %mm0
+	sub	12(%esp), %ecx
+	lea	-4(%eax), %eax
+	pmuludq	%mm7, %mm0
+	movd	8(%eax), %mm1
+	pmuludq	%mm7, %mm1
+	movd	12(%eax), %mm2
+
+	ALIGN(16)
+L(top):	pmuludq	%mm7, %mm2
+	paddq	%mm0, %mm6
+	movd	16(%eax), %mm3
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+L(10):	pmuludq	%mm7, %mm3
+	paddq	%mm1, %mm6
+	movd	20(%eax), %mm0
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+L(01):	pmuludq	%mm7, %mm0
+	paddq	%mm2, %mm6
+	movd	24(%eax), %mm1
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+L(00):	pmuludq	%mm7, %mm1
+	paddq	%mm3, %mm6
+	movd	28(%eax), %mm2
+	movd	%mm6, 12(%edx)
+	psrlq	$32, %mm6
+	lea	16(%eax), %eax
+	lea	16(%edx), %edx
+	add	$4, %ecx
+	ja	L(top)
+
+L(end):	pmuludq	%mm7, %mm2
+	paddq	%mm0, %mm6
+	movd	%mm6, 0(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm1, %mm6
+	movd	%mm6, 4(%edx)
+	psrlq	$32, %mm6
+	paddq	%mm2, %mm6
+	movd	%mm6, 8(%edx)
+	psrlq	$32, %mm6
+	movd	%mm6, %eax
+	emms
+	ret
+EPILOGUE()
+PROLOGUE(mpn_mul_1c)
+	movd	20(%esp), %mm6
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/third_party/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
new file mode 100644
index 0000000..6e3775a
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -0,0 +1,662 @@
+dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
+C    scheduling could improve things by several cycles per outer iteration.
+C  * In code for un <= 3, try keeping accumulation operands in registers,
+C    without storing intermediates to rp.
+C  * We might want to keep 32 in a free mm register, since the register form is
+C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
+C  * Look into different loop alignment; we now expand the code by about 50
+C    bytes with possibly needless alignment.
+C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
+C  * Use OSP; it should solve feed-in latency problems.
+C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
+C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
+C    so that they can share feed-in code, and changing the branch targets from
+C    L<n> to Lm<nn>.
+
+C                           cycles/limb
+C P6 model 9   (Banias)         ?
+C P6 model 13  (Dothan)         5.24
+C P6 model 14  (Yonah)          ?
+C P4 model 0-1 (Willamette):    5
+C P4 model 2   (Northwood):     4.60 at 32 limbs
+C P4 model 3-4 (Prescott):      4.94 at 32 limbs
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C un		sp + 12
+C vp		sp + 16
+C vn		sp + 20
+
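+C Functionally this is schoolbook multiplication: a mul_1 pass for vp[0],
+C then an addmul_1-style pass per remaining vp limb (a rough C model,
+C assuming 32-bit limbs and, as the basecase requires, un >= vn >= 1;
+C ref_mul_1 is the sketch from mul_1.asm):
+C
+C	void ref_mul_basecase (uint32_t *rp, const uint32_t *up, long un,
+C	                       const uint32_t *vp, long vn)
+C	{
+C	  rp[un] = ref_mul_1 (rp, up, un, vp[0]);
+C	  for (long j = 1; j < vn; j++)		/* accumulate rows */
+C	    {
+C	      uint64_t c = 0;
+C	      for (long i = 0; i < un; i++)
+C	        {
+C	          c += (uint64_t) up[i] * vp[j] + rp[i + j];
+C	          rp[i + j] = (uint32_t) c;
+C	          c >>= 32;
+C	        }
+C	      rp[un + j] = (uint32_t) c;
+C	    }
+C	}
+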
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	push	%esi
+	push	%ebx
+	mov	12(%esp), %edx		C rp
+	mov	16(%esp), %eax		C up
+	mov	20(%esp), %ecx		C un
+	mov	24(%esp), %esi		C vp
+	mov	28(%esp), %ebx		C vn
+	movd	(%esi), %mm7		C
+L(ent):	cmp	$3, %ecx
+	ja	L(big)
+	movd	(%eax), %mm6
+	pmuludq	%mm7, %mm6
+	jz	L(un3)
+	cmp	$2, %ecx
+	jz	L(un2)
+
+L(un1):	movd	%mm6, (%edx)		C				un=1
+	psrlq	$32, %mm6		C				un=1
+	movd	%mm6, 4(%edx)		C				un=1
+	jmp	L(rtr)			C				un=1
+
+L(un2):	movd	4(%eax), %mm1		C				un=2
+	pmuludq	%mm7, %mm1		C				un=2
+	movd	%mm6, (%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	paddq	%mm1, %mm6		C				un=2
+	movd	%mm6, 4(%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	movd	%mm6, 8(%edx)		C				un=2
+      dec	%ebx			C				un=2
+      jz	L(rtr)			C				un=2
+	movd	4(%esi), %mm7		C				un=2
+	movd	(%eax), %mm6		C				un=2
+	pmuludq	%mm7, %mm6		C				un=2
+	movd	4(%eax), %mm1		C				un=2
+	movd	4(%edx), %mm4		C				un=2
+	pmuludq	%mm7, %mm1		C				un=2
+	movd	8(%edx), %mm5		C				un=2
+	paddq	%mm4, %mm6		C				un=2
+	paddq	%mm1, %mm5		C				un=2
+	movd	%mm6, 4(%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	paddq	%mm5, %mm6		C				un=2
+	movd	%mm6, 8(%edx)		C				un=2
+	psrlq	$32, %mm6		C				un=2
+	movd	%mm6, 12(%edx)		C				un=2
+L(rtr):	emms
+	pop	%ebx
+	pop	%esi
+	ret
+
+L(un3):	movd	4(%eax), %mm1		C				un=3
+	pmuludq	%mm7, %mm1		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	movd	%mm6, (%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm1, %mm6		C				un=3
+	movd	%mm6, 4(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm2, %mm6		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+      dec	%ebx			C				un=3
+      jz	L(rtr)			C				un=3
+	movd	4(%esi), %mm7		C				un=3
+	movd	(%eax), %mm6		C				un=3
+	pmuludq	%mm7, %mm6		C				un=3
+	movd	4(%eax), %mm1		C				un=3
+	movd	4(%edx), %mm4		C				un=3
+	pmuludq	%mm7, %mm1		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	movd	8(%edx), %mm5		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	paddq	%mm1, %mm5		C				un=3
+	movd	12(%edx), %mm4		C				un=3
+	movd	%mm6, 4(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm5, %mm6		C				un=3
+	paddq	%mm2, %mm4		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 16(%edx)		C				un=3
+      dec	%ebx			C				un=3
+      jz	L(rtr)			C				un=3
+	movd	8(%esi), %mm7		C				un=3
+	movd	(%eax), %mm6		C				un=3
+	pmuludq	%mm7, %mm6		C				un=3
+	movd	4(%eax), %mm1		C				un=3
+	movd	8(%edx), %mm4		C				un=3
+	pmuludq	%mm7, %mm1		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	movd	12(%edx), %mm5		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	paddq	%mm1, %mm5		C				un=3
+	movd	16(%edx), %mm4		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm5, %mm6		C				un=3
+	paddq	%mm2, %mm4		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm4, %mm6		C				un=3
+	movd	%mm6, 16(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 20(%edx)		C				un=3
+	jmp	L(rtr)
+
+
+L(big):	push	%edi
+	pxor	%mm6, %mm6
+	lea	4(%esi), %esi
+	and	$3, %ecx
+	jz	L(0)
+	cmp	$2, %ecx
+	jc	L(1)
+	jz	L(2)
+	jmp	L(3)			C FIXME: one case should fall through
+
+
+L(0):	movd	(%eax), %mm3		C				m 0
+	sub	24(%esp), %ecx		C inner loop count		m 0
+	mov	%ecx, 24(%esp)		C update loop count for later	m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	jmp	L(m00)			C				m 0
+	ALIGN(16)			C				m 0
+L(lpm0):
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	(%eax), %mm3		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	movd	%mm6, -8(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	paddq	%mm4, %mm6		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	movd	%mm6, -4(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+L(m00):	pmuludq	%mm7, %mm1		C				m 0
+	paddq	%mm3, %mm6		C				m 0
+	movd	12(%eax), %mm4		C				m 0
+	movd	%mm6, (%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	lea	16(%eax), %eax		C				m 0
+	lea	16(%edx), %edx		C				m 0
+	add	$4, %ecx		C				m 0
+	ja	L(lpm0)			C				m 0
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	mov	16(%esp), %edi		C rp				  0
+	jmp	L(x0)
+
+L(olp0):
+	lea	4(%edi), %edi		C				am 0
+	movd	(%esi), %mm7		C				am 0
+	lea	4(%esi), %esi		C				am 0
+	mov	%edi, %edx		C rp				am 0
+	mov	20(%esp), %eax		C up				am 0
+	movd	(%eax), %mm3		C				am 0
+	mov	24(%esp), %ecx		C inner loop count		am 0
+	pxor	%mm6, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	jmp	L(am00)			C				am 0
+	ALIGN(16)			C				am 0
+L(lam0):
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	(%eax), %mm3		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+L(am00):
+	pmuludq	%mm7, %mm1		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	12(%eax), %mm2		C				am 0
+	paddq	%mm0, %mm4		C				am 0
+	movd	8(%edx), %mm5		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	lea	16(%eax), %eax		C				am 0
+	lea	16(%edx), %edx		C				am 0
+	add	$4, %ecx		C				am 0
+	jnz	L(lam0)			C				am 0
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+L(x0):	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	dec	%ebx			C				am 0
+	jnz	L(olp0)			C				am 0
+L(oel0):
+	emms				C				   0
+	pop	%edi			C				   0
+	pop	%ebx			C				   0
+	pop	%esi			C				   0
+	ret				C				   0
+
+
+L(1):	movd	(%eax), %mm4		C				m 1
+	sub	24(%esp), %ecx		C				m 1
+	mov	%ecx, 24(%esp)		C update loop count for later	m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	jmp	L(m01)			C				m 1
+	ALIGN(16)			C				m 1
+L(lpm1):
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	movd	%mm6, -4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+L(m01):	pmuludq	%mm7, %mm0		C				m 1
+	paddq	%mm4, %mm6		C				m 1
+	movd	12(%eax), %mm1		C				m 1
+	movd	%mm6, (%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm1		C				m 1
+	paddq	%mm3, %mm6		C				m 1
+	movd	16(%eax), %mm4		C				m 1
+	movd	%mm6, 4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	lea	16(%eax), %eax		C				m 1
+	lea	16(%edx), %edx		C				m 1
+	add	$4, %ecx		C				m 1
+	ja	L(lpm1)			C				m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	mov	16(%esp), %edi		C rp				  1
+	jmp	L(x1)
+
+L(olp1):
+	lea	4(%edi), %edi		C				am 1
+	movd	(%esi), %mm7		C				am 1
+	lea	4(%esi), %esi		C				am 1
+	mov	%edi, %edx		C rp				am 1
+	mov	20(%esp), %eax		C up				am 1
+	movd	(%eax), %mm2		C				am 1
+	mov	24(%esp), %ecx		C inner loop count		am 1
+	pxor	%mm6, %mm6		C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	jmp	L(am01)			C				am 1
+	ALIGN(16)			C				am 1
+L(lam1):
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+L(am01):
+	pmuludq	%mm7, %mm0		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	12(%eax), %mm1		C				am 1
+	paddq	%mm3, %mm5		C				am 1
+	movd	8(%edx), %mm4		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm1		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	16(%eax), %mm2		C				am 1
+	paddq	%mm0, %mm4		C				am 1
+	movd	12(%edx), %mm5		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	lea	16(%eax), %eax		C				am 1
+	lea	16(%edx), %edx		C				am 1
+	add	$4, %ecx		C				am 1
+	jnz	L(lam1)			C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+L(x1):	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+	dec	%ebx			C				am 1
+	jnz	L(olp1)			C				am 1
+L(oel1):
+	emms				C				   1
+	pop	%edi			C				   1
+	pop	%ebx			C				   1
+	pop	%esi			C				   1
+	ret				C				   1
+
+
+L(2):	movd	(%eax), %mm1		C				m 2
+	sub	24(%esp), %ecx		C				m 2
+	mov	%ecx, 24(%esp)		C update loop count for later	m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	movd	4(%eax), %mm4		C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	jmp	L(m10)			C				m 2
+	ALIGN(16)			C				m 2
+L(lpm2):
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+L(m10):	pmuludq	%mm7, %mm3		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	movd	12(%eax), %mm0		C				m 2
+	movd	%mm6, (%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm0		C				m 2
+	paddq	%mm4, %mm6		C				m 2
+	movd	16(%eax), %mm1		C				m 2
+	movd	%mm6, 4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	paddq	%mm3, %mm6		C				m 2
+	movd	20(%eax), %mm4		C				m 2
+	movd	%mm6, 8(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	lea	16(%eax), %eax		C				m 2
+	lea	16(%edx), %edx		C				m 2
+	add	$4, %ecx		C				m 2
+	ja	L(lpm2)			C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	mov	16(%esp), %edi		C rp				  2
+	jmp	L(x2)
+
+L(olp2):
+	lea	4(%edi), %edi		C				am 2
+	movd	(%esi), %mm7		C				am 2
+	lea	4(%esi), %esi		C				am 2
+	mov	%edi, %edx		C rp				am 2
+	mov	20(%esp), %eax		C up				am 2
+	movd	(%eax), %mm1		C				am 2
+	mov	24(%esp), %ecx		C inner loop count		am 2
+	pxor	%mm6, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	movd	4(%eax), %mm2		C				am 2
+	movd	(%edx), %mm5		C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	jmp	L(am10)			C				am 2
+	ALIGN(16)			C				am 2
+L(lam2):
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+L(am10):
+	pmuludq	%mm7, %mm3		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	12(%eax), %mm0		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+	movd	8(%edx), %mm5		C				am 2
+	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm0		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	16(%eax), %mm1		C				am 2
+	paddq	%mm3, %mm5		C				am 2
+	movd	12(%edx), %mm4		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	20(%eax), %mm2		C				am 2
+	paddq	%mm0, %mm4		C				am 2
+	movd	16(%edx), %mm5		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	lea	16(%eax), %eax		C				am 2
+	lea	16(%edx), %edx		C				am 2
+	add	$4, %ecx		C				am 2
+	jnz	L(lam2)			C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+L(x2):	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+	dec	%ebx			C				am 2
+	jnz	L(olp2)			C				am 2
+L(oel2):
+	emms				C				   2
+	pop	%edi			C				   2
+	pop	%ebx			C				   2
+	pop	%esi			C				   2
+	ret				C				   2
+
+
+L(3):	movd	(%eax), %mm0		C				m 3
+	sub	24(%esp), %ecx		C				m 3
+	mov	%ecx, 24(%esp)		C update loop count for later	m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	movd	4(%eax), %mm1		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	movd	8(%eax), %mm4		C				m 3
+	jmp	L(lpm3)			C				m 3
+	ALIGN(16)			C				m 3
+L(lpm3):
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	12(%eax), %mm3		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm3		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	movd	16(%eax), %mm0		C				m 3
+	movd	%mm6, 4(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	paddq	%mm4, %mm6		C				m 3
+	movd	20(%eax), %mm1		C				m 3
+	movd	%mm6, 8(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	paddq	%mm3, %mm6		C				m 3
+	movd	24(%eax), %mm4		C				m 3
+	movd	%mm6, 12(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	lea	16(%eax), %eax		C				m 3
+	lea	16(%edx), %edx		C				m 3
+	add	$4, %ecx		C				m 3
+	ja	L(lpm3)			C				m 3
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	mov	16(%esp), %edi		C rp				  3
+	jmp	L(x3)
+
+L(olp3):
+	lea	4(%edi), %edi		C				am 3
+	movd	(%esi), %mm7		C				am 3
+	lea	4(%esi), %esi		C				am 3
+	mov	%edi, %edx		C rp				am 3
+	mov	20(%esp), %eax		C up				am 3
+	movd	(%eax), %mm0		C				am 3
+	mov	24(%esp), %ecx		C inner loop count		am 3
+	pxor	%mm6, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	movd	4(%eax), %mm1		C				am 3
+	movd	(%edx), %mm4		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	movd	8(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	4(%edx), %mm5		C				am 3
+	jmp	L(lam3)			C				am 3
+	ALIGN(16)			C				am 3
+L(lam3):
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	12(%eax), %mm3		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm3		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	16(%eax), %mm0		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+	movd	12(%edx), %mm5		C				am 3
+	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	20(%eax), %mm1		C				am 3
+	paddq	%mm3, %mm5		C				am 3
+	movd	16(%edx), %mm4		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	24(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	20(%edx), %mm5		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	lea	16(%eax), %eax		C				am 3
+	lea	16(%edx), %edx		C				am 3
+	add	$4, %ecx		C				am 3
+	jnz	L(lam3)			C				am 3
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+L(x3):	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+	dec	%ebx			C				am 3
+	jnz	L(olp3)			C				am 3
+L(oel3):
+	emms				C				   3
+	pop	%edi			C				   3
+	pop	%ebx			C				   3
+	pop	%esi			C				   3
+	ret				C				   3
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/popcount.asm b/third_party/gmp/mpn/x86/pentium4/sse2/popcount.asm
new file mode 100644
index 0000000..c7f4426
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,281 @@
+dnl  X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit		     popcount	     hamdist
+C			    cycles/limb	    cycles/limb
+C P5				-
+C P6 model 0-8,10-12		-
+C P6 model 9  (Banias)		?
+C P6 model 13 (Dothan)		4
+C P4 model 0  (Willamette)	?
+C P4 model 1  (?)		?
+C P4 model 2  (Northwood)	3.9
+C P4 model 3  (Prescott)	?
+C P4 model 4  (Nocona)		?
+C AMD K6			-
+C AMD K7			-
+C AMD K8			?
+
+C 64-bit		     popcount	     hamdist
+C			    cycles/limb	    cycles/limb
+C P4 model 4 (Nocona):		8
+C AMD K8,K9			7.5
+C AMD K10			3.5
+C Intel core2			3.68
+C Intel corei			3.15
+C Intel atom		       10.8
+C VIA nano			6.5
+
+C TODO
+C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
+C    using movdqu for one operand and movdqa for the other, or by painfully
+C    shifting as we go.  Unfortunately, there seems to be no usable shift
+C    instruction, except for one that takes an immediate count.
+C  * It would probably be possible to cut a few cycles/limb using software
+C    pipelining.
+C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
+C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
+C    integer-based popcount.  Such a combined loop would handle 6 limbs in
+C    about 30 cycles on K8.
+C  * We could save a byte or two by using 32-bit operations on areg.
+C  * Check if using movdqa to a temp and then register-based pand is faster.
+
+ifelse(GMP_LIMB_BITS,`32',
+`	define(`up',  `%edx')
+	define(`n',   `%ecx')
+	define(`areg',`%eax')
+	define(`breg',`%ebx')
+	define(`zero',`%xmm4')
+	define(`LIMB32',`	$1')
+	define(`LIMB64',`dnl')
+',`
+	define(`up',  `%rdi')
+	define(`n',   `%rsi')
+	define(`areg',`%rax')
+	define(`breg',`%rdx')
+	define(`zero',`%xmm8')
+	define(`LIMB32',`dnl')
+	define(`LIMB64',`	$1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
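+C These masks drive the classic bit-slicing reduction: 1-bit counts are
+C folded into 2-bit fields, then 4-bit, then 8-bit fields, and psadbw
+C finally sums the bytes.  Per 32-bit word this is equivalent to (a rough
+C scalar model, with a multiply standing in for psadbw):
+C
+C	uint32_t ref_popcount_word (uint32_t x)
+C	{
+C	  x = x - ((x >> 1) & 0x55555555);		    /* 2-bit sums */
+C	  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);   /* 4-bit sums */
+C	  x = (x + (x >> 4)) & 0x0f0f0f0f;		    /* byte sums  */
+C	  return (x * 0x01010101) >> 24;		    /* total      */
+C	}
+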
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw')			C override inherited m4 version
+
+C This file is shared between 32-bit and 64-bit builds.  Only the former has
+C LEAL.  Default LEAL as an alias of LEA.
+ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+	define(`cnsts', MPN(popccnsts))
+	GLOBL	cnsts')
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`mov	4(%esp), up	')
+LIMB32(`mov	8(%esp), n	')
+LIMB32(`push	%ebx		')
+
+	pxor	%xmm3, %xmm3		C zero grand total count
+LIMB64(`pxor	zero, zero	')
+
+	LEAL(	cnsts, breg)
+
+	movdqa	-48(breg), mm01010101
+	movdqa	-32(breg), mm00110011
+	movdqa	-16(breg), mm00001111
+
+	mov	up, areg
+	and	$-16, up		C round `up' down to 128-bit boundary
+	and	$12, areg		C 32:areg = 0, 4, 8, 12
+					C 64:areg = 0, 8
+	movdqa	(up), %xmm0
+	pand	64(breg,areg,4), %xmm0
+	shr	$m4_log2(GMP_LIMB_BYTES), %eax
+	add	areg, n			C compensate n for rounded down `up'
+
+	pxor	%xmm4, %xmm4
+	sub	$LIMBS_PER_XMM, n
+	jbe	L(sum)
+
+	sub	$LIMBS_PER_XMM, n
+	ja	L(ent)
+	jmp	L(lsum)
+
+	ALIGN(16)
+L(top):	movdqa	(up), %xmm0
+L(ent):	movdqa	16(up), %xmm4
+
+	movdqa	%xmm0, %xmm1
+	movdqa	%xmm4, %xmm5
+	psrld	$1, %xmm0
+	psrld	$1, %xmm4
+	pand	mm01010101, %xmm0
+	pand	mm01010101, %xmm4
+	psubd	%xmm0, %xmm1
+	psubd	%xmm4, %xmm5
+
+	movdqa	%xmm1, %xmm0
+	movdqa	%xmm5, %xmm4
+	psrlq	$2, %xmm1
+	psrlq	$2, %xmm5
+	pand	mm00110011, %xmm0
+	pand	mm00110011, %xmm4
+	pand	mm00110011, %xmm1
+	pand	mm00110011, %xmm5
+	paddq	%xmm0, %xmm1
+	paddq	%xmm4, %xmm5
+
+LIMB32(`pxor	zero, zero	')
+
+	add	$32, up
+	sub	$LIMBS_PER_2XMM, n
+
+	paddq	%xmm5, %xmm1
+	movdqa	%xmm1, %xmm0
+	psrlq	$4, %xmm1
+	pand	mm00001111, %xmm0
+	pand	mm00001111, %xmm1
+	paddq	%xmm0, %xmm1
+
+	psadbw	zero, %xmm1
+	paddq	%xmm1, %xmm3		C add to grand total
+
+	jnc	L(top)
+L(end):
+	add	$LIMBS_PER_2XMM, n
+	jz	L(rt)
+	movdqa	(up), %xmm0
+	pxor	%xmm4, %xmm4
+	sub	$LIMBS_PER_XMM, n
+	jbe	L(sum)
+L(lsum):
+	movdqa	%xmm0, %xmm4
+	movdqa	16(up), %xmm0
+L(sum):
+	shl	$m4_log2(GMP_LIMB_BYTES), n
+	and	$12, n
+	pand	(breg,n,4), %xmm0
+
+	movdqa	%xmm0, %xmm1
+	movdqa	%xmm4, %xmm5
+	psrld	$1, %xmm0
+	psrld	$1, %xmm4
+	pand	mm01010101, %xmm0
+	pand	mm01010101, %xmm4
+	psubd	%xmm0, %xmm1
+	psubd	%xmm4, %xmm5
+
+	movdqa	%xmm1, %xmm0
+	movdqa	%xmm5, %xmm4
+	psrlq	$2, %xmm1
+	psrlq	$2, %xmm5
+	pand	mm00110011, %xmm0
+	pand	mm00110011, %xmm4
+	pand	mm00110011, %xmm1
+	pand	mm00110011, %xmm5
+	paddq	%xmm0, %xmm1
+	paddq	%xmm4, %xmm5
+
+LIMB32(`pxor	zero, zero	')
+
+	paddq	%xmm5, %xmm1
+	movdqa	%xmm1, %xmm0
+	psrlq	$4, %xmm1
+	pand	mm00001111, %xmm0
+	pand	mm00001111, %xmm1
+	paddq	%xmm0, %xmm1
+
+	psadbw	zero, %xmm1
+	paddq	%xmm1, %xmm3		C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt):	movdqa	%xmm3, %xmm0
+	psrldq	$8, %xmm3
+	paddq	%xmm3, %xmm0
+	movd	%xmm0, areg		C movq avoided due to gas bug
+
+LIMB32(`pop	%ebx		')
+	ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)
+ASM_END()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
new file mode 100644
index 0000000..f421d13
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/rsh1add_n.asm
@@ -0,0 +1,126 @@
+dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
+
+dnl  Copyright 2001-2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C        cycles/limb (approx)
+C      dst!=src1,2  dst==src1  dst==src2
+C P4:      4.5         6.5        6.5
+
+
+C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+C                          mp_size_t size);
+C
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best.  Not sure why, but for instance leal
+C incrementing on %esi is a 1 or 2 cycle slowdown.
+C
+C The dependent chain is paddq combining the carry and next (shifted) part,
+C plus psrlq to move the new carry down.  That, and just 4 mmx instructions
+C in total, makes 4 c/l the target speed.  That target is almost achieved
+C for separate src/dst, but when src==dst, write-combining anomalies slow
+C it down.
+
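+C In C terms (a rough model, assuming 32-bit limbs):
+C
+C	uint32_t ref_rsh1add_n (uint32_t *wp, const uint32_t *xp,
+C	                        const uint32_t *yp, long n)
+C	{
+C	  uint64_t acc = (uint64_t) xp[0] + yp[0];
+C	  uint32_t ret = (uint32_t) acc & 1;	/* the bit shifted out */
+C	  acc >>= 1;
+C	  for (long i = 1; i < n; i++)
+C	    {
+C	      acc += ((uint64_t) xp[i] + yp[i]) << 31;
+C	      wp[i - 1] = (uint32_t) acc;
+C	      acc >>= 32;
+C	    }
+C	  wp[n - 1] = (uint32_t) acc;
+C	  return ret;
+C	}
+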
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_YP,   12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_XP')
+define(SAVE_ESI,`PARAM_YP')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_rsh1add_n)
+deflit(`FRAME',0)
+
+	movl	PARAM_XP, %edx
+	movl	%ebx, SAVE_EBX
+
+	movl	PARAM_YP, %ebx
+	movl	%esi, SAVE_ESI
+
+	movl	PARAM_WP, %esi
+
+	movd	(%edx), %mm0		C xp[0]
+
+	movd	(%ebx), %mm1		C yp[0]
+	movl	PARAM_SIZE, %ecx
+
+	movl	(%edx), %eax		C xp[0]
+
+	addl	(%ebx), %eax		C xp[0]+yp[0]
+
+	paddq	%mm1, %mm0		C xp[0]+yp[0]
+	leal	(%esi,%ecx,4), %esi	C wp end
+	negl	%ecx			C -size
+
+	psrlq	$1, %mm0		C (xp[0]+yp[0])/2
+	and	$1, %eax		C return value, rsh1 bit of xp[0]+yp[0]
+	addl	$1, %ecx		C -(size-1)
+	jz	L(done)
+
+
+L(top):
+	C eax	return value
+	C ebx	yp end
+	C ecx	counter, limbs, -(size-1) to -1 inclusive
+	C edx	xp end
+	C esi	wp end
+	C mm0	carry (32 bits)
+
+	movd	4(%edx), %mm1	C xp[i+1]
+	movd	4(%ebx), %mm2	C yp[i+1]
+	leal	4(%edx), %edx
+	leal	4(%ebx), %ebx
+	paddq	%mm2, %mm1		C xp[i+1]+yp[i+1]
+	psllq	$31, %mm1		C low bit at 31, further 32 above
+
+	paddq	%mm1, %mm0		C 31 and carry from prev add
+	movd	%mm0, -4(%esi,%ecx,4)	C low ready to store dst[i]
+
+	psrlq	$32, %mm0		C high becomes new carry
+
+	addl	$1, %ecx
+	jnz	L(top)
+
+
+L(done):
+	movd	%mm0, -4(%esi)		C dst[size-1]
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_ESI, %esi
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm b/third_party/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..2dd57d2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/sqr_basecase.asm
@@ -0,0 +1,705 @@
+dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
+C    scheduling could improve things by several cycles per outer iteration.
+C  * In the Lam3...Lam1 code, keep accumulation operands in registers,
+C    without storing intermediates to rp.
+C  * We might want to keep 32 in a free mm register, since the register form is
+C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
+C  * Look into different loop alignment; we now expand the code by about 50
+C    bytes with possibly needless alignment.
+C  * Use OSP; it should solve feed-in latency problems.
+C  * Address relative slowness for un<=3 on Pentium M.  The old code is
+C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
+
+C INPUT PARAMETERS
+C rp		sp + 4
+C up		sp + 8
+C un		sp + 12
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	mov	4(%esp), %edx		C rp
+	mov	8(%esp), %eax		C up
+	mov	12(%esp), %ecx		C un
+
+	cmp	$2, %ecx
+	jc	L(un1)
+	jz	L(un2)
+	cmp	$4, %ecx
+	jc	L(un3)
+	jz	L(un4)
+	jmp	L(big)
+
+L(un1):	mov	(%eax), %eax
+	mov	%edx, %ecx
+	mul	%eax
+	mov	%eax, (%ecx)
+	mov	%edx, 4(%ecx)
+	ret
+L(un2):	movd	(%eax), %mm0		C				un=2
+	movd	(%eax), %mm2		C				un=2
+	movd	4(%eax), %mm1		C				un=2
+	pmuludq	%mm0, %mm0		C 64b weight 0			un=2
+	pmuludq	%mm1, %mm2		C 64b weight 32			un=2
+	pmuludq	%mm1, %mm1		C 64b weight 64			un=2
+	movd	%mm0, (%edx)		C				un=2
+	psrlq	$32, %mm0		C 32b weight 32			un=2
+	pcmpeqd	%mm7, %mm7		C				un=2
+	psrlq	$33, %mm7		C 0x000000007FFFFFFF		un=2
+	pand	%mm2, %mm7		C 31b weight 32			un=2
+	psrlq	$31, %mm2		C 33b weight 65			un=2
+	psllq	$1, %mm7		C 31b weight 33			un=2
+	paddq	%mm7, %mm0		C				un=2
+	movd	%mm0, 4(%edx)		C				un=2
+	psrlq	$32, %mm0		C				un=2
+	paddq	%mm2, %mm1		C				un=2
+	paddq	%mm0, %mm1		C				un=2
+	movd	%mm1, 8(%edx)		C				un=2
+	psrlq	$32, %mm1		C				un=2
+	movd	%mm1, 12(%edx)		C				un=2
+	emms
+	ret
+L(un3):	movd	(%eax), %mm7		C				un=3
+	movd	4(%eax), %mm6		C				un=3
+	pmuludq	%mm7, %mm6		C				un=3
+	movd	8(%eax), %mm2		C				un=3
+	pmuludq	%mm7, %mm2		C				un=3
+	movd	%mm6, 4(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	paddq	%mm2, %mm6		C				un=3
+	movd	%mm6, 8(%edx)		C				un=3
+	psrlq	$32, %mm6		C				un=3
+	movd	%mm6, 12(%edx)		C				un=3
+	lea	4(%edx), %edx		C				un=3
+	lea	4(%eax), %eax		C				un=3
+	jmp	L(am1)
+L(un4):	movd	(%eax), %mm7		C				un=4
+	movd	4(%eax), %mm6		C				un=4
+	pmuludq	%mm7, %mm6		C				un=4
+	movd	8(%eax), %mm0		C				un=4
+	pmuludq	%mm7, %mm0		C				un=4
+	movd	12(%eax), %mm1		C				un=4
+	pmuludq	%mm7, %mm1		C				un=4
+	movd	%mm6, 4(%edx)		C				un=4
+	psrlq	$32, %mm6		C				un=4
+	paddq	%mm0, %mm6		C				un=4
+	movd	%mm6, 8(%edx)		C				un=4
+	psrlq	$32, %mm6		C				un=4
+	paddq	%mm1, %mm6		C				un=4
+	movd	%mm6, 12(%edx)		C				un=4
+	psrlq	$32, %mm6		C				un=4
+	movd	%mm6, 16(%edx)		C				un=4
+	lea	4(%edx), %edx		C				un=4
+	lea	4(%eax), %eax		C				un=4
+	jmp	L(am2)
+
+L(big):	push	%esi
+	push	%ebx
+	push	%edi
+	pxor	%mm6, %mm6
+	movd	(%eax), %mm7		C
+	lea	4(%eax), %esi		C init up, up++
+	lea	4(%eax), %eax		C up2++  FIXME: should fix offsets
+	lea	4(%edx), %edi		C init rp, rp++
+	lea	4(%edx), %edx		C rp2++
+	lea	-4(%ecx), %ebx		C loop count
+	and	$3, %ecx
+	jz	L(3m)
+	cmp	$2, %ecx
+	ja	L(2m)
+	jb	L(0m)
+
+L(1m):
+	movd	(%eax), %mm4		C				m 1
+	lea	(%ebx), %ecx		C inner loop count		m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	jmp	L(m01)			C				m 1
+	ALIGN(16)			C				m 1
+L(lpm1):
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	4(%eax), %mm3		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm3		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	movd	8(%eax), %mm0		C				m 1
+	movd	%mm6, -4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+L(m01):	pmuludq	%mm7, %mm0		C				m 1
+	paddq	%mm4, %mm6		C				m 1
+	movd	12(%eax), %mm1		C				m 1
+	movd	%mm6, (%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	pmuludq	%mm7, %mm1		C				m 1
+	paddq	%mm3, %mm6		C				m 1
+	movd	16(%eax), %mm4		C				m 1
+	movd	%mm6, 4(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	lea	16(%eax), %eax		C				m 1
+	lea	16(%edx), %edx		C				m 1
+	sub	$4, %ecx		C				m 1
+	ja	L(lpm1)			C				m 1
+	pmuludq	%mm7, %mm4		C				m 1
+	paddq	%mm0, %mm6		C				m 1
+	movd	%mm6, -8(%edx)		C				m 1
+	psrlq	$32, %mm6		C				m 1
+	paddq	%mm1, %mm6		C				m 1
+	jmp	L(0)
+
+L(2m):
+	movd	(%eax), %mm1		C				m 2
+	lea	(%ebx), %ecx		C inner loop count		m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	movd	4(%eax), %mm4		C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	jmp	L(m10)			C				m 2
+	ALIGN(16)			C				m 2
+L(lpm2):
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	8(%eax), %mm3		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+L(m10):	pmuludq	%mm7, %mm3		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	movd	12(%eax), %mm0		C				m 2
+	movd	%mm6, (%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm0		C				m 2
+	paddq	%mm4, %mm6		C				m 2
+	movd	16(%eax), %mm1		C				m 2
+	movd	%mm6, 4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	pmuludq	%mm7, %mm1		C				m 2
+	paddq	%mm3, %mm6		C				m 2
+	movd	20(%eax), %mm4		C				m 2
+	movd	%mm6, 8(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	lea	16(%eax), %eax		C				m 2
+	lea	16(%edx), %edx		C				m 2
+	sub	$4, %ecx		C				m 2
+	ja	L(lpm2)			C				m 2
+	pmuludq	%mm7, %mm4		C				m 2
+	paddq	%mm0, %mm6		C				m 2
+	movd	%mm6, -4(%edx)		C				m 2
+	psrlq	$32, %mm6		C				m 2
+	paddq	%mm1, %mm6		C				m 2
+	jmp	L(1)
+
+L(3m):
+	movd	(%eax), %mm0		C				m 3
+	lea	(%ebx), %ecx		C inner loop count		m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	movd	4(%eax), %mm1		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	movd	8(%eax), %mm4		C				m 3
+	jmp	L(lpm3)			C				m 3
+	ALIGN(16)			C				m 3
+L(lpm3):
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	12(%eax), %mm3		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm3		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	movd	16(%eax), %mm0		C				m 3
+	movd	%mm6, 4(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm0		C				m 3
+	paddq	%mm4, %mm6		C				m 3
+	movd	20(%eax), %mm1		C				m 3
+	movd	%mm6, 8(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	pmuludq	%mm7, %mm1		C				m 3
+	paddq	%mm3, %mm6		C				m 3
+	movd	24(%eax), %mm4		C				m 3
+	movd	%mm6, 12(%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	lea	16(%eax), %eax		C				m 3
+	lea	16(%edx), %edx		C				m 3
+	sub	$4, %ecx		C				m 3
+	ja	L(lpm3)			C				m 3
+	pmuludq	%mm7, %mm4		C				m 3
+	paddq	%mm0, %mm6		C				m 3
+	movd	%mm6, (%edx)		C				m 3
+	psrlq	$32, %mm6		C				m 3
+	paddq	%mm1, %mm6		C				m 3
+	jmp	L(2)
+
+L(0m):
+	movd	(%eax), %mm3		C				m 0
+	lea	(%ebx), %ecx		C inner loop count		m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	jmp	L(m00)			C				m 0
+	ALIGN(16)			C				m 0
+L(lpm0):
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	(%eax), %mm3		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm3		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	movd	4(%eax), %mm0		C				m 0
+	movd	%mm6, -8(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	pmuludq	%mm7, %mm0		C				m 0
+	paddq	%mm4, %mm6		C				m 0
+	movd	8(%eax), %mm1		C				m 0
+	movd	%mm6, -4(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+L(m00):	pmuludq	%mm7, %mm1		C				m 0
+	paddq	%mm3, %mm6		C				m 0
+	movd	12(%eax), %mm4		C				m 0
+	movd	%mm6, (%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	lea	16(%eax), %eax		C				m 0
+	lea	16(%edx), %edx		C				m 0
+	sub	$4, %ecx		C				m 0
+	ja	L(lpm0)			C				m 0
+	pmuludq	%mm7, %mm4		C				m 0
+	paddq	%mm0, %mm6		C				m 0
+	movd	%mm6, -12(%edx)		C				m 0
+	psrlq	$32, %mm6		C				m 0
+	paddq	%mm1, %mm6		C				m 0
+	jmp	L(3)
+
+L(outer):
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 3
+	mov	%edi, %edx		C rp2 = rp			am 3
+	lea	4(%esi), %esi		C up++				am 3
+	lea	(%esi), %eax		C up2 = up			am 3
+	movd	(%eax), %mm0		C				am 3
+	lea	(%ebx), %ecx		C inner loop count		am 3
+	pxor	%mm6, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	movd	4(%eax), %mm1		C				am 3
+	movd	(%edx), %mm4		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	movd	8(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	4(%edx), %mm5		C				am 3
+	jmp	L(lam3)			C				am 3
+	ALIGN(16)			C				am 3
+L(lam3):
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	12(%eax), %mm3		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm3		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	16(%eax), %mm0		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+	movd	12(%edx), %mm5		C				am 3
+	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm0		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	20(%eax), %mm1		C				am 3
+	paddq	%mm3, %mm5		C				am 3
+	movd	16(%edx), %mm4		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	pmuludq	%mm7, %mm1		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	movd	24(%eax), %mm2		C				am 3
+	paddq	%mm0, %mm4		C				am 3
+	movd	20(%edx), %mm5		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	lea	16(%eax), %eax		C				am 3
+	lea	16(%edx), %edx		C				am 3
+	sub	$4, %ecx		C				am 3
+	ja	L(lam3)			C				am 3
+	pmuludq	%mm7, %mm2		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	paddq	%mm1, %mm5		C				am 3
+	movd	8(%edx), %mm4		C				am 3
+	movd	%mm6, (%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm5, %mm6		C				am 3
+	paddq	%mm2, %mm4		C				am 3
+L(2):	movd	%mm6, 4(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	paddq	%mm4, %mm6		C				am 3
+	movd	%mm6, 8(%edx)		C				am 3
+	psrlq	$32, %mm6		C				am 3
+	movd	%mm6, 12(%edx)		C				am 3
+
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 2
+	mov	%edi, %edx		C rp2 = rp			am 2
+	lea	4(%esi), %esi		C up++				am 2
+	lea	(%esi), %eax		C up2 = up			am 2
+	movd	(%eax), %mm1		C				am 2
+	lea	(%ebx), %ecx		C inner loop count		am 2
+	pxor	%mm6, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	movd	4(%eax), %mm2		C				am 2
+	movd	(%edx), %mm5		C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	jmp	L(am10)			C				am 2
+	ALIGN(16)			C				am 2
+L(lam2):
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	8(%eax), %mm3		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+L(am10):
+	pmuludq	%mm7, %mm3		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	12(%eax), %mm0		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+	movd	8(%edx), %mm5		C				am 2
+	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm0		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	16(%eax), %mm1		C				am 2
+	paddq	%mm3, %mm5		C				am 2
+	movd	12(%edx), %mm4		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	pmuludq	%mm7, %mm1		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	movd	20(%eax), %mm2		C				am 2
+	paddq	%mm0, %mm4		C				am 2
+	movd	16(%edx), %mm5		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	lea	16(%eax), %eax		C				am 2
+	lea	16(%edx), %edx		C				am 2
+	sub	$4, %ecx		C				am 2
+	ja	L(lam2)			C				am 2
+	pmuludq	%mm7, %mm2		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	paddq	%mm1, %mm5		C				am 2
+	movd	4(%edx), %mm4		C				am 2
+	movd	%mm6, -4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm5, %mm6		C				am 2
+	paddq	%mm2, %mm4		C				am 2
+L(1):	movd	%mm6, (%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	paddq	%mm4, %mm6		C				am 2
+	movd	%mm6, 4(%edx)		C				am 2
+	psrlq	$32, %mm6		C				am 2
+	movd	%mm6, 8(%edx)		C				am 2
+
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 1
+	mov	%edi, %edx		C rp2 = rp			am 1
+	lea	4(%esi), %esi		C up++				am 1
+	lea	(%esi), %eax		C up2 = up			am 1
+	movd	(%eax), %mm2		C				am 1
+	lea	(%ebx), %ecx		C inner loop count		am 1
+	pxor	%mm6, %mm6		C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	jmp	L(am01)			C				am 1
+	ALIGN(16)			C				am 1
+L(lam1):
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	4(%eax), %mm3		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm3		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	8(%eax), %mm0		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+	movd	4(%edx), %mm5		C				am 1
+	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+L(am01):
+	pmuludq	%mm7, %mm0		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	12(%eax), %mm1		C				am 1
+	paddq	%mm3, %mm5		C				am 1
+	movd	8(%edx), %mm4		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	pmuludq	%mm7, %mm1		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	movd	16(%eax), %mm2		C				am 1
+	paddq	%mm0, %mm4		C				am 1
+	movd	12(%edx), %mm5		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	lea	16(%eax), %eax		C				am 1
+	lea	16(%edx), %edx		C				am 1
+	sub	$4, %ecx		C				am 1
+	ja	L(lam1)			C				am 1
+	pmuludq	%mm7, %mm2		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	paddq	%mm1, %mm5		C				am 1
+	movd	(%edx), %mm4		C				am 1
+	movd	%mm6, -8(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm5, %mm6		C				am 1
+	paddq	%mm2, %mm4		C				am 1
+L(0):	movd	%mm6, -4(%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	paddq	%mm4, %mm6		C				am 1
+	movd	%mm6, (%edx)		C				am 1
+	psrlq	$32, %mm6		C				am 1
+	movd	%mm6, 4(%edx)		C				am 1
+
+	lea	8(%edi), %edi		C rp += 2
+	movd	(%esi), %mm7		C				am 0
+	mov	%edi, %edx		C rp2 = rp			am 0
+	lea	4(%esi), %esi		C up++				am 0
+	lea	(%esi), %eax		C up2 = up			am 0
+	movd	(%eax), %mm3		C				am 0
+	lea	(%ebx), %ecx		C inner loop count		am 0
+	pxor	%mm6, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	jmp	L(am00)			C				am 0
+	ALIGN(16)			C				am 0
+L(lam0):
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	(%eax), %mm3		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm3		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	4(%eax), %mm0		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+	movd	(%edx), %mm5		C				am 0
+	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	pmuludq	%mm7, %mm0		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	8(%eax), %mm1		C				am 0
+	paddq	%mm3, %mm5		C				am 0
+	movd	4(%edx), %mm4		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+L(am00):
+	pmuludq	%mm7, %mm1		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	movd	12(%eax), %mm2		C				am 0
+	paddq	%mm0, %mm4		C				am 0
+	movd	8(%edx), %mm5		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	lea	16(%eax), %eax		C				am 0
+	lea	16(%edx), %edx		C				am 0
+	sub	$4, %ecx		C				am 0
+	ja	L(lam0)			C				am 0
+	pmuludq	%mm7, %mm2		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	paddq	%mm1, %mm5		C				am 0
+	movd	-4(%edx), %mm4		C				am 0
+	movd	%mm6, -12(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm5, %mm6		C				am 0
+	paddq	%mm2, %mm4		C				am 0
+L(3):	movd	%mm6, -8(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	paddq	%mm4, %mm6		C				am 0
+	movd	%mm6, -4(%edx)		C				am 0
+	psrlq	$32, %mm6		C				am 0
+	movd	%mm6, (%edx)		C				am 0
+	sub	$4, %ebx		C				am 0
+	ja	L(outer)			C				am 0
+
+	mov	%edi, %edx
+	mov	%esi, %eax
+	pop	%edi
+	pop	%ebx
+	pop	%esi
+
+L(am3):	C up[un-1..un-3] x up[un-4]
+	lea	8(%edx), %edx		C rp2 += 2
+	movd	(%eax), %mm7
+	movd	4(%eax), %mm1
+	movd	8(%eax), %mm2
+	movd	12(%eax), %mm3
+	movd	(%edx), %mm4
+	pmuludq	%mm7, %mm1
+	movd	4(%edx), %mm5
+	pmuludq	%mm7, %mm2
+	movd	8(%edx), %mm6
+	pmuludq	%mm7, %mm3
+	paddq	%mm1, %mm4
+	paddq	%mm2, %mm5
+	paddq	%mm3, %mm6
+	movd	%mm4, (%edx)
+	psrlq	$32, %mm4
+	paddq	%mm5, %mm4
+	movd	%mm4, 4(%edx)
+	psrlq	$32, %mm4
+	paddq	%mm6, %mm4
+	movd	%mm4, 8(%edx)
+	psrlq	$32, %mm4
+	movd	%mm4, 12(%edx)		C FIXME feed through!
+	lea	4(%eax), %eax
+
+L(am2):	C up[un-1..un-2] x up[un-3]
+	lea	8(%edx), %edx		C rp2 += 2
+	movd	(%eax), %mm7
+	movd	4(%eax), %mm1
+	movd	8(%eax), %mm2
+	movd	(%edx), %mm4
+	movd	4(%edx), %mm5
+	pmuludq	%mm7, %mm1
+	pmuludq	%mm7, %mm2
+	paddq	%mm1, %mm4
+	paddq	%mm2, %mm5
+	movd	%mm4, (%edx)
+	psrlq	$32, %mm4
+	paddq	%mm5, %mm4
+	movd	%mm4, 4(%edx)
+	psrlq	$32, %mm4
+	movd	%mm4, 8(%edx)		C FIXME feed through!
+	lea	4(%eax), %eax
+
+L(am1):	C up[un-1] x up[un-2]
+	lea	8(%edx), %edx		C rp2 += 2
+	movd	(%eax), %mm7
+	movd	4(%eax), %mm2
+	movd	(%edx), %mm4
+	pmuludq	%mm7, %mm2
+	paddq	%mm2, %mm4
+	movd	%mm4, (%edx)
+	psrlq	$32, %mm4
+	movd	%mm4, 4(%edx)
+
+C *** diag stuff, use elementary code for now
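+C
+C At this point rp[1 .. 2*un-2] hold the half products up[i]*up[j], i < j;
+C the pass below doubles them and adds in the diagonal squares up[i]^2.
+C As a rough C model (32-bit limbs; rp[0] and rp[2*un-1] have not been
+C written yet, so they count as zero):
+C
+C	  rp[0] = 0;  rp[2*un - 1] = 0;
+C	  uint64_t c = 0;
+C	  for (long i = 0; i < un; i++)
+C	    {
+C	      uint64_t sq = (uint64_t) up[i] * up[i];
+C	      c += 2 * (uint64_t) rp[2*i] + (uint32_t) sq;
+C	      rp[2*i] = (uint32_t) c;
+C	      c >>= 32;
+C	      c += 2 * (uint64_t) rp[2*i + 1] + (sq >> 32);
+C	      rp[2*i + 1] = (uint32_t) c;
+C	      c >>= 32;
+C	    }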
+
+	mov	4(%esp), %edx		C rp
+	mov	8(%esp), %eax		C up
+	mov	12(%esp), %ecx		C un
+
+	movd	(%eax), %mm2
+	pmuludq	%mm2, %mm2		C src[0]^2
+
+	pcmpeqd	%mm7, %mm7
+	psrlq	$32, %mm7
+
+	movd	4(%edx), %mm3		C dst[1]
+
+	movd	%mm2, (%edx)
+	psrlq	$32, %mm2
+
+	psllq	$1, %mm3		C 2*dst[1]
+	paddq	%mm3, %mm2
+	movd	%mm2, 4(%edx)
+	psrlq	$32, %mm2
+
+	sub	$2, %ecx
+
+L(diag):
+	movd	4(%eax), %mm0		C src limb
+	add	$4, %eax
+	pmuludq	%mm0, %mm0
+	movq	%mm7, %mm1
+	pand	%mm0, %mm1		C diagonal low
+	psrlq	$32, %mm0		C diagonal high
+
+	movd	8(%edx), %mm3
+	psllq	$1, %mm3		C 2*dst[i]
+	paddq	%mm3, %mm1
+	paddq	%mm1, %mm2
+	movd	%mm2, 8(%edx)
+	psrlq	$32, %mm2
+
+	movd	12(%edx), %mm3
+	psllq	$1, %mm3		C 2*dst[i+1]
+	paddq	%mm3, %mm0
+	paddq	%mm0, %mm2
+	movd	%mm2, 12(%edx)
+	add	$8, %edx
+	psrlq	$32, %mm2
+
+	sub	$1, %ecx
+	jnz	L(diag)
+
+	movd	4(%eax), %mm0		C src[size-1]
+	pmuludq	%mm0, %mm0
+	pand	%mm0, %mm7		C diagonal low
+	psrlq	$32, %mm0		C diagonal high
+
+	movd	8(%edx), %mm3		C dst[2*size-2]
+	psllq	$1, %mm3
+	paddq	%mm3, %mm7
+	paddq	%mm7, %mm2
+	movd	%mm2, 8(%edx)
+	psrlq	$32, %mm2
+
+	paddq	%mm0, %mm2
+	movd	%mm2, 12(%edx)		C dst[2*size-1]
+
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/sub_n.asm b/third_party/gmp/mpn/x86/pentium4/sse2/sub_n.asm
new file mode 100644
index 0000000..5ba1c01
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/sub_n.asm
@@ -0,0 +1,119 @@
+dnl  Intel Pentium-4 mpn_sub_n -- mpn subtraction.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C					cycles/limb
+C			     dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	4	     6		6
+C P4 model 3-4 (Prescott)	4.25	     7.5	7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+dnl  re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_sub_nc)
+deflit(`FRAME',0)
+	movd	PARAM_CARRY, %mm0
+	jmp	L(start_nc)
+EPILOGUE()
+
+	ALIGN(8)
+PROLOGUE(mpn_sub_n)
+deflit(`FRAME',0)
+	pxor	%mm0, %mm0
+L(start_nc):
+	mov	PARAM_SRC1, %eax
+	mov	%ebx, SAVE_EBX
+	mov	PARAM_SRC2, %ebx
+	mov	PARAM_DST, %edx
+	mov	PARAM_SIZE, %ecx
+
+	lea	(%eax,%ecx,4), %eax	C src1 end
+	lea	(%ebx,%ecx,4), %ebx	C src2 end
+	lea	(%edx,%ecx,4), %edx	C dst end
+	neg	%ecx			C -size
+
+L(top):
+	C eax	src1 end
+	C ebx	src2 end
+	C ecx	counter, limbs, negative
+	C edx	dst end
+	C mm0	carry bit
+
+	movd	(%eax,%ecx,4), %mm1
+	movd	(%ebx,%ecx,4), %mm2
+	psubq	%mm2, %mm1
+
+	psubq	%mm0, %mm1
+	movd	%mm1, (%edx,%ecx,4)
+
+	psrlq	$63, %mm1
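+	C psubq on the zero-extended 32-bit limbs leaves any borrow as the
+	C sign of the 64-bit difference, so the shift right by 63 leaves
+	C exactly the borrow bit, 0 or 1, in %mm1.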
+
+	add	$1, %ecx
+	jz	L(done_mm1)
+
+	movd	(%eax,%ecx,4), %mm0
+	movd	(%ebx,%ecx,4), %mm2
+	psubq	%mm2, %mm0
+
+	psubq	%mm1, %mm0
+	movd	%mm0, (%edx,%ecx,4)
+
+	psrlq	$63, %mm0
+
+	add	$1, %ecx
+	jnz	L(top)
+
+	movd	%mm0, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+L(done_mm1):
+	movd	%mm1, %eax
+	mov	SAVE_EBX, %ebx
+	emms
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/pentium4/sse2/submul_1.asm b/third_party/gmp/mpn/x86/pentium4/sse2/submul_1.asm
new file mode 100644
index 0000000..020675b
--- /dev/null
+++ b/third_party/gmp/mpn/x86/pentium4/sse2/submul_1.asm
@@ -0,0 +1,182 @@
+dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl  subtract the result from a second limb vector.
+
+dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		6.8
+C P6 model 13  (Dothan)		6.9
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)	5.87
+C P4 model 3-4 (Prescott)	6.5
+
+C This code represents a step forwards compared to the code available before
+C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
+C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and
+C Prescott compared to the old code.
+C
+C The arrangements made here to get a two instruction dependent chain are
+C slightly subtle.  In the loop the carry (or rather borrow) is kept in a
+C negative, offset form so that a paddq can be used to give a low limb ready
+C to store, and a high limb ready to become the new carry after a psrlq.
+C
+C If the carry were a simple twos-complement negative then the psrlq shift
+C would need to bring in 0 bits or 1 bits according to whether the high limb
+C was zero or non-zero, since a non-zero value would represent a negative
+C needing sign extension.  That wouldn't be easy to arrange and certainly would
+C add an instruction to the dependent chain, so instead an offset is applied so
+C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to
+C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
+C always positive and can always have 0 bits shifted in, which is what psrlq
+C does.
+C
+C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
+C done off the dependent chain.  The total adjustment then is to add
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
+C to remove the offset from the current carry, for a net add of
+C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
+C fetched.
+C
+C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
+C negative, which is how it's undone for the return value, but that doesn't
+C seem as clear.
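+C
+C  A scaled down example may help (illustrative only, using 4-bit limbs
+C  and an 8-bit accumulator in place of 32 and 64 bits, so 0xF and 0xE1
+C  stand for 0xFFFFFFFF and 0xFFFFFFFE00000001).  Take a destination
+C  limb of 2, a product u*m of 13, and an incoming borrow of 3 held in
+C  offset form as 0xF-3 = 12.  The true result limb is 2-13-3 = -14,
+C  ie. limb 2 with a borrow out of 1.  The loop computes
+C  2 + 0xE1 - 13 + 12 = 0xE2: the low half 2 is the limb to store, and
+C  the high half 0xE = 0xF-1 is the outgoing borrow in offset form,
+C  non-negative as required for the plain psrlq.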
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(16)
+
+PROLOGUE(mpn_submul_1c)
+deflit(`FRAME',0)
+	movd	PARAM_CARRY, %mm1
+	jmp	L(start_1c)
+EPILOGUE()
+
+PROLOGUE(mpn_submul_1)
+deflit(`FRAME',0)
+	pxor	%mm1, %mm1		C initial borrow
+
+L(start_1c):
+	mov	PARAM_SRC, %eax
+	pcmpeqd	%mm0, %mm0
+
+	movd	PARAM_MULTIPLIER, %mm7
+	pcmpeqd	%mm6, %mm6
+
+	mov	PARAM_DST, %edx
+	psrlq	$32, %mm0		C 0x00000000FFFFFFFF
+
+	mov	PARAM_SIZE, %ecx
+	psllq	$32, %mm6		C 0xFFFFFFFF00000000
+
+	psubq	%mm0, %mm6		C 0xFFFFFFFE00000001
+
+	psubq	%mm1, %mm0		C 0xFFFFFFFF - borrow
+
+
+	movd	(%eax), %mm3		C up
+	movd	(%edx), %mm4		C rp
+
+	add	$-1, %ecx
+	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm3
+	jnz	L(gt1)
+	psubq	%mm3, %mm4		C prod
+	paddq	%mm4, %mm0		C borrow
+	movd	%mm0, (%edx)		C result
+	jmp	L(rt)
+
+L(gt1):	movd	4(%eax), %mm1		C up
+	movd	4(%edx), %mm2		C rp
+
+	add	$-1, %ecx
+	jz	L(eev)
+
+	ALIGN(16)
+L(top):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm1
+	psubq	%mm3, %mm4		C prod
+	movd	8(%eax), %mm3		C up
+	paddq	%mm4, %mm0		C borrow
+	movd	8(%edx), %mm4		C rp
+	movd	%mm0, (%edx)		C result
+	psrlq	$32, %mm0
+
+	add	$-1, %ecx
+	jz	L(eod)
+
+	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm3
+	psubq	%mm1, %mm2		C prod
+	movd	12(%eax), %mm1		C up
+	paddq	%mm2, %mm0		C borrow
+	movd	12(%edx), %mm2		C rp
+	movd	%mm0, 4(%edx)		C result
+	psrlq	$32, %mm0
+
+	lea	8(%eax), %eax
+	lea	8(%edx), %edx
+	add	$-1, %ecx
+	jnz	L(top)
+
+
+L(eev):	paddq	%mm6, %mm2		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm1
+	psubq	%mm3, %mm4		C prod
+	paddq	%mm4, %mm0		C borrow
+	movd	%mm0, (%edx)		C result
+	psrlq	$32, %mm0
+	psubq	%mm1, %mm2		C prod
+	paddq	%mm2, %mm0		C borrow
+	movd	%mm0, 4(%edx)		C result
+L(rt):	psrlq	$32, %mm0
+	movd	%mm0, %eax
+	not	%eax
+	emms
+	ret
+
+L(eod):	paddq	%mm6, %mm4		C add 0xFFFFFFFE00000001
+	pmuludq	%mm7, %mm3
+	psubq	%mm1, %mm2		C prod
+	paddq	%mm2, %mm0		C borrow
+	movd	%mm0, 4(%edx)		C result
+	psrlq	$32, %mm0
+	psubq	%mm3, %mm4		C prod
+	paddq	%mm4, %mm0		C borrow
+	movd	%mm0, 8(%edx)		C result
+	jmp	L(rt)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/rshift.asm b/third_party/gmp/mpn/x86/rshift.asm
new file mode 100644
index 0000000..a60dcaa
--- /dev/null
+++ b/third_party/gmp/mpn/x86/rshift.asm
@@ -0,0 +1,108 @@
+dnl  x86 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1992, 1994, 1996, 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/limb
+C P54	 7.5
+C P55	 7.0
+C P6	 2.5
+C K6	 4.5
+C K7	 5.0
+C P4	16.5
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
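+C
+C  The return value is the bits shifted out at the right hand end,
+C  ie. the low `shift' bits of src[0], left justified in the limb.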
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+deflit(`FRAME',12)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%edx
+	movl	PARAM_SHIFT,%ecx
+
+	leal	-4(%edi,%edx,4),%edi
+	leal	(%esi,%edx,4),%esi
+	negl	%edx
+
+	movl	(%esi,%edx,4),%ebx	C read least significant limb
+	xorl	%eax,%eax
+	shrdl(	%cl, %ebx, %eax)	C compute carry limb
+	incl	%edx
+	jz	L(end)
+	pushl	%eax			C push carry limb onto stack
+	testb	$1,%dl
+	jnz	L(1)			C enter loop in the middle
+	movl	%ebx,%eax
+
+	ALIGN(8)
+L(oop):	movl	(%esi,%edx,4),%ebx	C load next higher limb
+	shrdl(	%cl, %ebx, %eax)	C compute result limb
+	movl	%eax,(%edi,%edx,4)	C store it
+	incl	%edx
+L(1):	movl	(%esi,%edx,4),%eax
+	shrdl(	%cl, %eax, %ebx)
+	movl	%ebx,(%edi,%edx,4)
+	incl	%edx
+	jnz	L(oop)
+
+	shrl	%cl,%eax		C compute most significant limb
+	movl	%eax,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+L(end):	shrl	%cl,%ebx		C compute most significant limb
+	movl	%ebx,(%edi)		C store it
+
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/sec_tabselect.asm b/third_party/gmp/mpn/x86/sec_tabselect.asm
new file mode 100644
index 0000000..c7c2e05
--- /dev/null
+++ b/third_party/gmp/mpn/x86/sec_tabselect.asm
@@ -0,0 +1,115 @@
+dnl  x86 mpn_sec_tabselect.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C			    cycles/limb
+C P5				 ?
+C P6 model 0-8,10-12		 ?
+C P6 model 9  (Banias)		 ?
+C P6 model 13 (Dothan)		 ?
+C P4 model 0  (Willamette)	 ?
+C P4 model 1  (?)		 ?
+C P4 model 2  (Northwood)	 4.5
+C P4 model 3  (Prescott)	 ?
+C P4 model 4  (Nocona)		 ?
+C Intel Atom			 ?
+C AMD K6			 ?
+C AMD K7			 3.4
+C AMD K8			 ?
+C AMD K10			 ?
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2 could result in a many-fold speedup.
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%edi')
+define(`tp',     `%esi')
+define(`n',      `%ebx')
+define(`nents',  `%ecx')
+define(`which',  `36(%esp)')
+
+define(`i',      `%ebp')
+define(`maskp',  `20(%esp)')
+define(`maskn',  `32(%esp)')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+	push	%edi
+	push	%esi
+	push	%ebx
+	push	%ebp
+	mov	20(%esp), rp
+	mov	24(%esp), tp
+	mov	28(%esp), n
+	mov	32(%esp), nents
+
+	lea	(rp,n,4), rp
+	lea	(tp,n,4), tp
+	sub	nents, which
+L(outer):
+	mov	which, %eax
+	add	nents, %eax
+	neg	%eax			C set CF iff 'which' != k
+	sbb	%eax, %eax
+	mov	%eax, maskn
+	not	%eax
+	mov	%eax, maskp
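+	C maskp is now all ones at the selected entry and zero at every
+	C other, maskn the complement: the loop below reads and rewrites rp
+	C for every entry but only the selected entry's data survives,
+	C keeping the access pattern independent of 'which'.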
+
+	mov	n, i
+	neg	i
+
+	ALIGN(16)
+L(top):	mov	(tp,i,4), %eax
+	and	maskp, %eax
+	mov	(rp,i,4), %edx
+	and	maskn, %edx
+	or	%edx, %eax
+	mov	%eax, (rp,i,4)
+	inc	i
+	js	L(top)
+
+L(end):	mov	n, %eax
+	lea	(tp,%eax,4), tp
+	dec	nents
+	jne	L(outer)
+
+L(outer_end):
+	pop	%ebp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/silvermont/gmp-mparam.h b/third_party/gmp/mpn/x86/silvermont/gmp-mparam.h
new file mode 100644
index 0000000..e9f1d8f
--- /dev/null
+++ b/third_party/gmp/mpn/x86/silvermont/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Intel Silvermont/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-30, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          9
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     16
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 64.62% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           32
+
+#define DIV_1_VS_MUL_1_PERCENT             204
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD               105
+#define MUL_TOOM44_THRESHOLD               236
+#define MUL_TOOM6H_THRESHOLD               351
+#define MUL_TOOM8H_THRESHOLD               502
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     163
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     174
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     215
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 36
+#define SQR_TOOM3_THRESHOLD                138
+#define SQR_TOOM4_THRESHOLD                360
+#define SQR_TOOM6_THRESHOLD                494
+#define SQR_TOOM8_THRESHOLD                620
+
+#define MULMID_TOOM42_THRESHOLD             58
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             460  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    460, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47, 8}, {     95,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671,10}, {    351, 9}, {    703,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415, 9}, {    831,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    735,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    863,10}, {   1727,12}, {    447,11}, {    959,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1471,11}, \
+    {   2943,13}, {    767,12}, {   1727,11}, {   3455,13}, \
+    {    895,12}, {   1919,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,13}, \
+    {   7935,16} }
+#define MUL_FFT_TABLE3_SIZE 177
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             400  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    400, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     28, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     15, 6}, {     32, 7}, {     17, 6}, \
+    {     35, 7}, {     19, 6}, {     39, 7}, {     21, 8}, \
+    {     11, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287, 8}, \
+    {    575,10}, {    159,11}, {     95,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319, 9}, {    639,10}, {    335, 9}, {    671,10}, \
+    {    351, 9}, {    735,11}, {    191,10}, {    383, 9}, \
+    {    799,10}, {    415, 9}, {    831,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    735, 9}, {   1471,12}, \
+    {    191,11}, {    383,10}, {    799,11}, {    415,10}, \
+    {    863,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,11}, {   1727,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1407,12}, {   2943,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7679,16} }
+#define SQR_FFT_TABLE3_SIZE 175
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  56
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 137
+#define SQRLO_SQR_THRESHOLD               7373
+
+#define DC_DIV_QR_THRESHOLD                 76
+#define DC_DIVAPPR_Q_THRESHOLD             336
+#define DC_BDIV_QR_THRESHOLD                66
+#define DC_BDIV_Q_THRESHOLD                218
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               345
+#define INV_APPR_THRESHOLD                 342
+
+#define BINV_NEWTON_THRESHOLD              366
+#define REDC_1_TO_REDC_N_THRESHOLD          91
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1858
+#define MUPI_DIV_QR_THRESHOLD              171
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1830
+
+#define POWM_SEC_TABLE  3,17,102,404,1185
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD               272
+#define SET_STR_PRECOMPUTE_THRESHOLD       788
+
+#define FAC_DSC_THRESHOLD                  132
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 0.59% faster than 3 */
+#define HGCD_THRESHOLD                     142
+#define HGCD_APPR_THRESHOLD                181
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   492
+#define GCDEXT_DC_THRESHOLD                365
+#define JACOBI_BASE_METHOD                   1  /* 0.41% faster than 2 */
+
+/* Tuneup completed successfully, took 147027 seconds */
diff --git a/third_party/gmp/mpn/x86/skylake/gmp-mparam.h b/third_party/gmp/mpn/x86/skylake/gmp-mparam.h
new file mode 100644
index 0000000..fb87957
--- /dev/null
+++ b/third_party/gmp/mpn/x86/skylake/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* x86/skylake gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                15
+#define MOD_1_UNNORM_THRESHOLD              16
+#define MOD_1N_TO_MOD_1_1_THRESHOLD         10
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          8
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0  /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        10
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 5.63% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD             12
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              17
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           18
+
+#define DIV_1_VS_MUL_1_PERCENT             348
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               208
+#define MUL_TOOM6H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     149
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     145
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     196
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 40
+#define SQR_TOOM3_THRESHOLD                129
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                354
+#define SQR_TOOM8_THRESHOLD                608
+
+#define MULMID_TOOM42_THRESHOLD             72
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               21
+
+#define MUL_FFT_MODF_THRESHOLD             530  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    530, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     21, 6}, \
+    {     43, 7}, {     23, 6}, {     47, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     43, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     51, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     83, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95, 9}, {    191,10}, \
+    {    111,11}, {     63,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    351,11}, \
+    {    191,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    799,11}, {    415,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,13}, {    383,12}, {    767,11}, {   1599,12}, \
+    {    831,11}, {   1727,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,13}, {    767,12}, \
+    {   1727,13}, {    895,12}, {   1919,14}, {    511,13}, \
+    {   1023,12}, {   2239,13}, {   1151,12}, {   2431,13}, \
+    {   1279,12}, {   2623,13}, {   1407,12}, {   2815,14}, \
+    {    767,13}, {   1663,12}, {   3455,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,12}, {   4479,13}, \
+    {   2431,14}, {   1279,13}, {   2943,12}, {   5887,14}, \
+    {   1535,13}, {   3455,14}, {   1791,13}, {   3967,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,16} }
+#define MUL_FFT_TABLE3_SIZE 154
+#define MUL_FFT_THRESHOLD                 6784
+
+#define SQR_FFT_MODF_THRESHOLD             460  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    460, 5}, {     29, 6}, {     15, 5}, {     31, 6}, \
+    {     29, 7}, {     15, 6}, {     33, 7}, {     17, 6}, \
+    {     36, 7}, {     19, 6}, {     39, 7}, {     29, 8}, \
+    {     15, 7}, {     35, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     49, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     43, 9}, \
+    {     23, 8}, {     55,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     79, 9}, {    159,10}, {     95,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    143, 9}, \
+    {    287,10}, {    159,11}, {     95,12}, {     63,11}, \
+    {    127,10}, {    271, 9}, {    543,10}, {    287,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    351,11}, \
+    {    191,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,11}, {    607,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    927,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1407,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,12}, {    895,11}, \
+    {   1791,14}, {    255,13}, {    511,12}, {   1087,11}, \
+    {   2239,12}, {   1215,13}, {    639,12}, {   1471,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2431,13}, {   1279,12}, {   2623,13}, {   1407,12}, \
+    {   2815,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,12}, \
+    {   4479,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3967,15}, {   1023,14}, {   2047,13}, {   4479,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,16} }
+#define SQR_FFT_TABLE3_SIZE 155
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  68
+#define MULLO_MUL_N_THRESHOLD            13555
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                 117
+#define SQRLO_SQR_THRESHOLD              10988
+
+#define DC_DIV_QR_THRESHOLD                 42
+#define DC_DIVAPPR_Q_THRESHOLD             163
+#define DC_BDIV_QR_THRESHOLD                66
+#define DC_BDIV_Q_THRESHOLD                160
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               165
+#define INV_APPR_THRESHOLD                 157
+
+#define BINV_NEWTON_THRESHOLD              300
+#define REDC_1_TO_REDC_N_THRESHOLD          68
+
+#define MU_DIV_QR_THRESHOLD               1718
+#define MU_DIVAPPR_Q_THRESHOLD            1685
+#define MUPI_DIV_QR_THRESHOLD               62
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1830
+
+#define POWM_SEC_TABLE  1,17,129,547,1317
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               354
+#define SET_STR_PRECOMPUTE_THRESHOLD       860
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   34
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    5  /* 1.04% faster than 3 */
+#define HGCD_THRESHOLD                     114
+#define HGCD_APPR_THRESHOLD                132
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   474
+#define GCDEXT_DC_THRESHOLD                379
+#define JACOBI_BASE_METHOD                   1  /* 27.39% faster than 4 */
+
+/* Tuneup completed successfully, took 31721 seconds */
diff --git a/third_party/gmp/mpn/x86/sqr_basecase.asm b/third_party/gmp/mpn/x86/sqr_basecase.asm
new file mode 100644
index 0000000..39f8a89
--- /dev/null
+++ b/third_party/gmp/mpn/x86/sqr_basecase.asm
@@ -0,0 +1,359 @@
+dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct  cycles/triangleproduct
+C P5
+C P6
+C K6
+C K7
+C P4
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+C
+C The mul1 loop is not unrolled the way mul_1.asm is; it doesn't seem worth
+C the code size to do so here.
+C
+C Enhancements:
+C
+C The addmul loop here is also not unrolled like aorsmul_1.asm and
+C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
+C complexity, but if it's worth doing in the other places then it should be
+C worthwhile here.
+C
+C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
+C might be worth considering.  That'd add quite a bit to the code size, but
+C only as much as is used would be dragged into L1 cache.
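+C
+C  In C terms the plan is roughly (a sketch of the generic algorithm,
+C  not the exact library source):
+C
+C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	for (i = 1; i < size-1; i++)
+C	  dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-i-1, src[i]);
+C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
+C
+C  followed by adding src[i]^2 at dst[2*i] for each i, with dst[0]
+C  getting the low limb of src[0]^2.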
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	movl	(%eax), %eax
+	mull	%eax
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	pushl	%esi
+	pushl	%edi
+
+	movl	%edx, %esi	C dst[1]
+	movl	%eax, (%ecx)	C dst[0]
+
+	movl	4(%ebx), %eax
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+deflit(`FRAME',0)
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
+	leal	(%eax,%edx,4), %esi	C &src[size]
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+	movl	(%eax), %ebp		C src[0], multiplier
+	movl	%edx, %ecx
+
+	negl	%ecx			C -size
+	xorl	%ebx, %ebx		C clear carry limb
+
+	incl	%ecx			C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, limbs, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%eax, %ebx
+	adcl	$0, %edx
+	movl	%ebx, (%edi,%ecx,4)
+	movl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+	movl	%ebx, (%edi)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last product src[size-2]*src[size-1], which is the end corner
+	C of the product triangle, is handled separately at the end to save
+	C looping overhead.  If size is 3 then this is all that needs to
+	C be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	subl	$3, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+
+dnl  re-use parameter space
+define(VAR_OUTER,`PARAM_DST')
+
+L(outer):
+	C eax
+	C ebx
+	C ecx
+	C edx	outer loop counter, -(size-3) to -1
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+	C ebp
+
+	movl	%ecx, VAR_OUTER
+	addl	$4, %edi		C advance dst end
+
+	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
+	subl	$1, %ecx
+
+	xorl	%ebx, %ebx		C initial carry limb
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, -n-1 to -1
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%ebx, %eax
+	adcl	$0, %edx
+	addl	%eax, (%edi,%ecx,4)
+	adcl	$0, %edx
+	movl	%edx, %ebx
+	addl	$1, %ecx
+	jl	L(inner)
+
+
+	movl	%ebx, (%edi)
+	movl	VAR_OUTER, %ecx
+	incl	%ecx
+	jnz	L(outer)
+
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+
+	movl	-4(%esi), %eax
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+	addl	%eax, 0(%edi)
+	adcl	$0, %edx
+	movl	%edx, 4(%edi)		C dst high limb
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+	movl	PARAM_SIZE, %eax
+	negl	%eax
+	addl	$1, %eax		C -(size-1) and clear carry
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	rcll	8(%edi,%eax,8)
+	rcll	12(%edi,%eax,8)
+	incl	%eax
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	%eax, 8(%edi)		C dst most significant limb
+
+
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	PARAM_SRC, %esi
+	movl	(%esi), %eax		C src[0]
+	mull	%eax			C src[0]^2
+
+	movl	PARAM_SIZE, %ecx
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx			C -size
+	movl	%edx, %ebx		C initial carry
+
+	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
+	incl	%ecx			C -(size-1)
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, -(size-1) to -1
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%eax
+
+	addl	%ebx, 8(%edi,%ecx,8)
+	movl	%edx, %ebx
+
+	adcl	%eax, 12(%edi,%ecx,8)
+	adcl	$0, %ebx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	addl	%ebx, 8(%edi)		C dst most significant limb
+
+	popl	%ebp
+	popl	%esi
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/t-zdisp.sh b/third_party/gmp/mpn/x86/t-zdisp.sh
new file mode 100755
index 0000000..61efdd6
--- /dev/null
+++ b/third_party/gmp/mpn/x86/t-zdisp.sh
@@ -0,0 +1,71 @@
+#! /bin/sh
+#
+# Copyright 2000 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cd $(builddir)/mpn
+#        $(srcdir)/x86/t-zdisp.sh
+#
+# Run the Zdisp() macro instructions through the assembler to check
+# the encodings used.  Mismatches are printed; no output means all is ok.
+#
+# This program is only meant for use during development.  It can be
+# run in the mpn build directory of any x86 configuration.
+#
+# For this test the assembler needs to generate byte-sized 0
+# displacements when given something like 0(%eax).  Recent versions of
+# gas are suitable (eg. 2.9.x or 2.10.x).
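+#
+# For example, "movl (%eax), %ebx" assembles to 8b 18, while the
+# explicit zero displacement form "movl 0(%eax), %ebx" wanted here is
+# 8b 58 00, ie. mod bits 01 and a single zero displacement byte.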
+
+set -e
+
+cat >tmp-zdisptest.asm <<\EOF
+
+include(`../config.m4')
+
+dnl  Redefine Zdisp_match to output its pattern and encoding.
+define(`Zdisp_match',
+`define(`Zdisp_found',1)dnl
+ifelse(`$2',0,`	$1	$2$3, $4')`'dnl
+ifelse(`$3',0,`	$1	$2, $3$4')`'dnl
+
+	.byte	$5
+')
+	.text
+	Zdisp()
+EOF
+
+m4 tmp-zdisptest.asm >tmp-zdisptest.s
+as -o tmp-zdisptest.o tmp-zdisptest.s
+
+# Demand duplicates from the instruction patterns and byte encodings.
+objdump -d tmp-zdisptest.o | awk '
+/^ *[a-z0-9]+:/ {
+	sub(/^ *[a-z0-9]+:/,"")
+        print
+}' | sort | uniq -u
diff --git a/third_party/gmp/mpn/x86/t-zdisp2.pl b/third_party/gmp/mpn/x86/t-zdisp2.pl
new file mode 100755
index 0000000..b441b65
--- /dev/null
+++ b/third_party/gmp/mpn/x86/t-zdisp2.pl
@@ -0,0 +1,147 @@
+#!/usr/bin/perl -w
+#
+# Copyright 2001, 2002 Free Software Foundation, Inc.
+#
+#  This file is part of the GNU MP Library.
+#
+#  The GNU MP Library is free software; you can redistribute it and/or modify
+#  it under the terms of either:
+#
+#    * the GNU Lesser General Public License as published by the Free
+#      Software Foundation; either version 3 of the License, or (at your
+#      option) any later version.
+#
+#  or
+#
+#    * the GNU General Public License as published by the Free Software
+#      Foundation; either version 2 of the License, or (at your option) any
+#      later version.
+#
+#  or both in parallel, as here.
+#
+#  The GNU MP Library is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+#  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+#  for more details.
+#
+#  You should have received copies of the GNU General Public License and the
+#  GNU Lesser General Public License along with the GNU MP Library.  If not,
+#  see https://www.gnu.org/licenses/.
+
+
+# Usage: cd $(builddir)/mpn
+#        $(srcdir)/x86/t-zdisp2.pl
+#
+# Grep for any "0(reg...)" addressing modes coming out of the x86 .asm
+# files.  Additive expressions like "12+4-16" are recognised too.
+#
+# Old gas doesn't preserve the "0" displacement, so if it's wanted then
+# Zdisp ought to be used to give explicit .byte sequences.  See
+# mpn/x86/README.
+#
+# No output means everything is ok.  All the asm files are put through m4 in
+# PIC and non-PIC modes, and in each multi-function form, all of which can
+# take a while to run.
+#
+# This program is only meant for use during development.
+
+use strict;
+use File::Find;
+use File::Basename;
+use Getopt::Std;
+
+my %opt;
+getopts('t', \%opt);
+
+
+my $srcdir;
+open IN, '<Makefile' or die;
+while (<IN>) {
+  if (/^srcdir[ \t]*=[ \t]*(.*)/) {
+    $srcdir = $1;
+    last;
+  }
+}
+close IN or die;
+defined $srcdir or die "Cannot find \$srcdir in Makefile\n";
+
+my $filecount = 0;
+
+my $tempfile = 't-zdisp2.tmp';
+open KARA, ">$tempfile" or die;
+close KARA or die;
+
+find({ wanted => \&process, preprocess => \&process_mparam, no_chdir => 1 },
+     "$srcdir/x86");
+
+sub process {
+  if (/gmp-mparam.h$/) {
+    process_mparam($_);
+  } elsif (/\.asm$/) {
+    process_asm($_);
+  }
+}
+
+# Ensure we're using the right SQR_TOOM2_THRESHOLD for the part of the
+# tree being processed.
+sub process_mparam {
+  my $file = "$File::Find::dir/gmp-mparam.h";
+  if (-f $file) {
+    print "$file\n" if $opt{'t'};
+    open MPARAM, "<$file" or die;
+    while (<MPARAM>) {
+      if (/^#define SQR_TOOM2_THRESHOLD[ \t]*([0-9][0-9]*)/) {
+        open KARA, ">$tempfile" or die;
+        print KARA "define(\`SQR_TOOM2_THRESHOLD',$1)\n\n";
+        print "define(\`SQR_TOOM2_THRESHOLD',$1)\n" if $opt{'t'};
+        close KARA or die;
+        last;
+      }
+    }
+    close MPARAM or die;
+  }
+  return @_;
+}
+
+sub process_asm {
+  my ($file) = @_;
+  my $base = basename ($file, '.asm');
+
+  my @funs;
+  if    ($base eq 'aors_n')    { @funs = qw(add_n sub_n); }
+  elsif ($base eq 'aorsmul_1') { @funs = qw(addmul_1 submul_1); }
+  elsif ($base eq 'popham')    { @funs = qw(popcount hamdist); }
+  elsif ($base eq 'logops_n')  { @funs = qw(and_n andn_n nand_n ior_n iorn_n nior_n xor_n xnor_n); }
+  elsif ($base eq 'lorrshift') { @funs = qw(lshift rshift); }
+  else                         { @funs = ($base); }
+
+  foreach my $fun (@funs) {
+    foreach my $pic ('', ' -DPIC') {
+      my $header = "$file: 0: $pic\n";
+      $filecount++;
+
+      my $m4 = "m4 -DHAVE_HOST_CPU_athlon -DOPERATION_$fun $pic ../config.m4 $tempfile $file";
+      print "$m4\n" if $opt{'t'};
+
+      open IN, "$m4 |" or die;
+      while (<IN>) {
+        next unless /([0-9+-][0-9 \t+-]*)\(%/;
+        my $pat=$1;
+        $pat = eval($pat);
+        next if ($pat != 0);
+        print "$header$_";
+        $header='';
+      }
+      close IN or die;
+    }
+  }
+}
+
+unlink($tempfile);
+print "total $filecount processed\n";
+exit 0;
+
+
+# Local variables:
+# perl-indent-level: 2
+# End:
diff --git a/third_party/gmp/mpn/x86/udiv.asm b/third_party/gmp/mpn/x86/udiv.asm
new file mode 100644
index 0000000..a3ee088
--- /dev/null
+++ b/third_party/gmp/mpn/x86/udiv.asm
@@ -0,0 +1,52 @@
+dnl  x86 mpn_udiv_qrnnd -- 2 by 1 limb division
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_udiv_qrnnd (mp_limb_t *remptr, mp_limb_t high, mp_limb_t low,
+C                           mp_limb_t divisor);
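+C
+C  Divide the two-limb value high*2^32 + low by divisor, store the
+C  remainder at remptr and return the quotient.  As with a plain divl,
+C  the caller must ensure high < divisor, since otherwise the quotient
+C  would overflow a limb and the division faults.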
+
+defframe(PARAM_DIVISOR, 16)
+defframe(PARAM_LOW,     12)
+defframe(PARAM_HIGH,    8)
+defframe(PARAM_REMPTR,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_udiv_qrnnd)
+deflit(`FRAME',0)
+	movl	PARAM_LOW, %eax
+	movl	PARAM_HIGH, %edx
+	divl	PARAM_DIVISOR
+	movl	PARAM_REMPTR, %ecx
+	movl	%edx, (%ecx)
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/umul.asm b/third_party/gmp/mpn/x86/umul.asm
new file mode 100644
index 0000000..34fe434
--- /dev/null
+++ b/third_party/gmp/mpn/x86/umul.asm
@@ -0,0 +1,51 @@
+dnl  mpn_umul_ppmm -- 1x1->2 limb multiplication
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C mp_limb_t mpn_umul_ppmm (mp_limb_t *lowptr, mp_limb_t m1, mp_limb_t m2);
+C
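+C  Multiply m1 by m2, store the low limb of the product at lowptr and
+C  return the high limb.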
+
+defframe(PARAM_M2,    12)
+defframe(PARAM_M1,     8)
+defframe(PARAM_LOWPTR, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_umul_ppmm)
+deflit(`FRAME',0)
+	movl	PARAM_LOWPTR, %ecx
+	movl	PARAM_M1, %eax
+	mull	PARAM_M2
+	movl	%eax, (%ecx)
+	movl	%edx, %eax
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86/x86-defs.m4 b/third_party/gmp/mpn/x86/x86-defs.m4
new file mode 100644
index 0000000..81309b2
--- /dev/null
+++ b/third_party/gmp/mpn/x86/x86-defs.m4
@@ -0,0 +1,1024 @@
+divert(-1)
+
+dnl  m4 macros for x86 assembler.
+
+dnl  Copyright 1999-2003, 2007, 2010, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Notes:
+dnl
+dnl  m4 isn't perfect for processing BSD style x86 assembler code; the main
+dnl  problems are:
+dnl
+dnl  1. Doing define(foo,123) and then using foo in an addressing mode like
+dnl     foo(%ebx) expands as a macro rather than a constant.  This is worked
+dnl     around by using deflit() from asm-defs.m4, instead of define().
+dnl
+dnl  2. Immediates in macro definitions need a space or `' to stop the $
+dnl     looking like a macro parameter.  For example,
+dnl
+dnl	        define(foo, `mov $ 123, %eax')
+dnl
+dnl     This is only a problem in macro definitions, not in ordinary text,
+dnl     and not in macro parameters like text passed to forloop() or ifdef().
+
+
+deflit(GMP_LIMB_BYTES, 4)
+
+
+dnl  Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL.  We
+dnl  undefine PIC since we don't need to be position independent in this
+dnl  case and definitely don't want the ELF style _GLOBAL_OFFSET_TABLE_ etc.
+
+ifdef(`DLL_EXPORT',`undefine(`PIC')')
+
+
+dnl  Usage: CPUVEC_FUNCS_LIST
+dnl
+dnl  A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
+dnl  order they appear in that structure.
+
+define(CPUVEC_FUNCS_LIST,
+``add_n',
+`addlsh1_n',
+`addlsh2_n',
+`addmul_1',
+`addmul_2',
+`bdiv_dbm1c',
+`cnd_add_n',
+`cnd_sub_n',
+`com',
+`copyd',
+`copyi',
+`divexact_1',
+`divrem_1',
+`gcd_11',
+`lshift',
+`lshiftc',
+`mod_1',
+`mod_1_1p',
+`mod_1_1p_cps',
+`mod_1s_2p',
+`mod_1s_2p_cps',
+`mod_1s_4p',
+`mod_1s_4p_cps',
+`mod_34lsub1',
+`modexact_1c_odd',
+`mul_1',
+`mul_basecase',
+`mullo_basecase',
+`preinv_divrem_1',
+`preinv_mod_1',
+`redc_1',
+`redc_2',
+`rshift',
+`sqr_basecase',
+`sub_n',
+`sublsh1_n',
+`submul_1'')
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  In the x86 code we use explicit TEXT and ALIGN() calls in the code,
+dnl  since different alignments are wanted in various circumstances.  So for
+dnl  instance,
+dnl
+dnl                  TEXT
+dnl                  ALIGN(16)
+dnl          PROLOGUE(mpn_add_n)
+dnl          ...
+dnl          EPILOGUE()
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+m4_assert_defined(`WANT_PROFILING')
+	`GLOBL	$1
+	TYPE($1,`function')
+	COFF_TYPE($1)
+$1:
+ifelse(WANT_PROFILING,`prof',      `	call_mcount')
+ifelse(WANT_PROFILING,`gprof',     `	call_mcount')
+ifelse(WANT_PROFILING,`instrument',`	call_instrument(enter)')
+')
+
+
+dnl  Usage: COFF_TYPE(GSYM_PREFIX`'foo)
+dnl
+dnl  Emit COFF style ".def ... .endef" type information for a function, when
+dnl  supported.  The argument should include any GSYM_PREFIX.
+dnl
+dnl  See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE.
+
+define(COFF_TYPE,
+m4_assert_numargs(1)
+m4_assert_defined(`HAVE_COFF_TYPE')
+`ifelse(HAVE_COFF_TYPE,yes,
+	`.def	$1
+	.scl	2
+	.type	32
+	.endef')')
+
+
+dnl  Usage: call_mcount
+dnl
+dnl  For `gprof' style profiling, %ebp is set up as a frame pointer.  None of
+dnl  the assembler routines use %ebp this way, so it's done only for the
+dnl  benefit of mcount.  glibc sysdeps/i386/i386-mcount.S shows how mcount
+dnl  gets the current function from (%esp) and the parent from 4(%ebp).
+dnl
+dnl  For `prof' style profiling gcc generates mcount calls without setting
+dnl  up %ebp, and the same is done here.
+
+define(`call_mcount',
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+m4_assert_defined(`MCOUNT_PIC_REG')
+m4_assert_defined(`MCOUNT_NONPIC_REG')
+m4_assert_defined(`MCOUNT_PIC_CALL')
+m4_assert_defined(`MCOUNT_NONPIC_CALL')
+`ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,,
+`	DATA
+	ALIGN(4)
+L(mcount_data_`'mcount_counter):
+	W32	0
+	TEXT
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+`	pushl	%ebp
+	movl	%esp, %ebp
+')dnl
+ifdef(`PIC',
+`	pushl	%ebx
+	call_movl_eip_to_ebx
+L(mcount_here_`'mcount_counter):
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(mcount_here_`'mcount_counter)], %ebx
+ifelse(MCOUNT_PIC_REG,,,
+`	leal	L(mcount_data_`'mcount_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG')
+MCOUNT_PIC_CALL
+	popl	%ebx
+',`dnl non-PIC
+ifelse(MCOUNT_NONPIC_REG,,,
+`	movl	`$'L(mcount_data_`'mcount_counter), MCOUNT_NONPIC_REG
+')dnl
+MCOUNT_NONPIC_CALL
+')dnl
+ifelse(WANT_PROFILING,`gprof',
+`	popl	%ebp
+')
+define(`mcount_counter',incr(mcount_counter))
+')
+
+define(mcount_counter,1)
+
+
+dnl  Usage: call_instrument(enter|exit)
+dnl
+dnl  Call __cyg_profile_func_enter or __cyg_profile_func_exit.
+dnl
+dnl  For PIC, most routines don't require _GLOBAL_OFFSET_TABLE_ themselves
+dnl  so %ebx is just setup for these calls.  It's a bit wasteful to repeat
+dnl  the setup for the exit call having done it earlier for the enter, but
+dnl  there's nowhere very convenient to hold %ebx through the length of a
+dnl  routine, in general.
+dnl
+dnl  For PIC, because instrument_current_function will be within the current
+dnl  object file we can get it just as an offset from %eip, there's no need
+dnl  to use the GOT.
+dnl
+dnl  No attempt is made to maintain the stack alignment gcc generates with
+dnl  -mpreferred-stack-boundary.  This wouldn't be hard, but it seems highly
+dnl  unlikely the instrumenting functions would be doing anything that'd
+dnl  benefit from alignment, in particular they're unlikely to be using
+dnl  doubles or long doubles on the stack.
+dnl
+dnl  The FRAME scheme is used to conveniently account for the register saves
+dnl  before accessing the return address.  Any previous value is saved and
+dnl  restored, since plenty of code keeps a value across a "ret" in the
+dnl  middle of a routine.
+
+define(call_instrument,
+m4_assert_numargs(1)
+`	pushdef(`FRAME',0)
+ifelse($1,exit,
+`	pushl	%eax	FRAME_pushl()	C return value
+')
+ifdef(`PIC',
+`	pushl	%ebx	FRAME_pushl()
+	call_movl_eip_to_ebx
+L(instrument_here_`'instrument_count):
+	movl	%ebx, %ecx
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(instrument_here_`'instrument_count)], %ebx
+	C use addl rather than leal to avoid old gas bugs, see mpn/x86/README
+	addl	$instrument_current_function-L(instrument_here_`'instrument_count), %ecx
+	pushl	m4_empty_if_zero(FRAME)(%esp)	FRAME_pushl()	C return addr
+	pushl	%ecx				FRAME_pushl()	C this function
+	call	GSYM_PREFIX`'__cyg_profile_func_$1@PLT
+	addl	$`'8, %esp
+	popl	%ebx
+',
+`	C non-PIC
+	pushl	m4_empty_if_zero(FRAME)(%esp)	FRAME_pushl()	C return addr
+	pushl	$instrument_current_function	FRAME_pushl()	C this function
+	call	GSYM_PREFIX`'__cyg_profile_func_$1
+	addl	$`'8, %esp
+')
+ifelse($1,exit,
+`	popl	%eax			C return value
+')
+	popdef(`FRAME')
+define(`instrument_count',incr(instrument_count))
+')
+define(instrument_count,1)
+
+
+dnl  Usage: instrument_current_function
+dnl
+dnl  Return the current function name for instrumenting purposes.  This is
+dnl  PROLOGUE_current_function, but it sticks at the first such name seen.
+dnl
+dnl  Sticking to the first name seen ensures that multiple-entrypoint
+dnl  functions like mpn_add_nc and mpn_add_n will make enter and exit calls
+dnl  giving the same function address.
+
+define(instrument_current_function,
+m4_assert_numargs(-1)
+`ifdef(`instrument_current_function_seen',
+`instrument_current_function_seen',
+`define(`instrument_current_function_seen',PROLOGUE_current_function)dnl
+PROLOGUE_current_function')')
+
+
+dnl  Usage: call_movl_eip_to_ebx
+dnl
+dnl  Generate a call to L(movl_eip_to_ebx), and record the need for that
+dnl  routine.
+
+define(call_movl_eip_to_ebx,
+m4_assert_numargs(-1)
+`call	L(movl_eip_to_ebx)
+define(`movl_eip_to_ebx_needed',1)')
+
+dnl  Usage: generate_movl_eip_to_ebx
+dnl
+dnl  Emit a L(movl_eip_to_ebx) routine, if needed and not already generated.
+
+define(generate_movl_eip_to_ebx,
+m4_assert_numargs(-1)
+`ifelse(movl_eip_to_ebx_needed,1,
+`ifelse(movl_eip_to_ebx_done,1,,
+`L(movl_eip_to_ebx):
+	movl	(%esp), %ebx
+	ret_internal
+define(`movl_eip_to_ebx_done',1)
+')')')
+
+
+dnl  Usage: ret
+dnl
+dnl  Generate a "ret", but if doing instrumented profiling then call
+dnl  __cyg_profile_func_exit first.
+
+define(ret,
+m4_assert_numargs(-1)
+m4_assert_defined(`WANT_PROFILING')
+`ifelse(WANT_PROFILING,instrument,
+`ret_instrument',
+`ret_internal')
+generate_movl_eip_to_ebx
+')
+
+
+dnl  Usage: ret_internal
+dnl
+dnl  A plain "ret", without any __cyg_profile_func_exit call.  This can be
+dnl  used for a return which is internal to some function, such as when
+dnl  getting %eip for PIC.
+
+define(ret_internal,
+m4_assert_numargs(-1)
+``ret'')
+
+
+dnl  Usage: ret_instrument
+dnl
+dnl  Generate call to __cyg_profile_func_exit and then a ret.  If a ret has
+dnl  already been seen from this function then jump to that chunk of code,
+dnl  rather than emitting it again.
+
+define(ret_instrument,
+m4_assert_numargs(-1)
+`ifelse(m4_unquote(ret_instrument_seen_`'instrument_current_function),1,
+`jmp	L(instrument_exit_`'instrument_current_function)',
+`define(ret_instrument_seen_`'instrument_current_function,1)
+L(instrument_exit_`'instrument_current_function):
+call_instrument(exit)
+	ret_internal')')
+
+
+dnl  Usage: _GLOBAL_OFFSET_TABLE_
+dnl
+dnl  Expand to _GLOBAL_OFFSET_TABLE_ plus any necessary underscore prefix.
+dnl  This lets us write plain _GLOBAL_OFFSET_TABLE_ in SVR4 style, but still
+dnl  work with systems requiring an extra underscore such as OpenBSD.
+dnl
+dnl  deflit is used so "leal _GLOBAL_OFFSET_TABLE_(%eax), %ebx" will come
+dnl  out right, though that form doesn't work properly in gas (see
+dnl  mpn/x86/README).
+
+deflit(_GLOBAL_OFFSET_TABLE_,
+m4_assert_defined(`GOT_GSYM_PREFIX')
+`GOT_GSYM_PREFIX`_GLOBAL_OFFSET_TABLE_'')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Various x86 macros.
+dnl
+
+
+dnl  Usage: ALIGN_OFFSET(bytes,offset)
+dnl
+dnl  Align to `offset' away from a multiple of `bytes'.
+dnl
+dnl  This is useful for testing; for example, align to something very strict
+dnl  and see what effect offsets from it have, as in "ALIGN_OFFSET(256,32)".
+dnl
+dnl  Generally you wouldn't execute across the padding, but it's done with
+dnl  nop's so it'll work.
+
+define(ALIGN_OFFSET,
+m4_assert_numargs(2)
+`ALIGN($1)
+forloop(`i',1,$2,`	nop
+')')
+
+
+dnl  Usage: defframe(name,offset)
+dnl
+dnl  Make a definition like the following with which to access a parameter
+dnl  or variable on the stack.
+dnl
+dnl         define(name,`FRAME+offset(%esp)')
+dnl
+dnl  Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
+dnl  byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
+dnl  Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
+dnl  zero offset is wanted.
+dnl
+dnl  The new macro also gets a check that when it's used FRAME is actually
+dnl  defined, and that the final %esp offset isn't negative, which would
+dnl  mean an attempt to access something below the current %esp.
+dnl
+dnl  deflit() is used rather than a plain define(), so the new macro won't
+dnl  delete any following parenthesized expression.  name(%edi) will come
+dnl  out say as 16(%esp)(%edi).  This isn't valid assembler and should
+dnl  provoke an error, which is better than silently giving just 16(%esp).
+dnl
+dnl  See README for more on the suggested way to access the stack frame.
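+dnl
+dnl  A hypothetical example (name and offset invented), with FRAME currently
+dnl  0 at the point of use,
+dnl
+dnl         defframe(VAR_X, 8)
+dnl         movl    VAR_X, %eax        C expands to: movl 8(%esp), %eax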
+
+define(defframe,
+m4_assert_numargs(2)
+`deflit(`$1',
+m4_assert_defined(`FRAME')
+`defframe_check_notbelow(`$1',$2,FRAME)dnl
+defframe_empty_if_zero(FRAME+($2))(%esp)')')
+
+dnl  Called: defframe_empty_if_zero(expression)
+define(defframe_empty_if_zero,
+m4_assert_numargs(1)
+`ifelse(defframe_empty_if_zero_disabled,1,
+`eval($1)',
+`m4_empty_if_zero($1)')')
+
+dnl  Called: defframe_check_notbelow(`name',offset,FRAME)
+define(defframe_check_notbelow,
+m4_assert_numargs(3)
+`ifelse(eval(($3)+($2)<0),1,
+`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
+')')')
+
+
+dnl  Usage: FRAME_pushl()
+dnl         FRAME_popl()
+dnl         FRAME_addl_esp(n)
+dnl         FRAME_subl_esp(n)
+dnl
+dnl  Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
+dnl  %esp of n bytes.
+dnl
+dnl  Using these macros is completely optional.  Sometimes it makes more
+dnl  sense to put explicit deflit(`FRAME',N) forms, especially when there's
+dnl  jumps and different sequences of FRAME values need to be used in
+dnl  different places.
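+dnl
+dnl  For example (a sketch),
+dnl
+dnl         deflit(`FRAME',0)
+dnl         pushl   %ebx            FRAME_pushl()        C FRAME becomes 4
+dnl         subl    $8, %esp        FRAME_subl_esp(8)    C FRAME becomes 12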
+
+define(FRAME_pushl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+4))')
+
+define(FRAME_popl,
+m4_assert_numargs(0)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-4))')
+
+define(FRAME_addl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME-($1)))')
+
+define(FRAME_subl_esp,
+m4_assert_numargs(1)
+m4_assert_defined(`FRAME')
+`deflit(`FRAME',eval(FRAME+($1)))')
+
+
+dnl  Usage: defframe_pushl(name)
+dnl
+dnl  Do a combination FRAME_pushl() and a defframe() to name the stack
+dnl  location just pushed.  This should come after a pushl instruction.
+dnl  Putting it on the same line works and avoids lengthening the code.  For
+dnl  example,
+dnl
+dnl         pushl   %eax     defframe_pushl(VAR_COUNTER)
+dnl
+dnl  Notice the defframe() is done with an unquoted -FRAME thus giving its
+dnl  current value without tracking future changes.
+
+define(defframe_pushl,
+m4_assert_numargs(1)
+`FRAME_pushl()defframe(`$1',-FRAME)')
+
+
+dnl  --------------------------------------------------------------------------
+dnl  Assembler instruction macros.
+dnl
+
+
+dnl  Usage: emms_or_femms
+dnl         femms_available_p
+dnl
+dnl  femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
+dnl  femms instruction is available.  emms_or_femms expands to femms if
+dnl  available, or emms if not.
+dnl
+dnl  emms_or_femms is meant for use in the K6 directory where plain K6
+dnl  (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
+dnl  supported together.
+dnl
+dnl  On K7 femms is no longer faster and is just an alias for emms, so plain
+dnl  emms may as well be used.
+
+define(femms_available_p,
+m4_assert_numargs(-1)
+`m4_ifdef_anyof_p(
+	`HAVE_HOST_CPU_k62',
+	`HAVE_HOST_CPU_k63',
+	`HAVE_HOST_CPU_athlon')')
+
+define(emms_or_femms,
+m4_assert_numargs(-1)
+`ifelse(femms_available_p,1,`femms',`emms')')
+
+
+dnl  Usage: femms
+dnl
+dnl  Gas 2.9.1 which comes with FreeBSD 3.4 doesn't support femms, so the
+dnl  following is a replacement using .byte.
+
+define(femms,
+m4_assert_numargs(-1)
+`.byte	15,14	C AMD 3DNow femms')
+
+
+dnl  Usage: jadcl0(op)
+dnl
+dnl  Generate a jnc/incl as a substitute for adcl $0,op.  Note this isn't an
+dnl  exact replacement, since it doesn't set the flags like adcl does.
+dnl
+dnl  This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
+dnl  mpn_sqr_basecase because on K6 an adcl is slow, the branch
+dnl  misprediction penalty is small, and the multiply algorithm used leads
+dnl  to a carry bit on average only 1/4 of the time.
+dnl
+dnl  jadcl0_disabled can be set to 1 to instead generate an ordinary adcl
+dnl  for comparison.  For example,
+dnl
+dnl		define(`jadcl0_disabled',1)
+dnl
+dnl  When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
+dnl  the same size as an adcl.  This makes it possible to use the exact same
+dnl  computed jump code when testing the relative speed of the two.
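+dnl
+dnl  For instance "jadcl0( %edx)" expands to, schematically (the label
+dnl  number comes from the counter below),
+dnl
+dnl         jnc     L(jadcl0_1)
+dnl         incl    %edx
+dnl L(jadcl0_1):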
+
+define(jadcl0,
+m4_assert_numargs(1)
+`ifelse(jadcl0_disabled,1,
+	`adcl	$`'0, $1',
+	`jnc	L(jadcl0_`'jadcl0_counter)
+	incl	$1
+L(jadcl0_`'jadcl0_counter):
+define(`jadcl0_counter',incr(jadcl0_counter))')')
+
+define(jadcl0_counter,1)
+
+
+dnl  Usage: x86_lookup(target, key,value, key,value, ...)
+dnl         x86_lookup_p(target, key,value, key,value, ...)
+dnl
+dnl  Look for `target' among the `key' parameters.
+dnl
+dnl  x86_lookup expands to the corresponding `value', or generates an error
+dnl  if `target' isn't found.
+dnl
+dnl  x86_lookup_p expands to 1 if `target' is found, or 0 if not.
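+dnl
+dnl  For example, with the 32-bit register list further down (a sketch),
+dnl
+dnl         x86_lookup(`%ebx',x86_opcode_reg32_list)      expands to 3
+dnl         x86_lookup_p(`%xyz',x86_opcode_reg32_list)    expands to 0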
+
+define(x86_lookup,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+define(x86_lookup_p,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1, `0',
+`ifelse(`$1',`$2',    `1',
+`x86_lookup_p(`$1',shift(shift(shift($@))))')')')
+
+
+dnl  Usage: x86_opcode_reg32(reg)
+dnl         x86_opcode_reg32_p(reg)
+dnl
+dnl  x86_opcode_reg32 expands to the standard 3 bit encoding for the given
+dnl  32-bit register, eg. `%ebp' turns into 5.
+dnl
+dnl  x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
+dnl  if not.
+
+define(x86_opcode_reg32,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_p,
+m4_assert_onearg()
+`x86_lookup_p(`$1',x86_opcode_reg32_list)')
+
+define(x86_opcode_reg32_list,
+``%eax',0,
+`%ecx',1,
+`%edx',2,
+`%ebx',3,
+`%esp',4,
+`%ebp',5,
+`%esi',6,
+`%edi',7')
+
+
+dnl  Usage: x86_opcode_tttn(cond)
+dnl
+dnl  Expand to the 4-bit "tttn" field value for the given x86 branch
+dnl  condition (like `c', `ae', etc).
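+dnl
+dnl  For example, x86_opcode_tttn(`z') expands to 4 and x86_opcode_tttn(`c')
+dnl  to 2, per the list below.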
+
+define(x86_opcode_tttn,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_tttn_list)')
+
+define(x86_opcode_tttn_list,
+``o',  0,
+`no',  1,
+`b',   2, `c',  2, `nae',2,
+`nb',  3, `nc', 3, `ae', 3,
+`e',   4, `z',  4,
+`ne',  5, `nz', 5,
+`be',  6, `na', 6,
+`nbe', 7, `a',  7,
+`s',   8,
+`ns',  9,
+`p',  10, `pe', 10, `npo',10,
+`np', 11, `npe',11, `po', 11,
+`l',  12, `nge',12,
+`nl', 13, `ge', 13,
+`le', 14, `ng', 14,
+`nle',15, `g',  15')
+
+
+dnl  Usage: cmovCC(%srcreg,%dstreg)
+dnl
+dnl  Emit a cmov instruction, using a .byte sequence, since various past
+dnl  versions of gas don't know cmov.  For example,
+dnl
+dnl         cmovz(  %eax, %ebx)
+dnl
+dnl  The source operand can only be a plain register.  (m4 code implementing
+dnl  full memory addressing modes exists, believe it or not, but isn't
+dnl  currently needed and isn't included.)
+dnl
+dnl  All the standard conditions are defined.  Attempting to use one without
+dnl  the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
+dnl  an error.  This protects against writing something old gas wouldn't
+dnl  understand.
+
+dnl  Called: define_cmov_many(cond,tttn,cond,tttn,...)
+define(define_cmov_many,
+`ifelse(m4_length(`$1'),0,,
+`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')
+
+dnl  Called: define_cmov(cond,tttn)
+dnl  Emit basically define(cmov<cond>,`cmov_internal(<cond>,<tttn>,`$1',`$2')')
+define(define_cmov,
+m4_assert_numargs(2)
+`define(`cmov$1',
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`cmov_internal'(m4_doublequote($`'0),``$2'',dnl
+m4_doublequote($`'1),m4_doublequote($`'2)))')
+
+define_cmov_many(x86_opcode_tttn_list)
+
+dnl  Called: cmov_internal(name,tttn,src,dst)
+define(cmov_internal,
+m4_assert_numargs(4)
+`.byte	dnl
+15, dnl
+eval(64+$2), dnl
+eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
+	C `$1 $3, $4'')
+
+
+dnl  Usage: x86_opcode_regmmx(reg)
+dnl
+dnl  Validate the given mmx register, and return its number, 0 to 7.
+
+define(x86_opcode_regmmx,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regmmx_list)')
+
+define(x86_opcode_regmmx_list,
+``%mm0',0,
+`%mm1',1,
+`%mm2',2,
+`%mm3',3,
+`%mm4',4,
+`%mm5',5,
+`%mm6',6,
+`%mm7',7')
+
+
+dnl  Usage: psadbw(%srcreg,%dstreg)
+dnl
+dnl  Oldish versions of gas don't know psadbw, in particular gas 2.9.1 on
+dnl  FreeBSD 3.3 and 3.4 doesn't, so instead emit .byte sequences.  For
+dnl  example,
+dnl
+dnl         psadbw( %mm1, %mm2)
+dnl
+dnl  Only register->register forms are supported here, which suffices for
+dnl  the current code.
+
+define(psadbw,
+m4_instruction_wrapper()
+m4_assert_numargs(2)
+`.byte 0x0f,0xf6,dnl
+eval(192+x86_opcode_regmmx(`$2')*8+x86_opcode_regmmx(`$1')) dnl
+	C `psadbw $1, $2'')
+
+
+dnl  Usage: Zdisp(inst,op,op,op)
+dnl
+dnl  Generate explicit .byte sequences if necessary to force a byte-sized
+dnl  zero displacement on an instruction.  For example,
+dnl
+dnl         Zdisp(  movl,   0,(%esi), %eax)
+dnl
+dnl  expands to
+dnl
+dnl                 .byte   139,70,0  C movl 0(%esi), %eax
+dnl
+dnl  If the displacement given isn't 0, then normal assembler code is
+dnl  generated.  For example,
+dnl
+dnl         Zdisp(  movl,   4,(%esi), %eax)
+dnl
+dnl  expands to
+dnl
+dnl                 movl    4(%esi), %eax
+dnl
+dnl  This means a single Zdisp() form can be used with an expression for the
+dnl  displacement, and .byte will be used only if necessary.  The
+dnl  displacement argument is eval()ed.
+dnl
+dnl  Because there aren't many places a 0(reg) form is wanted, Zdisp is
+dnl  implemented with a table of instructions and encodings.  A new entry is
+dnl  needed for any different operation or registers.  The table is split
+dnl  into separate macros to avoid overflowing BSD m4 macro expansion space.
+
+define(Zdisp,
+m4_assert_numargs(4)
+`define(`Zdisp_found',0)dnl
+Zdisp_1($@)dnl
+Zdisp_2($@)dnl
+Zdisp_3($@)dnl
+Zdisp_4($@)dnl
+ifelse(Zdisp_found,0,
+`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
+')')')
+
+define(Zdisp_1,`dnl
+Zdisp_match( adcl, 0,(%edx), %eax,        `0x13,0x42,0x00',           $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %ebx,        `0x13,0x5a,0x00',           $@)`'dnl
+Zdisp_match( adcl, 0,(%edx), %esi,        `0x13,0x72,0x00',           $@)`'dnl
+Zdisp_match( addl, %ebx, 0,(%edi),        `0x01,0x5f,0x00',           $@)`'dnl
+Zdisp_match( addl, %ecx, 0,(%edi),        `0x01,0x4f,0x00',           $@)`'dnl
+Zdisp_match( addl, %esi, 0,(%edi),        `0x01,0x77,0x00',           $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %eax,        `0x1b,0x42,0x00',           $@)`'dnl
+Zdisp_match( sbbl, 0,(%edx), %esi,        `0x1b,0x72,0x00',           $@)`'dnl
+Zdisp_match( subl, %ecx, 0,(%edi),        `0x29,0x4f,0x00',           $@)`'dnl
+Zdisp_match( movzbl, 0,(%eax,%ebp), %eax, `0x0f,0xb6,0x44,0x28,0x00', $@)`'dnl
+Zdisp_match( movzbl, 0,(%ecx,%edi), %edi, `0x0f,0xb6,0x7c,0x39,0x00', $@)`'dnl
+Zdisp_match( adc, 0,(%ebx,%ecx,4), %eax,  `0x13,0x44,0x8b,0x00',      $@)`'dnl
+Zdisp_match( sbb, 0,(%ebx,%ecx,4), %eax,  `0x1b,0x44,0x8b,0x00',      $@)`'dnl
+')
+define(Zdisp_2,`dnl
+Zdisp_match( movl, %eax, 0,(%edi),        `0x89,0x47,0x00',           $@)`'dnl
+Zdisp_match( movl, %ebx, 0,(%edi),        `0x89,0x5f,0x00',           $@)`'dnl
+Zdisp_match( movl, %esi, 0,(%edi),        `0x89,0x77,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %eax,        `0x8b,0x43,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%ebx), %esi,        `0x8b,0x73,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%edx), %eax,        `0x8b,0x42,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%esi), %eax,        `0x8b,0x46,0x00',           $@)`'dnl
+Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00',      $@)`'dnl
+Zdisp_match( mov, 0,(%esi,%ecx,4), %eax,  `0x8b,0x44,0x8e,0x00',      $@)`'dnl
+Zdisp_match( mov, %eax, 0,(%edi,%ecx,4),  `0x89,0x44,0x8f,0x00',      $@)`'dnl
+')
+define(Zdisp_3,`dnl
+Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%ebx,%ecx,4), %mm0, `0x0f,0x6f,0x44,0x8b,0x00', $@)`'dnl
+Zdisp_match( movq, 0,(%edx), %mm0,        `0x0f,0x6f,0x42,0x00',      $@)`'dnl
+Zdisp_match( movq, 0,(%esi), %mm0,        `0x0f,0x6f,0x46,0x00',      $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edi),        `0x0f,0x7f,0x47,0x00',      $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
+Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
+')
+define(Zdisp_4,`dnl
+Zdisp_match( movd, 0,(%eax,%ecx,4), %mm0, `0x0f,0x6e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
+Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
+Zdisp_match( movd, %mm0, 0,(%edx,%ecx,4), `0x0f,0x7e,0x44,0x8a,0x00', $@)`'dnl
+')
+
+define(Zdisp_match,
+m4_assert_numargs(9)
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+	&& m4_stringequal_p(`$2',0)
+	&& m4_stringequal_p(`$3',`$8')
+	&& m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$7'),0,
+`	.byte	$5  C `$1 0$3, $4'',
+`	$6	$7$8, $9')',
+
+`ifelse(eval(m4_stringequal_p(`$1',`$6')
+	&& m4_stringequal_p(`$2',`$7')
+	&& m4_stringequal_p(`$3',0)
+	&& m4_stringequal_p(`$4',`$9')),1,
+`define(`Zdisp_found',1)dnl
+ifelse(eval(`$8'),0,
+`	.byte	$5  C `$1 $2, 0$4'',
+`	$6	$7, $8$9')')')')
+
+
+dnl  Usage: shldl(count,src,dst)
+dnl         shrdl(count,src,dst)
+dnl         shldw(count,src,dst)
+dnl         shrdw(count,src,dst)
+dnl
+dnl  Generate a double-shift instruction, possibly omitting a %cl count
+dnl  parameter if that's what the assembler requires, as indicated by
+dnl  WANT_SHLDL_CL in config.m4.  For example,
+dnl
+dnl         shldl(  %cl, %eax, %ebx)
+dnl
+dnl  turns into either
+dnl
+dnl         shldl   %cl, %eax, %ebx
+dnl  or
+dnl         shldl   %eax, %ebx
+dnl
+dnl  Immediate counts are always passed through unchanged.  For example,
+dnl
+dnl         shrdl(  $2, %esi, %edi)
+dnl  becomes
+dnl         shrdl   $2, %esi, %edi
+dnl
+dnl
+dnl  If you forget to use the macro form "shldl( ...)" and instead write
+dnl  just a plain "shldl ...", an error results.  This ensures the necessary
+dnl  variant treatment of %cl isn't accidentally bypassed.
+
+define(define_shd_instruction,
+m4_assert_numargs(1)
+`define($1,
+m4_instruction_wrapper()
+m4_assert_numargs(3)
+`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
+m4_doublequote($`'2),m4_doublequote($`'3)))')
+
+dnl  Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
+define_shd_instruction(shldl)
+define_shd_instruction(shrdl)
+define_shd_instruction(shldw)
+define_shd_instruction(shrdw)
+
+dnl  Called: shd_instruction(op,count,src,dst)
+define(shd_instruction,
+m4_assert_numargs(4)
+m4_assert_defined(`WANT_SHLDL_CL')
+`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
+``$1'	`$3', `$4'',
+``$1'	`$2', `$3', `$4'')')
+
+
+dnl  Usage: ASSERT([cond][,instructions])
+dnl
+dnl  If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl  flags condition to then be satisfied.  For example,
+dnl
+dnl         ASSERT(ne, `cmpl %eax, %ebx')
+dnl
+dnl  The instructions can be omitted to just assert a flags condition with
+dnl  no extra calculation.  For example,
+dnl
+dnl         ASSERT(nc)
+dnl
+dnl  When `instructions' is not empty, a pushf/popf is added to preserve the
+dnl  flags, but the instructions themselves must preserve any registers that
+dnl  matter.  FRAME is adjusted for the push and pop, so the instructions
+dnl  given can use defframe() stack variables.
+dnl
+dnl  The condition can be omitted to just output the given instructions when
+dnl  assertion checking is wanted.  In this case the pushf/popf is omitted.
+dnl  For example,
+dnl
+dnl         ASSERT(, `movl %eax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+	`$2',
+	`C ASSERT
+ifelse(`$2',,,`	pushf	ifdef(`FRAME',`FRAME_pushl()')')
+	$2
+	j`$1'	L(ASSERT_ok`'ASSERT_counter)
+	ud2	C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,`	popf	ifdef(`FRAME',`FRAME_popl()')')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+
+dnl  Usage: movl_text_address(label,register)
+dnl
+dnl  Get the address of a text segment label, using either a plain movl or a
+dnl  position-independent calculation, as necessary.  For example,
+dnl
+dnl         movl_text_address(L(foo),%eax)
+dnl
+dnl  This macro is only meant for use in ASSERT()s or when testing, since
+dnl  the PIC sequence it generates will want to be done with a ret balancing
+dnl  the call on CPUs with return address branch prediction.
+dnl
+dnl  The addl generated here has a backward reference to the label, and so
+dnl  won't suffer from the two forward references bug in old gas (described
+dnl  in mpn/x86/README).
+
+define(movl_text_address,
+m4_assert_numargs(2)
+`ifdef(`PIC',
+	`call	L(movl_text_address_`'movl_text_address_counter)
+L(movl_text_address_`'movl_text_address_counter):
+	popl	$2	C %eip
+	addl	`$'$1-L(movl_text_address_`'movl_text_address_counter), $2
+define(`movl_text_address_counter',incr(movl_text_address_counter))',
+	`movl	`$'$1, $2')')
+
+define(movl_text_address_counter,1)
+
+
+dnl  Usage: notl_or_xorl_GMP_NUMB_MASK(reg)
+dnl
+dnl  Expand to either "notl `reg'" or "xorl $GMP_NUMB_MASK,`reg'" as
+dnl  appropriate for nails in use or not.
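+dnl
+dnl  For example, with GMP_NAIL_BITS 0, notl_or_xorl_GMP_NUMB_MASK(`%eax')
+dnl  expands to plain "notl %eax".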
+
+define(notl_or_xorl_GMP_NUMB_MASK,
+m4_assert_numargs(1)
+`ifelse(GMP_NAIL_BITS,0,
+`notl	`$1'',
+`xorl	$GMP_NUMB_MASK, `$1'')')
+
+
+dnl  Usage: LEA(symbol,reg)
+dnl         LEAL(symbol_local_to_file,reg)
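+dnl
+dnl  Load the address of symbol into reg: under PIC via the GOT for LEA, or
+dnl  via @GOTOFF for LEAL (symbols local to the file); otherwise a plain
+dnl  movl.  A hypothetical use (symbol name invented),
+dnl
+dnl         LEA(    some_table, %edx)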
+
+define(`LEA',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')dnl
+	call	L(movl_eip_`'substr($2,1))
+	addl	$_GLOBAL_OFFSET_TABLE_, $2
+	movl	$1@GOT($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+define(`LEAL',
+m4_assert_numargs(2)
+`ifdef(`PIC',`dnl
+ifelse(index(defn(`load_eip'), `$2'),-1,
+`m4append(`load_eip',
+`	TEXT
+	ALIGN(16)
+L(movl_eip_`'substr($2,1)):
+	movl	(%esp), $2
+	ret_internal
+')')dnl
+	call	L(movl_eip_`'substr($2,1))
+	addl	$_GLOBAL_OFFSET_TABLE_, $2
+	leal	$1@GOTOFF($2), $2
+',`
+	movl	`$'$1, $2
+')')
+
+dnl ASM_END
+
+define(`ASM_END',`load_eip')
+
+define(`load_eip', `')		dnl updated in LEA/LEAL
+
+
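+dnl  Usage: DEF_OBJECT(name[,alignment])
+dnl         END_OBJECT(name)
+dnl
+dnl  Emit an object in read-only data with size information; alignment
+dnl  defaults to 2.  A hypothetical example (name and data invented),
+dnl
+dnl         DEF_OBJECT(L(table),16)
+dnl                 .long   1,2,3,4
+dnl         END_OBJECT(L(table))
+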
+define(`DEF_OBJECT',
+m4_assert_numargs_range(1,2)
+	`RODATA
+	ALIGN(ifelse($#,1,2,$2))
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	SIZE(`$1',.-`$1')')
+
+dnl  Usage: CALL(funcname)
+dnl
+
+define(`CALL',
+m4_assert_numargs(1)
+`ifdef(`PIC',
+  `call	GSYM_PREFIX`'$1@PLT',
+  `call	GSYM_PREFIX`'$1')')
+
+ifdef(`PIC',
+`define(`PIC_WITH_EBX')',
+`undefine(`PIC_WITH_EBX')')
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/x86/zn1/gmp-mparam.h b/third_party/gmp/mpn/x86/zn1/gmp-mparam.h
new file mode 100644
index 0000000..8e6c052
--- /dev/null
+++ b/third_party/gmp/mpn/x86/zn1/gmp-mparam.h
@@ -0,0 +1,220 @@
+/* AMD zn1/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3700-4300 MHz Pinnacle Ridge */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-21, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               3
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 14.00% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define DIV_1_VS_MUL_1_PERCENT             248
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                91
+#define MUL_TOOM44_THRESHOLD               137
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               454
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      85
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     103
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      88
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     105
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 63
+#define SQR_TOOM3_THRESHOLD                 98
+#define SQR_TOOM4_THRESHOLD                172
+#define SQR_TOOM6_THRESHOLD                286
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             64
+
+#define MULMOD_BNM1_THRESHOLD               21
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             606  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    606, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     15, 5}, {     31, 6}, {     27, 7}, {     15, 6}, \
+    {     33, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     29, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543, 8}, {   1087,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    399,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607, 9}, {   1215,11}, {    319,10}, \
+    {    671, 9}, {   1343,11}, {    351,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    671,10}, {   1343,11}, \
+    {    735,10}, {   1471,12}, {    383,11}, {    799,10}, \
+    {   1599,11}, {    863,10}, {   1727,12}, {    447,11}, \
+    {    959,10}, {   1919,11}, {    991,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1215,10}, \
+    {   2431,12}, {    639,11}, {   1343,12}, {    703,11}, \
+    {   1471,10}, {   2943,13}, {    383,12}, {    767,11}, \
+    {   1599,12}, {    831,11}, {   1727,10}, {   3455,12}, \
+    {    959,11}, {   1919,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2239,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,13}, {    767,12}, \
+    {   1727,11}, {   3455,13}, {    895,12}, {   1983,14}, \
+    {    511,13}, {   1023,12}, {   2239,13}, {   1151,12}, \
+    {   2495,13}, {   1279,12}, {   2623,13}, {   1407,12}, \
+    {   2943,14}, {    767,13}, {   1663,12}, {   3455,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,12}, {   4479,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3967,12}, {   7935,11}, {  15871,15}, \
+    {   1023,14}, {   2047,13}, {   4479,14}, {   2303,13}, \
+    {   4991,12}, {   9983,14}, {   2815,13}, {   5887,15}, \
+    {   1535,14}, {   3839,13}, {   7935,12}, {  15871,16} }
+#define MUL_FFT_TABLE3_SIZE 172
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             464  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    464, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     51,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,11}, {    159, 9}, {    639,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    703,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399, 9}, \
+    {    799,10}, {    415,12}, {    127,11}, {    255,10}, \
+    {    543,11}, {    287,10}, {    607,11}, {    319,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471,12}, {    383,11}, \
+    {    799,10}, {   1599,11}, {    863,12}, {    447,11}, \
+    {    959,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    959,11}, {   1919,14}, {    255,13}, \
+    {    511,12}, {   1087,11}, {   2239,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1471,11}, {   2943,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2239,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,12}, {   4479,13}, {   2431,14}, \
+    {   1279,13}, {   2943,12}, {   5887,14}, {   1535,13}, \
+    {   3455,14}, {   1791,13}, {   3839,12}, {   7679,13}, \
+    {   3967,12}, {   7935,15}, {   1023,14}, {   2047,13}, \
+    {   4479,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,13}, \
+    {   7935,16} }
+#define SQR_FFT_TABLE3_SIZE 173
+#define SQR_FFT_THRESHOLD                 4736
+
+#define MULLO_BASECASE_THRESHOLD             3
+#define MULLO_DC_THRESHOLD                  60
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                 161
+#define SQRLO_SQR_THRESHOLD               9335
+
+#define DC_DIV_QR_THRESHOLD                 71
+#define DC_DIVAPPR_Q_THRESHOLD             206
+#define DC_BDIV_QR_THRESHOLD                63
+#define DC_BDIV_Q_THRESHOLD                126
+
+#define INV_MULMOD_BNM1_THRESHOLD           78
+#define INV_NEWTON_THRESHOLD               274
+#define INV_APPR_THRESHOLD                 228
+
+#define BINV_NEWTON_THRESHOLD              274
+#define REDC_1_TO_REDC_N_THRESHOLD          71
+
+#define MU_DIV_QR_THRESHOLD               1652
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define POWM_SEC_TABLE  3,28,54,386,1337
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               262
+#define SET_STR_PRECOMPUTE_THRESHOLD       558
+
+#define FAC_DSC_THRESHOLD                  109
+#define FAC_ODD_THRESHOLD                   39
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    1  /* 7.49% faster than 3 */
+#define HGCD_THRESHOLD                      74
+#define HGCD_APPR_THRESHOLD                 70
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   440
+#define GCDEXT_DC_THRESHOLD                327
+#define JACOBI_BASE_METHOD                   1  /* 11.98% faster than 3 */
+
+/* Tuneup completed successfully, took 36916 seconds */
diff --git a/third_party/gmp/mpn/x86/zn2/gmp-mparam.h b/third_party/gmp/mpn/x86/zn2/gmp-mparam.h
new file mode 100644
index 0000000..152e6b7
--- /dev/null
+++ b/third_party/gmp/mpn/x86/zn2/gmp-mparam.h
@@ -0,0 +1,226 @@
+/* AMD zn2/32 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 3600-4400 MHz Matisse */
+/* FFT tuning limit = 67,000,000 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        15
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 4.78% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               7
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           23
+
+#define DIV_1_VS_MUL_1_PERCENT             274
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                85
+#define MUL_TOOM44_THRESHOLD               166
+#define MUL_TOOM6H_THRESHOLD               290
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     130
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 26
+#define SQR_TOOM3_THRESHOLD                153
+#define SQR_TOOM4_THRESHOLD                214
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             48
+
+#define MULMOD_BNM1_THRESHOLD               18
+#define SQRMOD_BNM1_THRESHOLD               24
+
+#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    444, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     31, 7}, {     63, 8}, \
+    {     39, 9}, {     23, 8}, {     51,10}, {     15, 9}, \
+    {     31, 8}, {     67, 9}, {     39, 8}, {     79, 9}, \
+    {     47,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    127,10}, \
+    {     79, 9}, {    159,10}, {     95,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,10}, {    335, 9}, \
+    {    671, 8}, {   1343,10}, {    351, 9}, {    703,10}, \
+    {    367, 9}, {    735,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    415,11}, {    223,10}, {    447,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    607,11}, {    319,10}, {    671, 9}, \
+    {   1343,11}, {    351,10}, {    735,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,11}, \
+    {    447,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,11}, {    607,10}, {   1215,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,10}, {   1471, 9}, \
+    {   2943,12}, {    383,11}, {    799,10}, {   1599,11}, \
+    {    863,12}, {    447,11}, {    959,10}, {   1919,13}, \
+    {    255,12}, {    511,11}, {   1087,12}, {    575,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1343,12}, \
+    {    703,11}, {   1471,10}, {   2943,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    831,11}, {   1727,10}, \
+    {   3455,12}, {    959,11}, {   1919,10}, {   3839,14}, \
+    {    255,13}, {    511,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,10}, {   5887,13}, \
+    {    767,12}, {   1727,11}, {   3455,13}, {    895,12}, \
+    {   1919,11}, {   3839,14}, {    511,13}, {   1023,12}, \
+    {   2111,13}, {   1151,12}, {   2431,13}, {   1407,12}, \
+    {   2943,11}, {   5887,14}, {    767,13}, {   1663,12}, \
+    {   3455,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2431,14}, {   1279,13}, {   2943,12}, \
+    {   5887,14}, {   1535,13}, {   3455,14}, {   1791,13}, \
+    {   3839,12}, {   7679,13}, {   3967,12}, {   7935,11}, \
+    {  15871,15}, {   1023,14}, {   2047,13}, {   4351,14}, \
+    {   2303,13}, {   4991,12}, {   9983,14}, {   2815,13}, \
+    {   5887,15}, {   1535,14}, {   3839,13}, {   7935,12}, \
+    {  15871,16} }
+#define MUL_FFT_TABLE3_SIZE 189
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    404, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     19, 6}, {     39, 7}, {     23, 6}, \
+    {     47, 7}, {     27, 8}, {     15, 7}, {     35, 8}, \
+    {     19, 7}, {     41, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     63, 9}, \
+    {     39, 8}, {     79, 9}, {     47,10}, {     31, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     95,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    511, 9}, {    271,10}, {    143, 9}, \
+    {    287, 8}, {    607, 7}, {   1215,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543, 8}, {   1087, 9}, {    607, 8}, \
+    {   1215,11}, {    159, 9}, {    671, 8}, {   1343,10}, \
+    {    351, 9}, {    735, 8}, {   1471,11}, {    191,10}, \
+    {    383, 9}, {    767,10}, {    415,11}, {    223,12}, \
+    {    127,11}, {    255,10}, {    543, 9}, {   1087,10}, \
+    {    607, 9}, {   1215, 8}, {   2431,10}, {    671, 9}, \
+    {   1343,10}, {    735, 9}, {   1471,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,13}, \
+    {    127,12}, {    255,11}, {    543,10}, {   1087,11}, \
+    {    607,10}, {   1215, 9}, {   2431,11}, {    671,10}, \
+    {   1343,11}, {    735,10}, {   1471, 9}, {   2943,12}, \
+    {    383,11}, {    863,12}, {    447,11}, {    959,10}, \
+    {   1919,13}, {    255,12}, {    511,11}, {   1087,12}, \
+    {    575,11}, {   1215,10}, {   2431,12}, {    639,11}, \
+    {   1343,12}, {    703,11}, {   1471,10}, {   2943, 9}, \
+    {   5887,12}, {    767,11}, {   1599,12}, {    831,11}, \
+    {   1727,12}, {    959,11}, {   1919,10}, {   3839,14}, \
+    {    255,13}, {    511,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1471,11}, {   2943,10}, {   5887,13}, \
+    {    767,12}, {   1727,13}, {    895,12}, {   1919,11}, \
+    {   3839,14}, {    511,13}, {   1023,12}, {   2111,13}, \
+    {   1151,12}, {   2431,13}, {   1279,12}, {   2623,13}, \
+    {   1407,12}, {   2943,11}, {   5887,14}, {    767,13}, \
+    {   1663,12}, {   3455,13}, {   1919,12}, {   3839,15}, \
+    {    511,14}, {   1023,13}, {   2431,14}, {   1279,13}, \
+    {   2943,12}, {   5887,14}, {   1535,13}, {   3455,14}, \
+    {   1791,13}, {   3839,12}, {   7679,13}, {   3967,12}, \
+    {   7935,11}, {  15871,15}, {   1023,14}, {   2047,13}, \
+    {   4223,14}, {   2303,13}, {   4991,12}, {   9983,14}, \
+    {   2815,13}, {   5887,15}, {   1535,14}, {   3839,13}, \
+    {   7935,12}, {  15871,16} }
+#define SQR_FFT_TABLE3_SIZE 178
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             4
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                 107
+#define SQRLO_SQR_THRESHOLD               6633
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             206
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                136
+
+#define INV_MULMOD_BNM1_THRESHOLD           74
+#define INV_NEWTON_THRESHOLD               212
+#define INV_APPR_THRESHOLD                 204
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1442
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD               97
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  1,16,96,386,1555
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        16
+#define SET_STR_DC_THRESHOLD               303
+#define SET_STR_PRECOMPUTE_THRESHOLD       748
+
+#define FAC_DSC_THRESHOLD                  141
+#define FAC_ODD_THRESHOLD                   55
+
+#define MATRIX22_STRASSEN_THRESHOLD         20
+#define HGCD2_DIV1_METHOD                    1  /* 14.03% faster than 3 */
+#define HGCD_THRESHOLD                     103
+#define HGCD_APPR_THRESHOLD                127
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   396
+#define GCDEXT_DC_THRESHOLD                265
+#define JACOBI_BASE_METHOD                   1  /* 47.88% faster than 4 */
+
+/* Tuneup completed successfully, took 29014 seconds */
diff --git a/third_party/gmp/mpn/x86_64/README b/third_party/gmp/mpn/x86_64/README
new file mode 100644
index 0000000..9c8a586
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/README
@@ -0,0 +1,74 @@
+Copyright 2003, 2004, 2006, 2008 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+			AMD64 MPN SUBROUTINES
+
+
+This directory contains mpn functions for AMD64 chips.  It is also useful
+for 64-bit Pentiums and "Core 2".
+
+
+		     RELEVANT OPTIMIZATION ISSUES
+
+The Opteron and Athlon64 can sustain up to 3 instructions per cycle, but in
+practice that is only possible for integer instructions.  Almost any three
+integer instructions can issue simultaneously, including any three ALU
+operations (shifts among them).  Up to two memory operations can issue each
+cycle.
+
+Scheduling typically requires that load-use instructions are split into
+separate load and use instructions.  That requires more decode resources,
+and it is rarely a win, since the Opteron/Athlon64 have a deep out-of-order
+core.
+
+
+Optimizing for 64-bit Pentium4 is probably a waste of time, as the most
+critical instructions are very poorly implemented here.  Perhaps we could
+save a cycle or two, but the most common loops now run at between 10 and 22
+cycles, so a saved cycle isn't too exciting.
+
+
+The new spin of the venerable P6 core, the "Core 2", is much better than the
+Pentium4 for the GMP loops.  Its integer pipeline is somewhat similar to
+the Opteron/Athlon64 pipeline, except that the GMP favourites ADC/SBB and
+MUL are slower.  Furthermore, an INC/DEC followed by ADC/SBB incur a
+pipeline stall of around 10 cycles.  The default mpn_add_n and mpn_sub_n
+code suffers badly from the stall.  The code in the core2 subdirectory uses
+the almost forgotten instruction JRCXZ for loop control, and updates the
+induction variable using LEA.
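+
+Schematically the loop control is (a sketch, not the actual code; pointers
+are biased so an index register runs from -n up to zero):
+
+L(top):	mov	(%rsi,%rcx,8), %rax
+	adc	(%rdx,%rcx,8), %rax
+	mov	%rax, (%rdi,%rcx,8)
+	lea	1(%rcx), %rcx		# update induction variable, flags untouched
+	jrcxz	L(end)			# reads only %rcx, so the carry survives
+	jmp	L(top)
+L(end):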
+
+
+
+REFERENCES
+
+"System V Application Binary Interface AMD64 Architecture Processor
+Supplement", draft version 0.99, December 2007.
+http://www.x86-64.org/documentation/abi.pdf
diff --git a/third_party/gmp/mpn/x86_64/addaddmul_1msb0.asm b/third_party/gmp/mpn/x86_64/addaddmul_1msb0.asm
new file mode 100644
index 0000000..87c21b4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/addaddmul_1msb0.asm
@@ -0,0 +1,170 @@
+dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.167
+C AMD K10	 2.167
+C Intel P4	12.0
+C Intel core2	 4.0
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C TODO
+C  * Perhaps handle various n mod 3 sizes better.  The code now is too large.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`ap',	`%rsi')
+define(`bp_param', `%rdx')
+define(`n',	`%rcx')
+define(`u0',	`%r8')
+define(`v0',	`%r9')
+
+
+define(`bp', `%rbp')
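+
+C In C terms the routine computes, schematically (a sketch only; the real
+C prototype lives in GMP's internal headers):
+C
+C	mp_limb_t cy = 0;
+C	for (i = 0; i < n; i++) {
+C	    unsigned __int128 acc = (unsigned __int128) ap[i]*u0
+C				  + (unsigned __int128) bp[i]*v0 + cy;
+C	    rp[i] = (mp_limb_t) acc;		/* low limb out */
+C	    cy = (mp_limb_t) (acc >> 64);	/* carry into next limb */
+C	}
+C	return cy;
+C
+C The msb0 condition u0,v0 < 2^63 keeps this accumulation from ever
+C overflowing two limbs.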
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addaddmul_1msb0)
+	push	%r12
+	push	%rbp
+
+	lea	(ap,n,8), ap
+	lea	(bp_param,n,8), bp
+	lea	(rp,n,8), rp
+	neg	n
+
+	mov	(ap,n,8), %rax
+	mul	%r8
+	mov	%rax, %r12
+	mov	(bp,n,8), %rax
+	mov	%rdx, %r10
+	add	$3, n
+	jns	L(end)
+
+	ALIGN(16)
+L(top):	mul	%r9
+	add	%rax, %r12
+	mov	-16(ap,n,8), %rax
+	adc	%rdx, %r10
+	mov	%r12, -24(rp,n,8)
+	mul	%r8
+	add	%rax, %r10
+	mov	-16(bp,n,8), %rax
+	mov	$0, R32(%r11)
+	adc	%rdx, %r11
+	mul	%r9
+	add	%rax, %r10
+	mov	-8(ap,n,8), %rax
+	adc	%rdx, %r11
+	mov	%r10, -16(rp,n,8)
+	mul	%r8
+	add	%rax, %r11
+	mov	-8(bp,n,8), %rax
+	mov	$0, R32(%r12)
+	adc	%rdx, %r12
+	mul	%r9
+	add	%rax, %r11
+	adc	%rdx, %r12
+	mov	(ap,n,8), %rax
+	mul	%r8
+	add	%rax, %r12
+	mov	%r11, -8(rp,n,8)
+	mov	(bp,n,8), %rax
+	mov	$0, R32(%r10)
+	adc	%rdx, %r10
+	add	$3, n
+	js	L(top)
+
+L(end):	cmp	$1, R32(n)
+	ja	2f
+	jz	1f
+
+	mul	%r9
+	add	%rax, %r12
+	mov	-16(ap), %rax
+	adc	%rdx, %r10
+	mov	%r12, -24(rp)
+	mul	%r8
+	add	%rax, %r10
+	mov	-16(bp), %rax
+	mov	$0, R32(%r11)
+	adc	%rdx, %r11
+	mul	%r9
+	add	%rax, %r10
+	mov	-8(ap), %rax
+	adc	%rdx, %r11
+	mov	%r10, -16(rp)
+	mul	%r8
+	add	%rax, %r11
+	mov	-8(bp), %rax
+	mov	$0, R32(%r12)
+	adc	%rdx, %r12
+	mul	%r9
+	add	%rax, %r11
+	adc	%rdx, %r12
+	mov	%r11, -8(rp)
+	mov	%r12, %rax
+	pop	%rbp
+	pop	%r12
+	ret
+
+1:	mul	%r9
+	add	%rax, %r12
+	mov	-8(ap), %rax
+	adc	%rdx, %r10
+	mov	%r12, -16(rp)
+	mul	%r8
+	add	%rax, %r10
+	mov	-8(bp), %rax
+	mov	$0, R32(%r11)
+	adc	%rdx, %r11
+	mul	%r9
+	add	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, -8(rp)
+	mov	%r11, %rax
+	pop	%rbp
+	pop	%r12
+	ret
+
+2:	mul	%r9
+	add	%rax, %r12
+	mov	%r12, -8(rp)
+	adc	%rdx, %r10
+	mov	%r10, %rax
+	pop	%rbp
+	pop	%r12
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/aorrlsh1_n.asm
new file mode 100644
index 0000000..6ee0872
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aorrlsh1_n.asm
@@ -0,0 +1,170 @@
+dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl  Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 2
+C AMD K10	 2
+C AMD bd1	 ?
+C AMD bobcat	 ?
+C Intel P4	 13
+C Intel core2	 3.45
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+
+C Sometimes speed degenerates, supposedly because some operand alignments
+C cause cache conflicts.
+
+C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
+C in the loop; at 3 decodes per cycle over 4 limbs that corresponds to
+C (22/3)/4 = 1.83 c/l.
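+C
+C The loop keeps two carry chains alive -- the shift carry ("scy") and the
+C add/subtract carry ("acy") -- using an idiom seen throughout this file:
+C SBB of a register with itself materialises CF as 0 or -1, and adding
+C that register to itself later regenerates CF.  A sketch (not the actual
+C loop):
+C
+C	sbb	R32(%rax), R32(%rax)	C save:  eax <- 0 or -1 from CF
+C	...				C flag-clobbering work here
+C	add	R32(%rax), R32(%rax)	C restore:  CF <- 1 iff eax was -1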
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+ifdef(`OPERATION_addlsh1_n', `
+  define(ADDSUB,	add)
+  define(ADCSBB,	adc)
+  define(func,		mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+  define(ADDSUB,	sub)
+  define(ADCSBB,	sbb)
+  define(func,		mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbp
+
+	mov	(vp), %r8
+	mov	R32(n), R32(%rax)
+	lea	(rp,n,8), rp
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	neg	n
+	xor	R32(%rbp), R32(%rbp)
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	add	%r8, %r8
+	mov	8(vp,n,8), %r9
+	adc	%r9, %r9
+	mov	16(vp,n,8), %r10
+	adc	%r10, %r10
+	sbb	R32(%rax), R32(%rax)	C save scy
+	ADDSUB	(up,n,8), %r8
+	ADCSBB	8(up,n,8), %r9
+	mov	%r8, (rp,n,8)
+	mov	%r9, 8(rp,n,8)
+	ADCSBB	16(up,n,8), %r10
+	mov	%r10, 16(rp,n,8)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	add	$3, n
+	jmp	L(ent)
+
+L(b10):	add	%r8, %r8
+	mov	8(vp,n,8), %r9
+	adc	%r9, %r9
+	sbb	R32(%rax), R32(%rax)	C save scy
+	ADDSUB	(up,n,8), %r8
+	ADCSBB	8(up,n,8), %r9
+	mov	%r8, (rp,n,8)
+	mov	%r9, 8(rp,n,8)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	add	$2, n
+	jmp	L(ent)
+
+L(b01):	add	%r8, %r8
+	sbb	R32(%rax), R32(%rax)	C save scy
+	ADDSUB	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	inc	n
+L(ent):	jns	L(end)
+
+	ALIGN(16)
+L(top):	add	R32(%rax), R32(%rax)	C restore scy
+
+	mov	(vp,n,8), %r8
+L(b00):	adc	%r8, %r8
+	mov	8(vp,n,8), %r9
+	adc	%r9, %r9
+	mov	16(vp,n,8), %r10
+	adc	%r10, %r10
+	mov	24(vp,n,8), %r11
+	adc	%r11, %r11
+
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+
+	ADCSBB	(up,n,8), %r8
+	nop				C Hammer speedup!
+	ADCSBB	8(up,n,8), %r9
+	mov	%r8, (rp,n,8)
+	mov	%r9, 8(rp,n,8)
+	ADCSBB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	%r10, 16(rp,n,8)
+	mov	%r11, 24(rp,n,8)
+
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	add	$4, n
+	js	L(top)
+
+L(end):
+ifdef(`OPERATION_addlsh1_n',`
+	add	R32(%rbp), R32(%rax)
+	neg	R32(%rax)')
+ifdef(`OPERATION_rsblsh1_n',`
+	sub	R32(%rax), R32(%rbp)
+	movslq	R32(%rbp), %rax')
+
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aorrlsh2_n.asm b/third_party/gmp/mpn/x86_64/aorrlsh2_n.asm
new file mode 100644
index 0000000..999e972
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl  AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009-2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n',`
+  define(ADDSUB,	add)
+  define(ADCSBB,	adc)
+  define(func,		mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n',`
+  define(ADDSUB,	sub)
+  define(ADCSBB,	sbb)
+  define(func,		mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/aorrlshC_n.asm b/third_party/gmp/mpn/x86_64/aorrlshC_n.asm
new file mode 100644
index 0000000..de00154
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aorrlshC_n.asm
@@ -0,0 +1,172 @@
+dnl  AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
+dnl  AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
+
+dnl  Copyright 2009-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+C	     cycles/limb
+C AMD K8,K9	 2.1
+C AMD K10	 2.0
+C AMD bd1	~2.7
+C AMD bd2	~2.7
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 2.0
+C AMD bt1	 3.3
+C AMD bt2	 3.0
+C Intel P4	 ?
+C Intel PNR	 3.0
+C Intel NHM	 2.75
+C Intel SBR	 2.55
+C Intel IBR	 2.49
+C Intel HWL	 2.25
+C Intel BWL	 1.89
+C Intel SKL	 1.90
+C Intel atom	 8.4
+C Intel SLM	 4.0
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+
+define(M, eval(m4_lshift(1,LSH)))
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(vp), %r8
+	lea	(,%r8,M), %r12
+	shr	$RSH, %r8
+
+	mov	R32(n), R32(%rax)
+	lea	(rp,n,8), rp
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	neg	n
+	and	$3, R8(%rax)
+	je	L(b00)
+	cmp	$2, R8(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	mov	8(vp,n,8), %r10
+	lea	(%r8,%r10,M), %r14
+	shr	$RSH, %r10
+	mov	16(vp,n,8), %r11
+	lea	(%r10,%r11,M), %r15
+	shr	$RSH, %r11
+	ADDSUB	(up,n,8), %r12
+	ADCSBB	8(up,n,8), %r14
+	ADCSBB	16(up,n,8), %r15
+	sbb	R32(%rax), R32(%rax)		  C save carry for next
+	mov	%r12, (rp,n,8)
+	mov	%r14, 8(rp,n,8)
+	mov	%r15, 16(rp,n,8)
+	add	$3, n
+	js	L(top)
+	jmp	L(end)
+
+L(b01):	mov	%r8, %r11
+	ADDSUB	(up,n,8), %r12
+	sbb	R32(%rax), R32(%rax)		  C save carry for next
+	mov	%r12, (rp,n,8)
+	add	$1, n
+	js	L(top)
+	jmp	L(end)
+
+L(b10):	mov	8(vp,n,8), %r11
+	lea	(%r8,%r11,M), %r15
+	shr	$RSH, %r11
+	ADDSUB	(up,n,8), %r12
+	ADCSBB	8(up,n,8), %r15
+	sbb	R32(%rax), R32(%rax)		  C save carry for next
+	mov	%r12, (rp,n,8)
+	mov	%r15, 8(rp,n,8)
+	add	$2, n
+	js	L(top)
+	jmp	L(end)
+
+L(b00):	mov	8(vp,n,8), %r9
+	mov	16(vp,n,8), %r10
+	jmp	L(e00)
+
+	ALIGN(16)
+L(top):	mov	16(vp,n,8), %r10
+	mov	(vp,n,8), %r8
+	mov	8(vp,n,8), %r9
+	lea	(%r11,%r8,M), %r12
+	shr	$RSH, %r8
+L(e00):	lea	(%r8,%r9,M), %r13
+	shr	$RSH, %r9
+	mov	24(vp,n,8), %r11
+	lea	(%r9,%r10,M), %r14
+	shr	$RSH, %r10
+	lea	(%r10,%r11,M), %r15
+	shr	$RSH, %r11
+	add	R32(%rax), R32(%rax)		  C restore carry
+	ADCSBB	(up,n,8), %r12
+	ADCSBB	8(up,n,8), %r13
+	ADCSBB	16(up,n,8), %r14
+	ADCSBB	24(up,n,8), %r15
+	mov	%r12, (rp,n,8)
+	mov	%r13, 8(rp,n,8)
+	mov	%r14, 16(rp,n,8)
+	sbb	R32(%rax), R32(%rax)		  C save carry for next
+	mov	%r15, 24(rp,n,8)
+	add	$4, n
+	js	L(top)
+L(end):
+
+ifelse(ADDSUB,add,`
+	sub	R32(%r11), R32(%rax)
+	neg	R32(%rax)
+',`
+	add	R32(%r11), R32(%rax)
+	movslq	R32(%rax), %rax
+')
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/aorrlsh_n.asm
new file mode 100644
index 0000000..5ca128f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aorrlsh_n.asm
@@ -0,0 +1,176 @@
+dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V*2^k +- U.
+
+dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 3.1	< 3.85 for lshift + add_n
+C AMD K10	 3.1	< 3.85 for lshift + add_n
+C Intel P4	14.6	> 7.33 for lshift + add_n
+C Intel core2	 3.87	> 3.27 for lshift + add_n
+C Intel NHM	 4	> 3.75 for lshift + add_n
+C Intel SBR	(5.8)	> 3.46 for lshift + add_n
+C Intel atom	(7.75)	< 8.75 for lshift + add_n
+C VIA nano	 4.7	< 6.25 for lshift + add_n
+
+C This was written quickly and not optimized at all.  Surely one could get
+C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
+C   1) Use indexing to save the 3 LEA
+C   2) Write reasonable feed-in code
+C   3) Be more clever about register usage
+C   4) Unroll more; CL negation and carry save/restore cost much now
+C   5) Reschedule
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnt',	`%r8')
+
+ifdef(`OPERATION_addlsh_n',`
+  define(ADCSBB,       `adc')
+  define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+  define(ADCSBB,       `sbb')
+  define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%rbp
+	push	%rbx
+
+	mov	n, %rax
+	xor	R32(%rbx), R32(%rbx)	C clear carry save register
+	mov	R32(%r8), R32(%rcx)	C shift count
+	xor	R32(%rbp), R32(%rbp)	C limb carry
+
+	mov	R32(%rax), R32(%r11)
+	and	$3, R32(%r11)
+	je	L(4)
+	sub	$1, R32(%r11)
+
+L(012):	mov	(vp), %r8
+	mov	%r8, %r12
+	shl	R8(%rcx), %r8
+	or	%rbp, %r8
+	neg	R8(%rcx)
+	mov	%r12, %rbp
+	shr	R8(%rcx), %rbp
+	neg	R8(%rcx)
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	sbb	R32(%rbx), R32(%rbx)
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	sub	$1, R32(%r11)
+	jnc	L(012)
+
+L(4):	sub	$4, %rax
+	jc	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp), %r8
+	mov	%r8, %r12
+	mov	8(vp), %r9
+	mov	%r9, %r13
+	mov	16(vp), %r10
+	mov	%r10, %r14
+	mov	24(vp), %r11
+
+	shl	R8(%rcx), %r8
+	shl	R8(%rcx), %r9
+	shl	R8(%rcx), %r10
+	or	%rbp, %r8
+	mov	%r11, %rbp
+	shl	R8(%rcx), %r11
+
+	neg	R8(%rcx)
+
+	shr	R8(%rcx), %r12
+	shr	R8(%rcx), %r13
+	shr	R8(%rcx), %r14
+	shr	R8(%rcx), %rbp		C used next iteration
+
+	or	%r12, %r9
+	or	%r13, %r10
+	or	%r14, %r11
+
+	neg	R8(%rcx)
+
+	add	R32(%rbx), R32(%rbx)	C restore carry flag
+
+	ADCSBB	(up), %r8
+	ADCSBB	8(up), %r9
+	ADCSBB	16(up), %r10
+	ADCSBB	24(up), %r11
+
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%r11, 24(rp)
+
+	sbb	R32(%rbx), R32(%rbx)	C save carry flag
+
+	lea	32(up), up
+	lea	32(vp), vp
+	lea	32(rp), rp
+
+	sub	$4, %rax
+	jnc	L(top)
+
+L(end):	add	R32(%rbx), R32(%rbx)
+	ADCSBB	$0, %rbp
+	mov	%rbp, %rax
+	pop	%rbx
+	pop	%rbp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aors_err1_n.asm b/third_party/gmp/mpn/x86_64/aors_err1_n.asm
new file mode 100644
index 0000000..54d0b3f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aors_err1_n.asm
@@ -0,0 +1,225 @@
+dnl  AMD64 mpn_add_err1_n, mpn_sub_err1_n
+
+dnl  Contributed by David Harvey.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.75 (degenerates to 3 c/l for some alignments)
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`ep',	`%rcx')
+define(`yp',	`%r8')
+define(`n',	`%r9')
+define(`cy_param',	`8(%rsp)')
+
+define(`el',	`%rbx')
+define(`eh',	`%rbp')
+define(`t0',	`%r10')
+define(`t1',	`%r11')
+define(`t2',	`%r12')
+define(`t3',	`%r13')
+define(`w0',	`%r14')
+define(`w1',	`%r15')
+
+ifdef(`OPERATION_add_err1_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_err1_n)')
+ifdef(`OPERATION_sub_err1_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_err1_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	mov	cy_param, %rax
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	lea	(rp,n,8), rp
+
+	mov	R32(n), R32(%r10)
+	and	$3, R32(%r10)
+	jz	L(0mod4)
+	cmp	$2, R32(%r10)
+	jc	L(1mod4)
+	jz	L(2mod4)
+L(3mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	xor	R32(t0), R32(t0)
+	xor	R32(t1), R32(t1)
+	lea	-24(yp,n,8), yp
+	neg	n
+
+	shr	$1, %al		   C restore carry
+	mov	(up,n,8), w0
+	mov	8(up,n,8), w1
+	ADCSBB	(vp,n,8), w0
+	mov	w0, (rp,n,8)
+	cmovc	16(yp), el
+	ADCSBB	8(vp,n,8), w1
+	mov	w1, 8(rp,n,8)
+	cmovc	8(yp), t0
+	mov	16(up,n,8), w0
+	ADCSBB	16(vp,n,8), w0
+	mov	w0, 16(rp,n,8)
+	cmovc	(yp), t1
+	setc	%al		   C save carry
+	add	t0, el
+	adc	$0, eh
+	add	t1, el
+	adc	$0, eh
+
+	add	$3, n
+	jnz	L(loop)
+	jmp	L(end)
+
+	ALIGN(16)
+L(0mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	lea	(yp,n,8), yp
+	neg	n
+	jmp	L(loop)
+
+	ALIGN(16)
+L(1mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	lea	-8(yp,n,8), yp
+	neg	n
+
+	shr	$1, %al		   C restore carry
+	mov	(up,n,8), w0
+	ADCSBB	(vp,n,8), w0
+	mov	w0, (rp,n,8)
+	cmovc	(yp), el
+	setc	%al		   C save carry
+
+	add	$1, n
+	jnz	L(loop)
+	jmp	L(end)
+
+	ALIGN(16)
+L(2mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	xor	R32(t0), R32(t0)
+	lea	-16(yp,n,8), yp
+	neg	n
+
+	shr	$1, %al		   C restore carry
+	mov	(up,n,8), w0
+	mov	8(up,n,8), w1
+	ADCSBB	(vp,n,8), w0
+	mov	w0, (rp,n,8)
+	cmovc	8(yp), el
+	ADCSBB	8(vp,n,8), w1
+	mov	w1, 8(rp,n,8)
+	cmovc	(yp), t0
+	setc	%al		   C save carry
+	add	t0, el
+	adc	$0, eh
+
+	add	$2, n
+	jnz	L(loop)
+	jmp	L(end)
+
+	ALIGN(32)
+L(loop):
+	shr	$1, %al		   C restore carry
+	mov	-8(yp), t0
+	mov	$0, R32(t3)
+	mov	(up,n,8), w0
+	mov	8(up,n,8), w1
+	ADCSBB	(vp,n,8), w0
+	cmovnc	t3, t0
+	ADCSBB	8(vp,n,8), w1
+	mov	-16(yp), t1
+	mov	w0, (rp,n,8)
+	mov	16(up,n,8), w0
+	mov	w1, 8(rp,n,8)
+	cmovnc	t3, t1
+	mov	-24(yp), t2
+	ADCSBB	16(vp,n,8), w0
+	cmovnc	t3, t2
+	mov	24(up,n,8), w1
+	ADCSBB	24(vp,n,8), w1
+	cmovc	-32(yp), t3
+	setc	%al		   C save carry
+	add	t0, el
+	adc	$0, eh
+	add	t1, el
+	adc	$0, eh
+	add	t2, el
+	adc	$0, eh
+	mov	w0, 16(rp,n,8)
+	add	t3, el
+	lea	-32(yp), yp
+	adc	$0, eh
+	mov	w1, 24(rp,n,8)
+	add	$4, n
+	jnz	L(loop)
+
+L(end):
+	mov	el, (ep)
+	mov	eh, 8(ep)
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aors_err2_n.asm b/third_party/gmp/mpn/x86_64/aors_err2_n.asm
new file mode 100644
index 0000000..ce5c2a4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aors_err2_n.asm
@@ -0,0 +1,172 @@
+dnl  AMD64 mpn_add_err2_n, mpn_sub_err2_n
+
+dnl  Contributed by David Harvey.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4.5
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 6.9
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`ep',	`%rcx')
+define(`yp1',	`%r8')
+define(`yp2',   `%r9')
+define(`n_param',     `8(%rsp)')
+define(`cy_param',    `16(%rsp)')
+
+define(`cy1',   `%r14')
+define(`cy2',   `%rax')
+
+define(`n',     `%r10')
+
+define(`w',     `%rbx')
+define(`e1l',	`%rbp')
+define(`e1h',	`%r11')
+define(`e2l',	`%r12')
+define(`e2h',	`%r13')
+
+
+ifdef(`OPERATION_add_err2_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_err2_n)')
+ifdef(`OPERATION_sub_err2_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_err2_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err2_n mpn_sub_err2_n)
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	mov	cy_param, cy2
+	mov	n_param, n
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	xor	R32(e1l), R32(e1l)
+	xor	R32(e1h), R32(e1h)
+	xor	R32(e2l), R32(e2l)
+	xor	R32(e2h), R32(e2h)
+
+	sub	yp1, yp2
+
+	lea	(rp,n,8), rp
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+
+	test	$1, n
+	jnz	L(odd)
+
+	lea	-8(yp1,n,8), yp1
+	neg	n
+	jmp	L(top)
+
+	ALIGN(16)
+L(odd):
+	lea	-16(yp1,n,8), yp1
+	neg	n
+	shr	$1, cy2
+	mov	(up,n,8), w
+	ADCSBB	(vp,n,8), w
+	cmovc	8(yp1), e1l
+	cmovc	8(yp1,yp2), e2l
+	mov	w, (rp,n,8)
+	sbb	cy2, cy2
+	inc	n
+	jz	L(end)
+
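+C The error terms are accumulated branch-free: SBB of a register with
+C itself turns the borrow into a 0/-1 mask, and ANDing that mask with a
+C yp limb yields either 0 or the limb, which is then added into the
+C (e1h:e1l) and (e2h:e2l) accumulators.
+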
+	ALIGN(16)
+L(top):
+	mov	(up,n,8), w
+	shr	$1, cy2		C restore carry
+	ADCSBB	(vp,n,8), w
+	mov	w, (rp,n,8)
+	sbb	cy1, cy1	C generate mask, preserve CF
+
+	mov	8(up,n,8), w
+	ADCSBB	8(vp,n,8), w
+	mov	w, 8(rp,n,8)
+	sbb	cy2, cy2	C generate mask, preserve CF
+
+	mov	(yp1), w	C (e1h:e1l) += cy1 * yp1 limb
+	and	cy1, w
+	add	w, e1l
+	adc	$0, e1h
+
+	and	(yp1,yp2), cy1	C (e2h:e2l) += cy1 * yp2 limb
+	add	cy1, e2l
+	adc	$0, e2h
+
+	mov	-8(yp1), w	C (e1h:e1l) += cy2 * next yp1 limb
+	and	cy2, w
+	add	w, e1l
+	adc	$0, e1h
+
+	mov	-8(yp1,yp2), w	C (e2h:e2l) += cy2 * next yp2 limb
+	and	cy2, w
+	add	w, e2l
+	adc	$0, e2h
+
+	add	$2, n
+	lea	-16(yp1), yp1
+	jnz	L(top)
+L(end):
+
+	mov	e1l, (ep)
+	mov	e1h, 8(ep)
+	mov	e2l, 16(ep)
+	mov	e2h, 24(ep)
+
+	and	$1, %eax	C return carry
+
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aors_err3_n.asm b/third_party/gmp/mpn/x86_64/aors_err3_n.asm
new file mode 100644
index 0000000..bb6d0c5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aors_err3_n.asm
@@ -0,0 +1,156 @@
+dnl  AMD64 mpn_add_err3_n, mpn_sub_err3_n
+
+dnl  Contributed by David Harvey.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 7.0
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`ep',	`%rcx')
+define(`yp1',	`%r8')
+define(`yp2',   `%r9')
+define(`yp3_param',   `8(%rsp)')
+define(`n_param',     `16(%rsp)')
+define(`cy_param',    `24(%rsp)')
+
+define(`n',     `%r10')
+define(`yp3',   `%rcx')
+define(`t',     `%rbx')
+
+define(`e1l',	`%rbp')
+define(`e1h',	`%r11')
+define(`e2l',	`%r12')
+define(`e2h',	`%r13')
+define(`e3l',   `%r14')
+define(`e3h',   `%r15')
+
+
+
+ifdef(`OPERATION_add_err3_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_err3_n)')
+ifdef(`OPERATION_sub_err3_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_err3_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err3_n mpn_sub_err3_n)
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	mov	cy_param, %rax
+	mov	n_param, n
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	push	ep
+	mov	64(%rsp), yp3       C load from yp3_param
+
+	xor	R32(e1l), R32(e1l)
+	xor	R32(e1h), R32(e1h)
+	xor	R32(e2l), R32(e2l)
+	xor	R32(e2h), R32(e2h)
+	xor	R32(e3l), R32(e3l)
+	xor	R32(e3h), R32(e3h)
+
+	sub	yp1, yp2
+	sub	yp1, yp3
+
+	lea	-8(yp1,n,8), yp1
+	lea	(rp,n,8), rp
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	neg	n
+
+	ALIGN(16)
+L(top):
+	shr	$1, %rax		C restore carry
+	mov	(up,n,8), %rax
+	ADCSBB	(vp,n,8), %rax
+	mov	%rax, (rp,n,8)
+	sbb	%rax, %rax		C save carry and generate mask
+
+	mov	(yp1), t
+	and	%rax, t
+	add	t, e1l
+	adc	$0, e1h
+
+	mov	(yp1,yp2), t
+	and	%rax, t
+	add	t, e2l
+	adc	$0, e2h
+
+	mov	(yp1,yp3), t
+	and	%rax, t
+	add	t, e3l
+	adc	$0, e3h
+
+	lea	-8(yp1), yp1
+	inc	n
+	jnz     L(top)
+
+L(end):
+	and	$1, %eax
+	pop	ep
+
+	mov	e1l, (ep)
+	mov	e1h, 8(ep)
+	mov	e2l, 16(ep)
+	mov	e2h, 24(ep)
+	mov	e3l, 32(ep)
+	mov	e3h, 40(ep)
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aors_n.asm b/third_party/gmp/mpn/x86_64/aors_n.asm
new file mode 100644
index 0000000..d5a314a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aors_n.asm
@@ -0,0 +1,178 @@
+dnl  AMD64 mpn_add_n, mpn_sub_n
+
+dnl  Copyright 2003-2005, 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 1.5
+C AMD K10	 1.5
+C AMD bd1	 1.8
+C AMD bd2	 1.74
+C AMD bd3	 ?
+C AMD bd4	 1.78
+C AMD zen	 1.5
+C AMD bt1	 2.54
+C AMD bt2	 2.15
+C Intel P4	11.5
+C Intel core2	 4.9
+C Intel NHM	 5.53
+C Intel SBR	 1.59
+C Intel IBR	 1.55
+C Intel HWL	 1.44
+C Intel BWL	 1.14
+C Intel SKL	 1.21
+C Intel atom	 4
+C Intel SLM	 3
+C VIA nano	 3.25
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	mov	R32(n), R32(%rax)
+	shr	$2, n
+	and	$3, R32(%rax)
+	bt	$0, %r8			C cy flag <- carry parameter
+	jrcxz	L(lt4)
+
+	mov	(up), %r8
+	mov	8(up), %r9
+	dec	n
+	jmp	L(mid)
+
+EPILOGUE()
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	R32(n), R32(%rax)
+	shr	$2, n
+	and	$3, R32(%rax)
+	jrcxz	L(lt4)
+
+	mov	(up), %r8
+	mov	8(up), %r9
+	dec	n
+	jmp	L(mid)
+
+L(lt4):	dec	R32(%rax)
+	mov	(up), %r8
+	jnz	L(2)
+	ADCSBB	(vp), %r8
+	mov	%r8, (rp)
+	adc	R32(%rax), R32(%rax)
+	FUNC_EXIT()
+	ret
+
+L(2):	dec	R32(%rax)
+	mov	8(up), %r9
+	jnz	L(3)
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	FUNC_EXIT()
+	ret
+
+L(3):	mov	16(up), %r10
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	setc	R8(%rax)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(top):	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	ADCSBB	24(vp), %r11
+	mov	%r8, (rp)
+	lea	32(up), up
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	dec	n
+	mov	%r11, 24(rp)
+	lea	32(vp), vp
+	mov	(up), %r8
+	mov	8(up), %r9
+	lea	32(rp), rp
+L(mid):	mov	16(up), %r10
+	mov	24(up), %r11
+	jnz	L(top)
+
+L(end):	lea	32(up), up
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	ADCSBB	24(vp), %r11
+	lea	32(vp), vp
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%r11, 24(rp)
+	lea	32(rp), rp
+
+	inc	R32(%rax)
+	dec	R32(%rax)
+	jnz	L(lt4)
+	adc	R32(%rax), R32(%rax)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/aorsmul_1.asm
new file mode 100644
index 0000000..dfe4dc4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.52
+C AMD K10	 2.51
+C AMD bd1	 4.43
+C AMD bd2	 5.03	 5.63
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 ?
+C AMD bobcat	 6.20
+C AMD jaguar	 5.57	 6.56
+C Intel P4	14.9	17.1
+C Intel core2	 5.15
+C Intel NHM	 4.93
+C Intel SBR	 3.95
+C Intel IBR	 3.75
+C Intel HWL	 3.62
+C Intel BWL	 2.53
+C Intel SKL	 2.53
+C Intel atom	21.3
+C Intel SLM	 9.0
+C VIA nano	 5.0
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * The loop is great, but the prologue and epilogue code was quickly written.
+C    Tune it!
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vl',      `%rcx')   C r9
+
+define(`n',       `%r11')
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',  `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(`	define(`up', ``%rsi'')	') dnl
+IFDOS(`	define(`rp', ``%rcx'')	') dnl
+IFDOS(`	define(`vl', ``%r9'')	') dnl
+IFDOS(`	define(`r9', ``rdi'')	') dnl
+IFDOS(`	define(`n',  ``%r8'')	') dnl
+IFDOS(`	define(`r8', ``r11'')	') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
+	mov	(up), %rax		C read first u limb early
+	push	%rbx
+IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
+IFDOS(`	mov	n, %rbx         ')
+	mul	vl
+IFSTD(`	mov	%rbx, n         ')
+
+	and	$3, R32(%rbx)
+	jz	L(b0)
+	cmp	$2, R32(%rbx)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	dec	n
+	jne	L(gt1)
+	ADDSUB	%rax, (rp)
+	jmp	L(ret)
+L(gt1):	lea	8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r9
+	mov	(up,n,8), %rax
+	mov	%rdx, %r8
+	jmp	L(L1)
+
+L(b0):	lea	(up,n,8), up
+	lea	-16(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	mov	%rax, %r8
+	mov	%rdx, %rbx
+	jmp	 L(L0)
+
+L(b3):	lea	-8(up,n,8), up
+	lea	-24(rp,n,8), rp
+	neg	n
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	jmp	L(L3)
+
+L(b2):	lea	-16(up,n,8), up
+	lea	-32(rp,n,8), rp
+	neg	n
+	xor	%r8, %r8
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%rdx, %r9
+	jmp	L(L2)
+
+	ALIGN(16)
+L(top):	ADDSUB	%r10, (rp,n,8)
+	adc	%rax, %r9
+	mov	(up,n,8), %rax
+	adc	%rdx, %r8
+	mov	$0, R32(%r10)
+L(L1):	mul	vl
+	ADDSUB	%r9, 8(rp,n,8)
+	adc	%rax, %r8
+	adc	%rdx, %rbx
+L(L0):	mov	8(up,n,8), %rax
+	mul	vl
+	ADDSUB	%r8, 16(rp,n,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+L(L3):	mov	16(up,n,8), %rax
+	mul	vl
+	ADDSUB	%rbx, 24(rp,n,8)
+	mov	$0, R32(%r8)		C zero
+	mov	%r8, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%r8, %r9		C zero
+	adc	%rdx, %r9
+L(L2):	mul	vl
+	add	$4, n
+	js	 L(top)
+
+	ADDSUB	%r10, (rp,n,8)
+	adc	%rax, %r9
+	adc	%r8, %rdx
+	ADDSUB	%r9, 8(rp,n,8)
+L(ret):	adc	$0, %rdx
+	mov	%rdx, %rax
+
+	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/addmul_2.asm b/third_party/gmp/mpn/x86_64/atom/addmul_2.asm
new file mode 100644
index 0000000..c1dcdc4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/addmul_2.asm
@@ -0,0 +1,186 @@
+dnl  AMD64 mpn_addmul_2 optimised for Intel Atom.
+
+dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb	best
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom	18.8		this
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %rax
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	n_param, n
+	mul	v0
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	mov	%rax, w0
+	mov	(up), %rax
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	lea	-8(rp), rp
+	jmp	L(lo0)
+
+L(b10):	mov	%rax, w2
+	mov	(up), %rax
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	lea	-16(up), up
+	lea	-24(rp), rp
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
+
+L(b01):	mov	%rax, w3
+	mov	%rdx, w0
+	mov	(up), %rax
+	xor	R32(w1), R32(w1)
+	lea	8(up), up
+	dec	n
+	jmp	L(lo1)
+
+L(b11):	mov	%rax, w1
+	mov	(up), %rax
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	lea	-8(up), up
+	lea	-16(rp), rp
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):
+L(lo1):	mul	v1
+	add	w3, (rp)
+	mov	$0, R32(w2)
+	adc	%rax, w0
+	mov	(up), %rax
+	adc	%rdx, w1
+	mul	v0
+	add	%rax, w0
+	mov	(up), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(lo0):	mul	v1
+	add	w0, 8(rp)
+	adc	%rax, w1
+	mov	8(up), %rax
+	mov	$0, R32(w3)
+	adc	%rdx, w2
+	mul	v0
+	add	%rax, w1
+	mov	8(up), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(lo3):	mul	v1
+	add	w1, 16(rp)
+	adc	%rax, w2
+	mov	16(up), %rax
+	mov	$0, R32(w0)
+	adc	%rdx, w3
+	mul	v0
+	add	%rax, w2
+	mov	16(up), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+L(lo2):	mul	v1
+	add	w2, 24(rp)
+	adc	%rax, w3
+	mov	24(up), %rax
+	adc	%rdx, w0
+	mov	$0, R32(w1)
+	lea	32(rp), rp
+	mul	v0
+	lea	32(up), up
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	-8(up), %rax
+	adc	$0, R32(w1)
+	sub	$4, n
+	ja	L(top)
+
+L(end):	mul	v1
+	add	w3, (rp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(rp)
+	mov	w1, %rax
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..f44de19
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/aorrlsh1_n.asm
@@ -0,0 +1,238 @@
+dnl  AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
+dnl  Used also for AMD bd1.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * This code is slightly large at 433 bytes.
+C  * sublsh1_n.asm and this file use the same basic pattern.
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 2.3
+C AMD bobcat	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 4.875	(4.75 is probably possible)
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp',       `%rdx')
+define(`n',        `%rcx')
+define(`cy',       `%r8')
+
+ifdef(`OPERATION_addlsh1_n', `
+  define(ADDSUB,	add)
+  define(ADCSBB,	adc)
+  define(func_n,	mpn_addlsh1_n)
+  define(func_nc,	mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+  define(ADDSUB,	sub)
+  define(ADCSBB,	sbb)
+  define(func_n,	mpn_rsblsh1_n)
+  define(func_nc,	mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbp
+	xor	R32(%rbp), R32(%rbp)
+L(ent):	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jz	L(b0)
+	cmp	$2, R32(%rax)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	mov	(vp), %r8
+	add	%r8, %r8
+	lea	8(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	8(up), up
+	lea	8(rp), rp
+	jmp	L(b0)
+
+L(b2):	mov	(vp), %r8
+	add	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	lea	16(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	16(up), up
+	lea	16(rp), rp
+	jmp	L(b0)
+
+L(b3):	mov	(vp), %r8
+	add	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	mov	16(vp), %r10
+	adc	%r10, %r10
+	lea	24(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	ADCSBB	16(up), %r10
+	mov	%r10, 16(rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	24(up), up
+	lea	24(rp), rp
+
+L(b0):	test	$4, R8(n)
+	jz	L(skp)
+	add	R32(%rax), R32(%rax)	C restore scy
+	mov	(vp), %r8
+	adc	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	mov	16(vp), %r10
+	adc	%r10, %r10
+	mov	24(vp), %r11
+	adc	%r11, %r11
+	lea	32(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	ADCSBB	16(up), %r10
+	mov	%r10, 16(rp)
+	ADCSBB	24(up), %r11
+	mov	%r11, 24(rp)
+	lea	32(up), up
+	lea	32(rp), rp
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+
+L(skp):	cmp	$8, n
+	jl	L(rtn)
+
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%rbx
+	lea	-64(rp), rp
+	jmp	L(x)
+
+	ALIGN(16)
+L(top):	add	R32(%rax), R32(%rax)	C restore scy
+	lea	64(rp), rp
+	mov	(vp), %r8
+	adc	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	mov	16(vp), %r10
+	adc	%r10, %r10
+	mov	24(vp), %r11
+	adc	%r11, %r11
+	mov	32(vp), %r12
+	adc	%r12, %r12
+	mov	40(vp), %r13
+	adc	%r13, %r13
+	mov	48(vp), %r14
+	adc	%r14, %r14
+	mov	56(vp), %rbx
+	adc	%rbx, %rbx
+	lea	64(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	ADCSBB	(up), %r8
+	mov	%r8, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	ADCSBB	16(up), %r10
+	mov	%r10, 16(rp)
+	ADCSBB	24(up), %r11
+	mov	%r11, 24(rp)
+	ADCSBB	32(up), %r12
+	mov	%r12, 32(rp)
+	ADCSBB	40(up), %r13
+	mov	%r13, 40(rp)
+	ADCSBB	48(up), %r14
+	mov	%r14, 48(rp)
+	ADCSBB	56(up), %rbx
+	mov	%rbx, 56(rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	64(up), up
+L(x):	sub	$8, n
+	jge	L(top)
+
+L(end):	pop	%rbx
+	pop	%r14
+	pop	%r13
+	pop	%r12
+L(rtn):
+ifdef(`OPERATION_addlsh1_n',`
+	add	R32(%rbp), R32(%rax)
+	neg	R32(%rax)')
+ifdef(`OPERATION_rsblsh1_n',`
+	sub	R32(%rax), R32(%rbp)
+	movslq	R32(%rbp), %rax')
+
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbp
+	neg	%r8			C set CF
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/aorrlsh2_n.asm b/third_party/gmp/mpn/x86_64/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..02fb29d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/aorrlsh2_n.asm
@@ -0,0 +1,191 @@
+dnl  AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+dnl  Optimised for Intel Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 5.75
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp',       `%rdx')
+define(`n',        `%rcx')
+
+define(`LSH', 2)
+define(`RSH', 62)
+define(M, eval(m4_lshift(1,LSH)))
+
+ifdef(`OPERATION_addlsh2_n', `
+  define(ADDSUB,	add)
+  define(ADCSBB,	adc)
+  define(func_n,	mpn_addlsh2_n)
+  define(func_nc,	mpn_addlsh2_nc)')
+ifdef(`OPERATION_rsblsh2_n', `
+  define(ADDSUB,	sub)
+  define(ADCSBB,	sbb)
+  define(func_n,	mpn_rsblsh2_n)
+  define(func_nc,	mpn_rsblsh2_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jz	L(b0)			C we rely on rax = 0 at target
+	cmp	$2, R32(%rax)
+	mov	$0, R32(%rax)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	mov	(vp), %r9
+	lea	(%rax,%r9,M), %rbp
+	shr	$RSH, %r9
+	sub	$1, n
+	lea	-8(up), up
+	lea	-8(rp), rp
+	jz	L(cj1)
+	mov	8(vp), %r10
+	lea	(%r9,%r10,M), %r9
+	shr	$RSH, %r10
+	mov	16(vp), %r11
+	lea	24(vp), vp
+	mov	(vp), %r8
+	lea	(%r10,%r11,M), %r10
+	shr	$RSH, %r11
+	add	R32(%rax), R32(%rax)
+	jmp	L(L1)
+
+L(b2):	lea	-32(rp), rp
+	mov	(vp), %r8
+	lea	-32(up), up
+	lea	(%rax,%r8,M), %rbx
+	shr	$RSH, %r8
+	mov	8(vp), %r9
+	sub	$2, n
+	jle	L(end)
+	jmp	L(top)
+
+L(b3):	lea	-24(up), up
+	mov	(vp), %r11
+	lea	-24(rp), rp
+	mov	8(vp), %r8
+	lea	(%rax,%r11,M), %r10
+	shr	$RSH, %r11
+	lea	8(vp), vp
+	lea	(%r11,%r8,M), %rbx
+	add	$1, n
+	jmp	L(L3)
+
+L(b0):	lea	-16(up), up
+	mov	(vp), %r10
+	lea	(%rax,%r10,M), %r9
+	shr	$RSH, %r10
+	mov	8(vp), %r11
+	lea	-16(rp), rp
+	mov	16(vp), %r8
+	lea	(%r10,%r11,M), %r10
+	shr	$RSH, %r11
+	add	R32(%rax), R32(%rax)
+	lea	16(vp), vp
+	jmp	L(L0)
+
+	ALIGN(16)
+L(top):	lea	(%r8,%r9,M), %rbp
+	shr	$RSH, %r9
+	lea	32(up), up
+	mov	16(vp), %r10
+	lea	(%r9,%r10,M), %r9
+	shr	$RSH, %r10
+	mov	24(vp), %r11
+	lea	32(rp), rp
+	lea	32(vp), vp
+	mov	(vp), %r8
+	lea	(%r10,%r11,M), %r10
+	shr	$RSH, %r11
+	add	R32(%rax), R32(%rax)
+	ADCSBB	(up), %rbx
+	mov	%rbx, (rp)
+L(L1):	ADCSBB	8(up), %rbp
+	mov	%rbp, 8(rp)
+L(L0):	ADCSBB	16(up), %r9
+	lea	(%r11,%r8,M), %rbx
+	mov	%r9, 16(rp)
+L(L3):	ADCSBB	24(up), %r10
+	sbb	R32(%rax), R32(%rax)
+L(L2):	shr	$RSH, %r8
+	mov	8(vp), %r9
+	mov	%r10, 24(rp)
+	sub	$4, n
+	jg	L(top)
+
+L(end):	lea	(%r8,%r9,M), %rbp
+	shr	$RSH, %r9
+	lea	32(up), up
+	lea	32(rp), rp
+	add	R32(%rax), R32(%rax)
+	ADCSBB	(up), %rbx
+	mov	%rbx, (rp)
+L(cj1):	ADCSBB	8(up), %rbp
+	mov	%rbp, 8(rp)
+
+ifdef(`OPERATION_addlsh2_n',`
+	mov	R32(n), R32(%rax)	C zero rax
+	adc	%r9, %rax')
+ifdef(`OPERATION_rsblsh2_n',`
+	sbb	n, %r9			C subtract 0
+	mov	%r9, %rax')
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/aors_n.asm b/third_party/gmp/mpn/x86_64/atom/aors_n.asm
new file mode 100644
index 0000000..83b8df9
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/aors_n.asm
@@ -0,0 +1,128 @@
+dnl  X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Atom.
+
+dnl  Copyright 2011, 2017 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Marco Bodrato.  Ported to 64-bit by
+dnl  Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C AMD K8,K9	 2
+C AMD K10	 2
+C AMD bull	 2.34\2.63
+C AMD pile	 2.27\2.52
+C AMD steam
+C AMD excavator
+C AMD bobcat	 2.79
+C AMD jaguar	 2.78
+C Intel P4	11
+C Intel core2	 7.5
+C Intel NHM	 8.5
+C Intel SBR	 2.11
+C Intel IBR	 2.07
+C Intel HWL	 1.75
+C Intel BWL	 1.51
+C Intel SKL	 1.52
+C Intel atom	 3
+C Intel SLM	 4
+C VIA nano
+
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+  define(ADCSBB,    adc)
+  define(func_n,    mpn_add_n)
+  define(func_nc,   mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(ADCSBB,    sbb)
+  define(func_n,    mpn_sub_n)
+  define(func_nc,   mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	xor	cy, cy			C carry
+
+L(com):	shr	n			C n >> 1
+	jz	L(1)			C n == 1
+	jc	L(1m2)			C n % 2 == 1
+
+L(0m2):	shr	cy
+	mov	(up), %r10
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	-8(rp), rp
+	jmp	L(mid)
+
+L(1):	shr	cy
+	mov	(up), %r9
+	jmp	L(end)
+
+L(1m2):	shr	cy
+	mov	(up), %r9
+
+	ALIGN(16)
+L(top):	ADCSBB	(vp), %r9
+	lea	16(up), up
+	mov	-8(up), %r10
+	lea	16(vp), vp
+	mov	%r9, (rp)
+L(mid):	ADCSBB	-8(vp), %r10
+	lea	16(rp), rp
+	dec	n
+	mov	(up), %r9
+	mov	%r10, -8(rp)
+	jnz	L(top)
+
+L(end):	ADCSBB	(vp), %r9
+	mov	$0, R32(%rax)
+	mov	%r9, (rp)
+	adc	R32(%rax), R32(%rax)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), cy	')
+	jmp	L(com)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/atom/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/atom/aorsmul_1.asm
new file mode 100644
index 0000000..7cbc085
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/aorsmul_1.asm
@@ -0,0 +1,194 @@
+dnl  AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4.5
+C AMD K10	 4.5
+C AMD bull	 4.73
+C AMD pile	 4.60	 4.80
+C AMD steam
+C AMD excavator
+C AMD bobcat	 5.48
+C AMD jaguar	 5.61
+C Intel P4	16.6
+C Intel core2	 5.09
+C Intel NHM	 4.79
+C Intel SBR	 3.88
+C Intel IBR	 3.65
+C Intel HWL	 3.53
+C Intel BWL	 2.75
+C Intel SKL	 2.76
+C Intel atom	19.4
+C Intel SLM	 8
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+
+define(`n',       `%rbx')
+
+ifdef(`OPERATION_addmul_1',`
+  define(`ADDSUB', `add')
+  define(`func',   `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+  define(`ADDSUB', `sub')
+  define(`func',   `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	mov	(up), %rax
+	lea	-8(up,n_param,8), up
+	lea	-16(rp,n_param,8), rp
+
+	test	$1, R8(n_param)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n_param)
+	jnz	L(b10)
+
+L(b00):	mov	$1, R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	%rax, %r11
+	mov	8(up,n,8), %rax
+	mov	%rdx, %r10
+	mul	v0
+	mov	%rax, %r8
+	mov	16(up,n,8), %rax
+	jmp	L(lo0)
+
+L(b10):	mov	$3, R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	%rax, %r11
+	mov	-8(up,n,8), %rax
+	mov	%rdx, %r10
+	mul	v0
+	test	n, n
+	jns	L(cj2)
+	mov	%rax, %r8
+	mov	(up,n,8), %rax
+	mov	%rdx, %r9
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n_param)
+	jnz	L(b11)
+
+L(b01):	mov	$2, R32(n)
+	sub	n_param, n
+	mul	v0
+	test	n, n
+	jns	L(cj1)
+	mov	%rax, %r8
+	mov	(up,n,8), %rax
+	mov	%rdx, %r9
+	mul	v0
+	mov	%rax, %r11
+	mov	8(up,n,8), %rax
+	mov	%rdx, %r10
+	jmp	L(lo1)
+
+L(b11):	xor	R32(n), R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	%rax, %r8
+	mov	16(up,n,8), %rax
+	mov	%rdx, %r9
+	mul	v0
+	mov	%rax, %r11
+	mov	24(up,n,8), %rax
+	jmp	L(lo3)
+
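+C The four feed-in blocks above select on n_param mod 4 (its two low
+C bits) and enter the 4-way unrolled loop at the matching entry point
+C L(lo0)..L(lo3); n is a negative index that is stepped by 4 up to zero.
+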
+	ALIGN(16)
+L(top):	mul	v0
+	ADDSUB	%r8, -16(rp,n,8)
+	mov	%rax, %r8
+	mov	(up,n,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+L(lo2):	mul	v0
+	ADDSUB	%r11, -8(rp,n,8)
+	mov	%rax, %r11
+	mov	8(up,n,8), %rax
+	adc	%r10, %r8
+	mov	%rdx, %r10
+	adc	$0, %r9
+L(lo1):	mul	v0
+	ADDSUB	%r8, (rp,n,8)
+	mov	%rax, %r8
+	adc	%r9, %r11
+	mov	16(up,n,8), %rax
+	adc	$0, %r10
+L(lo0):	mov	%rdx, %r9
+	mul	v0
+	ADDSUB	%r11, 8(rp,n,8)
+	mov	%rax, %r11
+	adc	%r10, %r8
+	mov	24(up,n,8), %rax
+	adc	$0, %r9
+L(lo3):	add	$4, n
+	mov	%rdx, %r10
+	js	L(top)
+
+L(end):	mul	v0
+	ADDSUB	%r8, -16(rp,n,8)
+	adc	%r9, %r11
+	adc	$0, %r10
+L(cj2):	ADDSUB	%r11, -8(rp,n,8)
+	adc	%r10, %rax
+	adc	$0, %rdx
+L(cj1):	ADDSUB	%rax, (rp,n,8)
+	mov	$0, R32(%rax)
+	adc	%rdx, %rax
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/atom/cnd_add_n.asm b/third_party/gmp/mpn/x86_64/atom/cnd_add_n.asm
new file mode 100644
index 0000000..fcb9a0f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/cnd_add_n.asm
@@ -0,0 +1,38 @@
+dnl  X86-64 mpn_cnd_add_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
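+C mpn_cnd_add_n(cnd, rp, up, vp, n) adds {vp,n} to {up,n} when cnd is
+C non-zero and otherwise just copies {up,n}, executing the same
+C instruction trace either way (for side-channel silent code).  On Atom
+C this simply reuses the Sandy Bridge routine included below.
+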
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n)
+include_mpn(`x86_64/coreisbr/cnd_add_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/cnd_sub_n.asm b/third_party/gmp/mpn/x86_64/atom/cnd_sub_n.asm
new file mode 100644
index 0000000..9eee1c1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/cnd_sub_n.asm
@@ -0,0 +1,38 @@
+dnl  X86-64 mpn_cnd_sub_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
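+C mpn_cnd_sub_n is the subtraction counterpart of mpn_cnd_add_n: it
+C subtracts {vp,n} from {up,n} only when cnd is non-zero, again without
+C data-dependent branches, via the Sandy Bridge routine included below.
+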
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_cnd_sub_n)
+include_mpn(`x86_64/coreisbr/cnd_sub_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/com.asm b/third_party/gmp/mpn/x86_64/atom/com.asm
new file mode 100644
index 0000000..6b6460f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/com.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_com optimised for Intel Atom.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
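+C mpn_com stores the one's complement of {up,n} at {rp,n}.  Atom uses
+C the fast-SSE palignr variant included below.
+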
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/copyd.asm b/third_party/gmp/mpn/x86_64/atom/copyd.asm
new file mode 100644
index 0000000..e309279
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/copyd.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyd optimised for Intel Atom.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
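+C mpn_copyd copies {up,n} to {rp,n} from the high limb downwards, so it
+C is safe for overlapping operands with rp >= up.
+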
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/copyi.asm b/third_party/gmp/mpn/x86_64/atom/copyi.asm
new file mode 100644
index 0000000..00ec3c2
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/copyi.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyi optimised for Intel Atom.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
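+C mpn_copyi copies {up,n} to {rp,n} from the low limb upwards, so it is
+C safe for overlapping operands with rp <= up.
+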
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/dive_1.asm b/third_party/gmp/mpn/x86_64/atom/dive_1.asm
new file mode 100644
index 0000000..d9ba5fe
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/dive_1.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
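+C mpn_divexact_1 divides {up,n} by a single limb that is known to divide
+C it exactly, multiplying by the divisor's modular inverse instead of
+C using hardware division.
+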
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_divexact_1)
+include_mpn(`x86_64/nano/dive_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/gmp-mparam.h b/third_party/gmp/mpn/x86_64/atom/gmp-mparam.h
new file mode 100644
index 0000000..2cd90f6
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define SHLD_SLOW 1
+#define SHRD_SLOW 1
+
+/* 1600 MHz Diamondville (Atom 330) */
+/* FFT tuning limit = 50,646,641 */
+/* Generated by tuneup.c, 2019-10-16, gcc 8.3 */
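+
+/* The thresholds below are operand sizes, in limbs, at which tuneup
+   measured the named algorithm to start winning on this CPU; e.g.
+   MUL_TOOM22_THRESHOLD is where Toom-2 (Karatsuba) multiplication
+   overtakes the schoolbook basecase.  */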
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD     MP_SIZE_T_MAX
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           16
+
+#define DIV_1_VS_MUL_1_PERCENT             201
+
+#define MUL_TOOM22_THRESHOLD                12
+#define MUL_TOOM33_THRESHOLD                74
+#define MUL_TOOM44_THRESHOLD               106
+#define MUL_TOOM6H_THRESHOLD               155
+#define MUL_TOOM8H_THRESHOLD               212
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      77
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      72
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      58
+
+#define SQR_BASECASE_THRESHOLD               5
+#define SQR_TOOM2_THRESHOLD                 22
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                130
+#define SQR_TOOM6_THRESHOLD                159
+#define SQR_TOOM8_THRESHOLD                236
+
+#define MULMID_TOOM42_THRESHOLD             16
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD                9
+
+#define MUL_FFT_MODF_THRESHOLD             220  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    220, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {     13, 7}, {      7, 6}, {     15, 7}, {      8, 6}, \
+    {     17, 7}, {     13, 8}, {      7, 7}, {     17, 8}, \
+    {      9, 7}, {     19, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     19, 9}, {     11, 8}, \
+    {     25,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     23, 8}, {     47, 9}, \
+    {     27,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     47,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     79,10}, {     47, 9}, {     95,11}, \
+    {     31,10}, {     63, 9}, {    127, 8}, {    255,10}, \
+    {     71, 9}, {    143, 8}, {    287,10}, {     79,11}, \
+    {     47,10}, {     95, 9}, {    191,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    175, 9}, {    351,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207,11}, {    111,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319,11}, {    175,10}, {    351,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,11}, {    223,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    287,10}, {    575,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    767,12}, {    223,11}, \
+    {    447,13}, {    127,12}, {    255,11}, {    511,12}, \
+    {    287,11}, {    575,12}, {    319,11}, {    639,12}, \
+    {    351,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    575,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    767,13}, \
+    {    447,14}, {    255,13}, {    511,12}, {   1023,13}, \
+    {    575,12}, {   1151,13}, {    703,14}, {    383,13}, \
+    {    831,12}, {   1663,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1151,14}, {    639,13}, \
+    {   1407,12}, {   2815,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1407,13}, {   2815,15}, {    767,14}, {   1791,16}, \
+    {    511,15}, {   1023,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,15}, {   1535,14}, {  16384,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 169
+#define MUL_FFT_THRESHOLD                 2240
+
+#define SQR_FFT_MODF_THRESHOLD             184  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    184, 5}, {     11, 6}, {     13, 7}, {      7, 6}, \
+    {     15, 7}, {      8, 6}, {     17, 7}, {     13, 8}, \
+    {      7, 7}, {     17, 8}, {      9, 7}, {     19, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 9}, {      7, 8}, \
+    {     19, 9}, {     11, 8}, {     25,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     23,10}, {     15, 9}, {     39,10}, {     23, 9}, \
+    {     47,11}, {     15,10}, {     31, 9}, {     63, 8}, \
+    {    127, 7}, {    255,10}, {     39, 8}, {    159,10}, \
+    {     47, 9}, {     95, 8}, {    191,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 7}, {    511,10}, \
+    {     71, 9}, {    143, 8}, {    287, 7}, {    575, 9}, \
+    {    159, 8}, {    319,11}, {     47,10}, {     95, 9}, \
+    {    191, 8}, {    383,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    143, 9}, \
+    {    287, 8}, {    575,10}, {    159, 9}, {    319, 8}, \
+    {    639,10}, {    175, 9}, {    351,11}, {     95,10}, \
+    {    191, 9}, {    383,11}, {    111,10}, {    223, 9}, \
+    {    447,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319, 9}, {    639,11}, {    175,10}, \
+    {    351,12}, {     95,11}, {    191,10}, {    383, 9}, \
+    {    767,11}, {    223,10}, {    447,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    287,10}, \
+    {    575,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    767,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    511,12}, {    287,11}, {    575,12}, {    319,11}, \
+    {    639,12}, {    351,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    447,14}, {    127,13}, {    255,12}, \
+    {    575,13}, {    319,12}, {    703,13}, {    383,12}, \
+    {    767,13}, {    447,14}, {    255,13}, {    511,12}, \
+    {   1023,13}, {    575,12}, {   1151,13}, {    703,14}, \
+    {    383,13}, {    831,12}, {   1663,15}, {    255,14}, \
+    {    511,13}, {   1151,14}, {    639,13}, {   1407,12}, \
+    {   2815,14}, {    767,13}, {   1663,14}, {    895,13}, \
+    {   1791,15}, {    511,14}, {   1023,13}, {   2047,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1407,13}, \
+    {   2815,15}, {    767,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2943,15}, {   1535,14}, {  16384,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 172
+#define SQR_FFT_THRESHOLD                 1728
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  33
+#define MULLO_MUL_N_THRESHOLD             4392
+#define SQRLO_BASECASE_THRESHOLD             0  /* always */
+#define SQRLO_DC_THRESHOLD                  85
+#define SQRLO_SQR_THRESHOLD               3176
+
+#define DC_DIV_QR_THRESHOLD                 34
+#define DC_DIVAPPR_Q_THRESHOLD             119
+#define DC_BDIV_QR_THRESHOLD                31
+#define DC_BDIV_Q_THRESHOLD                 76
+
+#define INV_MULMOD_BNM1_THRESHOLD           22
+#define INV_NEWTON_THRESHOLD               149
+#define INV_APPR_THRESHOLD                 123
+
+#define BINV_NEWTON_THRESHOLD              179
+#define REDC_1_TO_REDC_2_THRESHOLD          24
+#define REDC_2_TO_REDC_N_THRESHOLD          39
+
+#define MU_DIV_QR_THRESHOLD                807
+#define MU_DIVAPPR_Q_THRESHOLD             807
+#define MUPI_DIV_QR_THRESHOLD               77
+#define MU_BDIV_QR_THRESHOLD               748
+#define MU_BDIV_Q_THRESHOLD                807
+
+#define POWM_SEC_TABLE  1,22,114,326,1486
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        30
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1565
+
+#define FAC_DSC_THRESHOLD                  960
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD2_DIV1_METHOD                    3  /* 5.86% faster than 4 */
+#define HGCD_THRESHOLD                      88
+#define HGCD_APPR_THRESHOLD                 88
+#define HGCD_REDUCE_THRESHOLD             1182
+#define GCD_DC_THRESHOLD                   241
+#define GCDEXT_DC_THRESHOLD                192
+#define JACOBI_BASE_METHOD                   3  /* 9.43% faster than 2 */
+
+/* Tuneup completed successfully, took 193098 seconds */
diff --git a/third_party/gmp/mpn/x86_64/atom/lshift.asm b/third_party/gmp/mpn/x86_64/atom/lshift.asm
new file mode 100644
index 0000000..1b37d5d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/lshift.asm
@@ -0,0 +1,123 @@
+dnl  AMD64 mpn_lshift -- mpn left shift, optimised for Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 4.5
+C VIA nano	 ?
+
+C TODO
+C  * Consider using 4-way unrolling.  We reach 4 c/l, but the code is 2.5 times
+C    larger.
+
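+C mpn_lshift shifts {up,n} left by cnt bits (1 <= cnt < 64), stores the
+C result at {rp,n}, and returns the bits shifted out of the top limb,
+C right-adjusted in the return value.
+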
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshift)
+	FUNC_ENTRY(4)
+	lea	-8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	shr	R32(n)
+	mov	(up), %rax
+	jnc	L(evn)
+
+	mov	%rax, %r11
+	shl	R8(%rcx), %r11
+	neg	R8(%rcx)
+	shr	R8(%rcx), %rax
+	test	n, n
+	jnz	L(gt1)
+	mov	%r11, (rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	mov	-8(up), %r8
+	mov	%r8, %r10
+	shr	R8(%rcx), %r8
+	jmp	L(lo1)
+
+L(evn):	mov	%rax, %r10
+	neg	R8(%rcx)
+	shr	R8(%rcx), %rax
+	mov	-8(up), %r9
+	mov	%r9, %r11
+	shr	R8(%rcx), %r9
+	neg	R8(%rcx)
+	dec	n
+	lea	8(rp), rp
+	lea	-8(up), up
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	shl	R8(%rcx), %r10
+	or	%r10, %r9
+	shl	R8(%rcx), %r11
+	neg	R8(%rcx)
+	mov	-8(up), %r8
+	mov	%r8, %r10
+	mov	%r9, -8(rp)
+	shr	R8(%rcx), %r8
+	lea	-16(rp), rp
+L(lo1):	mov	-16(up), %r9
+	or	%r11, %r8
+	mov	%r9, %r11
+	shr	R8(%rcx), %r9
+	lea	-16(up), up
+	neg	R8(%rcx)
+	mov	%r8, (rp)
+	dec	n
+	jg	L(top)
+
+L(end):	shl	R8(%rcx), %r10
+	or	%r10, %r9
+	shl	R8(%rcx), %r11
+	mov	%r9, -8(rp)
+	mov	%r11, -16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/lshiftc.asm b/third_party/gmp/mpn/x86_64/atom/lshiftc.asm
new file mode 100644
index 0000000..7385f8f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/lshiftc.asm
@@ -0,0 +1,127 @@
+dnl  AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 5
+C VIA nano	 ?
+
+C TODO
+C  * Consider using 4-way unrolling.  We reach 4.5 c/l, but the code is 2.5
+C    times larger.
+
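+C mpn_lshiftc is mpn_lshift followed by a one's complement: it stores
+C ~({up,n} << cnt) at {rp,n} and returns the (uncomplemented) bits
+C shifted out of the top limb.
+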
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+	FUNC_ENTRY(4)
+	lea	-8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	shr	R32(n)
+	mov	(up), %rax
+	jnc	L(evn)
+
+	mov	%rax, %r11
+	shl	R8(%rcx), %r11
+	neg	R8(%rcx)
+	shr	R8(%rcx), %rax
+	test	n, n
+	jnz	L(gt1)
+	not	%r11
+	mov	%r11, (rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	mov	-8(up), %r8
+	mov	%r8, %r10
+	shr	R8(%rcx), %r8
+	jmp	L(lo1)
+
+L(evn):	mov	%rax, %r10
+	neg	R8(%rcx)
+	shr	R8(%rcx), %rax
+	mov	-8(up), %r9
+	mov	%r9, %r11
+	shr	R8(%rcx), %r9
+	neg	R8(%rcx)
+	lea	8(rp), rp
+	lea	-8(up), up
+	jmp	L(lo0)
+
+C	ALIGN(16)
+L(top):	shl	R8(%rcx), %r10
+	or	%r10, %r9
+	shl	R8(%rcx), %r11
+	not	%r9
+	neg	R8(%rcx)
+	mov	-8(up), %r8
+	lea	-16(rp), rp
+	mov	%r8, %r10
+	shr	R8(%rcx), %r8
+	mov	%r9, 8(rp)
+L(lo1):	or	%r11, %r8
+	mov	-16(up), %r9
+	mov	%r9, %r11
+	shr	R8(%rcx), %r9
+	lea	-16(up), up
+	neg	R8(%rcx)
+	not	%r8
+	mov	%r8, (rp)
+L(lo0):	dec	n
+	jg	L(top)
+
+L(end):	shl	R8(%rcx), %r10
+	or	%r10, %r9
+	not	%r9
+	shl	R8(%rcx), %r11
+	not	%r11
+	mov	%r9, -8(rp)
+	mov	%r11, -16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/mul_1.asm b/third_party/gmp/mpn/x86_64/atom/mul_1.asm
new file mode 100644
index 0000000..a0dcf1e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/mul_1.asm
@@ -0,0 +1,147 @@
+dnl  AMD64 mpn_mul_1 optimised for Intel Atom.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      3.03
+C AMD K10        3.03
+C AMD bull       4.74
+C AMD pile       4.56
+C AMD steam
+C AMD excavator
+C AMD bobcat     5.56    6.04
+C AMD jaguar     5.55    5.84
+C Intel P4      13.05
+C Intel core2    4.03
+C Intel NHM      3.80
+C Intel SBR      2.75
+C Intel IBR      2.69
+C Intel HWL      2.50
+C Intel BWL      2.55
+C Intel SKL      2.57
+C Intel atom    17.3
+C Intel SLM     14.7
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+
+define(`n',       `%r11')
+
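+C mpn_mul_1 computes {rp,n} = {up,n} * v0 and returns the high limb of
+C the product; mpn_mul_1c additionally adds a carry-in limb, passed as
+C a fifth argument, into the low end of the product.
+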
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+	FUNC_ENTRY(4)
+	xor	%r8, %r8
+L(com):	mov	(up), %rax
+	lea	-16(up,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+	test	$1, R8(n_param)
+	jnz	L(bx1)
+
+L(bx0):	mov	%r8, %r9
+	test	$2, R8(n_param)
+	jnz	L(b10)
+
+L(b00):	mov	$2, R32(n)
+	sub	n_param, n
+	jmp	L(lo0)
+
+L(bx1):	test	$2, R8(n_param)
+	jnz	L(b11)
+
+L(b01):	mov	$3, R32(n)
+	sub	n_param, n
+	mul	v0
+	cmp	$2, n
+	jnz	L(lo1)
+	jmp	L(cj1)
+
+L(b11):	mov	$1, R32(n)
+	sub	n_param, n
+	jmp	L(lo3)
+
+L(b10):	xor	R32(n), R32(n)
+	sub	n_param, n
+	jmp	L(lo2)
+
+L(top):	mul	v0
+	mov	%r9, -24(rp,n,8)
+L(lo1):	xor	%r9d, %r9d
+	add	%rax, %r8
+	mov	(up,n,8), %rax
+	adc	%rdx, %r9
+	mov	%r8, -16(rp,n,8)
+L(lo0):	xor	%r8d, %r8d
+	mul	v0
+	add	%rax, %r9
+	mov	8(up,n,8), %rax
+	adc	%rdx, %r8
+	mov	%r9, -8(rp,n,8)
+L(lo3):	xor	%r9d, %r9d
+	mul	v0
+	add	%rax, %r8
+	mov	16(up,n,8), %rax
+	adc	%rdx, %r9
+	mov	%r8, (rp,n,8)
+L(lo2):	xor	%r8d, %r8d
+	mul	v0
+	add	%rax, %r9
+	mov	24(up,n,8), %rax
+	adc	%rdx, %r8
+	add	$4, n
+	js	L(top)
+
+L(end):	mul	v0
+	mov	%r9, -8(rp)
+L(cj1):	add	%rax, %r8
+	mov	$0, R32(%rax)
+	adc	%rdx, %rax
+	mov	%r8, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1c)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	jmp	L(com)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/atom/mul_2.asm b/third_party/gmp/mpn/x86_64/atom/mul_2.asm
new file mode 100644
index 0000000..4bc22cd
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/mul_2.asm
@@ -0,0 +1,190 @@
+dnl  AMD64 mpn_mul_2 optimised for Intel Atom.
+
+dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb	best
+C AMD K8,K9      5.78
+C AMD K10        5.78
+C AMD bull       9.10
+C AMD pile       9.17
+C AMD steam
+C AMD excavator
+C AMD bobcat    11.3
+C AMD jaguar    10.9
+C Intel P4      24.6
+C Intel core2    8.06
+C Intel NHM      7.65
+C Intel SBR      6.28
+C Intel IBR      6.10
+C Intel HWL      6.09
+C Intel BWL      4.73
+C Intel SKL      4.77
+C Intel atom    35.3
+C Intel SLM     25.6
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
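+C mpn_mul_2 multiplies {up,n} by the two-limb number {vp,2}, stores the
+C n+1 least significant product limbs at {rp,n+1}, and returns the most
+C significant limb.
+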
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %rax
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	n_param, n
+	mul	v0
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	mov	%rax, w0
+	mov	(up), %rax
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	lea	-8(rp), rp
+	jmp	L(lo0)
+
+L(b10):	mov	%rax, w2
+	mov	(up), %rax
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	lea	-16(up), up
+	lea	-24(rp), rp
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
+
+L(b01):	mov	%rax, w3
+	mov	%rdx, w0
+	mov	(up), %rax
+	xor	R32(w1), R32(w1)
+	lea	8(up), up
+	dec	n
+	jmp	L(lo1)
+
+L(b11):	mov	%rax, w1
+	mov	(up), %rax
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	lea	-8(up), up
+	lea	-16(rp), rp
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):
+L(lo1):	mul	v1
+	add	%rax, w0
+	mov	(up), %rax
+	mov	$0, R32(w2)
+	mov	w3, (rp)
+	adc	%rdx, w1
+	mul	v0
+	add	%rax, w0
+	mov	(up), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(lo0):	mul	v1
+	add	%rax, w1
+	mov	8(up), %rax
+	mov	w0, 8(rp)
+	adc	%rdx, w2
+	mul	v0
+	add	%rax, w1
+	mov	8(up), %rax
+	adc	%rdx, w2
+	mov	$0, R32(w3)
+	adc	$0, R32(w3)
+L(lo3):	mul	v1
+	add	%rax, w2
+	mov	16(up), %rax
+	mov	w1, 16(rp)
+	mov	$0, R32(w0)
+	adc	%rdx, w3
+	mul	v0
+	add	%rax, w2
+	mov	16(up), %rax
+	adc	%rdx, w3
+L(lo2):	mov	$0, R32(w1)
+	mov	w2, 24(rp)
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	24(up), %rax
+	lea	32(up), up
+	adc	%rdx, w0
+	mul	v0
+	lea	32(rp), rp
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	-8(up), %rax
+	adc	$0, R32(w1)
+	sub	$4, n
+	ja	L(top)
+
+L(end):	mul	v1
+	mov	w3, (rp)
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(rp)
+	mov	w1, %rax
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/popcount.asm b/third_party/gmp/mpn/x86_64/atom/popcount.asm
new file mode 100644
index 0000000..fb14dd3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/popcount.asm
@@ -0,0 +1,35 @@
+dnl  x86-64 mpn_popcount.
+
+dnl  Copyright 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
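+C mpn_popcount returns the total number of set bits in {up,n}.  On Atom
+C this reuses the Pentium-4 SSE2 implementation included below.
+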
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86_64/atom/redc_1.asm b/third_party/gmp/mpn/x86_64/atom/redc_1.asm
new file mode 100644
index 0000000..62b9a84
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/redc_1.asm
@@ -0,0 +1,579 @@
+dnl  X86-64 mpn_redc_1 optimised for Intel Atom.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 5.0
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C TODO
+C  * Micro-optimise, none performed thus far.
+C  * Consider inlining mpn_add_n.
+C  * Single out the basecases before the pushes.
+C  * Make lead-in code for the inner loops be more similar.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r14')
+define(`j',           `%r15')
+define(`mp',          `%r12')
+define(`q0',          `%r13')
+define(`w0',          `%rbp')
+define(`w1',          `%r9')
+define(`w2',          `%r10')
+define(`w3',          `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
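+C mpn_redc_1 performs Montgomery reduction: given {up,2n}, an odd
+C modulus {mp,n} and u0inv = -1/mp[0] mod 2^64, each outer pass picks
+C q0 = up[0] * u0inv so that adding q0*m cancels the low limb; the
+C final mpn_add_n folds the upper half into {rp,n} and its carry-out is
+C the return value (the caller does any conditional final subtract).
+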
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), q0
+	mov	n, j			C outer loop induction var
+	lea	(mp_param,n,8), mp
+	lea	(up,n,8), up
+	neg	n
+	imul	u0inv, q0		C first iteration q0
+
+	test	$1, R8(n)
+	jz	L(bx0)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b3)
+
+L(b1):	cmp	$-1, R32(n)
+	jz	L(n1)
+
+L(otp1):lea	1(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %rbp
+	mov	8(mp,n,8), %rax
+	mov	%rdx, %r9
+	mul	q0
+	mov	%rax, %rbx
+	mov	16(mp,n,8), %rax
+	mov	%rdx, %r10
+	mul	q0
+	add	(up,n,8), %rbp
+	mov	%rax, %rbp
+	adc	%r9, %rbx
+	mov	24(mp,n,8), %rax
+	adc	$0, %r10
+	mov	%rdx, %r9
+	mul	q0
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	mov	%rax, %r11
+	adc	%r10, %rbp
+	mov	32(mp,n,8), %rax
+	adc	$0, %r9
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e1)
+
+	ALIGNx
+L(tp1):	mul	q0
+	add	%rbp, -24(up,i,8)
+	mov	%rax, %rbp
+	mov	(mp,i,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+	mul	q0
+	add	%r11, -16(up,i,8)
+	mov	%rax, %r11
+	mov	8(mp,i,8), %rax
+	adc	%r10, %rbp
+	mov	%rdx, %r10
+	adc	$0, %r9
+	mul	q0
+	add	%rbp, -8(up,i,8)
+	mov	%rax, %rbp
+	adc	%r9, %r11
+	mov	16(mp,i,8), %rax
+	adc	$0, %r10
+	mov	%rdx, %r9
+	mul	q0
+	add	%r11, (up,i,8)
+	mov	%rax, %r11
+	adc	%r10, %rbp
+	mov	24(mp,i,8), %rax
+	adc	$0, %r9
+L(e1):	add	$4, i
+	mov	%rdx, %r10
+	js	L(tp1)
+
+L(ed1):	mul	q0
+	add	%rbp, I(-24(up),-24(up,i,8))
+	adc	%r9, %r11
+	adc	$0, %r10
+	add	%r11, I(-16(up),-16(up,i,8))
+	adc	%r10, %rax
+	adc	$0, %rdx
+	add	%rax, I(-8(up),-8(up,i,8))
+	adc	$0, %rdx
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp1)
+	jmp	L(cj)
+
+L(b3):	cmp	$-3, R32(n)
+	jz	L(n3)
+
+L(otp3):lea	3(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %rbp
+	mov	8(mp,n,8), %rax
+	mov	%rdx, %r9
+	mul	q0
+	mov	%rax, %rbx
+	mov	16(mp,n,8), %rax
+	mov	%rdx, %r10
+	mul	q0
+	add	(up,n,8), %rbp
+	mov	%rax, %rbp
+	mov	24(mp,n,8), %rax
+	adc	%r9, %rbx
+	mov	%rdx, %r9
+	adc	$0, %r10
+	mul	q0
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	mov	%rax, %r11
+	mov	32(mp,n,8), %rax
+	adc	%r10, %rbp
+	mov	%rdx, %r10
+	adc	$0, %r9
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e3)
+
+	ALIGNx
+L(tp3):	mul	q0
+	add	%rbp, -24(up,i,8)
+	mov	%rax, %rbp
+	mov	(mp,i,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+	mul	q0
+	add	%r11, -16(up,i,8)
+	mov	%rax, %r11
+	mov	8(mp,i,8), %rax
+	adc	%r10, %rbp
+	mov	%rdx, %r10
+	adc	$0, %r9
+L(e3):	mul	q0
+	add	%rbp, -8(up,i,8)
+	mov	%rax, %rbp
+	adc	%r9, %r11
+	mov	16(mp,i,8), %rax
+	adc	$0, %r10
+	mov	%rdx, %r9
+	mul	q0
+	add	%r11, (up,i,8)
+	mov	%rax, %r11
+	adc	%r10, %rbp
+	mov	24(mp,i,8), %rax
+	adc	$0, %r9
+	add	$4, i
+	mov	%rdx, %r10
+	js	L(tp3)
+
+L(ed3):	mul	q0
+	add	%rbp, I(-24(up),-24(up,i,8))
+	adc	%r9, %r11
+	adc	$0, %r10
+	add	%r11, I(-16(up),-16(up,i,8))
+	adc	%r10, %rax
+	adc	$0, %rdx
+	add	%rax, I(-8(up),-8(up,i,8))
+	adc	$0, %rdx
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp3)
+C	jmp	L(cj)
+
+L(cj):
+IFSTD(`	lea	(up,n,8), up		C param 2: up
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
+	lea	(%rdx,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	rp, %rcx	')	C param 1: rp
+
+IFSTD(`	sub	$8, %rsp	')
+IFDOS(`	sub	$40, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b2)
+
+L(b0):	cmp	$-4, R32(n)
+	jz	L(n4)
+
+L(otp0):lea	4(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %r11
+	mov	8(mp,n,8), %rax
+	mov	%rdx, %r10
+	mul	q0
+	mov	%rax, %rbx
+	mov	16(mp,n,8), %rax
+	mov	%rdx, %r9
+	mul	q0
+	add	(up,n,8), %r11
+	mov	%rax, %r11
+	adc	%r10, %rbx
+	mov	24(mp,n,8), %rax
+	adc	$0, %r9
+	mov	%rdx, %r10
+	mul	q0
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	mov	%rax, %rbp
+	mov	32(mp,n,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e0)
+
+	ALIGNx
+L(tp0):	mul	q0
+	add	%rbp, -24(up,i,8)
+	mov	%rax, %rbp
+	mov	(mp,i,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+L(e0):	mul	q0
+	add	%r11, -16(up,i,8)
+	mov	%rax, %r11
+	mov	8(mp,i,8), %rax
+	adc	%r10, %rbp
+	mov	%rdx, %r10
+	adc	$0, %r9
+	mul	q0
+	add	%rbp, -8(up,i,8)
+	mov	%rax, %rbp
+	adc	%r9, %r11
+	mov	16(mp,i,8), %rax
+	adc	$0, %r10
+	mov	%rdx, %r9
+	mul	q0
+	add	%r11, (up,i,8)
+	mov	%rax, %r11
+	adc	%r10, %rbp
+	mov	24(mp,i,8), %rax
+	adc	$0, %r9
+	add	$4, i
+	mov	%rdx, %r10
+	js	L(tp0)
+
+L(ed0):	mul	q0
+	add	%rbp, I(-24(up),-24(up,i,8))
+	adc	%r9, %r11
+	adc	$0, %r10
+	add	%r11, I(-16(up),-16(up,i,8))
+	adc	%r10, %rax
+	adc	$0, %rdx
+	add	%rax, I(-8(up),-8(up,i,8))
+	adc	$0, %rdx
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp0)
+	jmp	L(cj)
+
+L(b2):	cmp	$-2, R32(n)
+	jz	L(n2)
+
+L(otp2):lea	2(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %r11
+	mov	8(mp,n,8), %rax
+	mov	%rdx, %r10
+	mul	q0
+	mov	%rax, %rbx
+	mov	16(mp,n,8), %rax
+	mov	%rdx, %r9
+	mul	q0
+	add	(up,n,8), %r11
+	mov	%rax, %r11
+	adc	%r10, %rbx
+	mov	24(mp,n,8), %rax
+	adc	$0, %r9
+	mov	%rdx, %r10
+	mul	q0
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	mov	%rax, %rbp
+	mov	32(mp,n,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e2)
+
+	ALIGNx
+L(tp2):	mul	q0
+	add	%rbp, -24(up,i,8)
+	mov	%rax, %rbp
+	mov	(mp,i,8), %rax
+	adc	%r9, %r11
+	mov	%rdx, %r9
+	adc	$0, %r10
+	mul	q0
+	add	%r11, -16(up,i,8)
+	mov	%rax, %r11
+	mov	8(mp,i,8), %rax
+	adc	%r10, %rbp
+	mov	%rdx, %r10
+	adc	$0, %r9
+	mul	q0
+	add	%rbp, -8(up,i,8)
+	mov	%rax, %rbp
+	adc	%r9, %r11
+	mov	16(mp,i,8), %rax
+	adc	$0, %r10
+	mov	%rdx, %r9
+L(e2):	mul	q0
+	add	%r11, (up,i,8)
+	mov	%rax, %r11
+	adc	%r10, %rbp
+	mov	24(mp,i,8), %rax
+	adc	$0, %r9
+	add	$4, i
+	mov	%rdx, %r10
+	js	L(tp2)
+
+L(ed2):	mul	q0
+	add	%rbp, I(-24(up),-24(up,i,8))
+	adc	%r9, %r11
+	adc	$0, %r10
+	add	%r11, I(-16(up),-16(up,i,8))
+	adc	%r10, %rax
+	adc	$0, %rdx
+	add	%rax, I(-8(up),-8(up,i,8))
+	adc	$0, %rdx
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp2)
+	jmp	L(cj)
+
+L(n1):	mov	(mp_param), %rax
+	mul	q0
+	add	-8(up), %rax
+	adc	(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n2):	mov	(mp_param), %rax
+	mov	-16(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	-8(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, q0
+	imul	u0inv, q0		C next q0
+	mov	-16(mp), %rax
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	(up), %r14
+	mul	q0
+	add	%rax, %r14
+	adc	$0, %rdx
+	add	%r9, %r14
+	adc	$0, %rdx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	8(up), %rdx
+	mov	%r14, (rp)
+	mov	%rdx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n3):	mov	-24(mp), %rax
+	mov	-24(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	-16(mp), %rax
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	-16(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	add	%r11, %rbp
+	mov	-8(up), %r10
+	adc	$0, %r9
+	mul	q0
+	mov	%rbp, q0
+	imul	u0inv, q0		C next q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	%rbp, -16(up)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, -8(up)
+	mov	%r11, -24(up)		C up[0]
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n3)
+
+	mov	-48(up), %rdx
+	mov	-40(up), %rbx
+	xor	R32(%rax), R32(%rax)
+	add	%rbp, %rdx
+	adc	%r10, %rbx
+	adc	-8(up), %r11
+	mov	%rdx, (rp)
+	mov	%rbx, 8(rp)
+	mov	%r11, 16(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n4):	mov	-32(mp), %rax
+	mul	q0
+	mov	%rax, %r11
+	mov	-24(mp), %rax
+	mov	%rdx, %r10
+	mul	q0
+	mov	%rax, %rbx
+	mov	-16(mp), %rax
+	mov	%rdx, %r9
+	mul	q0
+	add	-32(up), %r11
+	mov	%rax, %r11
+	adc	%r10, %rbx
+	mov	-8(mp), %rax
+	adc	$0, %r9
+	mov	%rdx, %r10
+	mul	q0
+	add	-24(up), %rbx
+	mov	%rbx, -24(up)
+	adc	%r9, %r11
+	adc	$0, %r10
+	imul	u0inv, %rbx		C next q limb
+	add	%r11, -16(up)
+	adc	%r10, %rax
+	adc	$0, %rdx
+	add	%rax, -8(up)
+	adc	$0, %rdx
+	mov	%rdx, -32(up)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	dec	j
+	lea	8(up), up		C up++
+	jnz	L(n4)
+	jmp	L(cj)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/atom/rsh1aors_n.asm b/third_party/gmp/mpn/x86_64/atom/rsh1aors_n.asm
new file mode 100644
index 0000000..6f5f638
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/rsh1aors_n.asm
@@ -0,0 +1,287 @@
+dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Schedule loop less.  It is now almost surely overscheduled, resulting in
+C    large feed-in and wind-down code.
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 5.25
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func_n,	      mpn_rsh1add_n)
+	define(func_nc,	      mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func_n,	      mpn_rsh1sub_n)
+	define(func_nc,	      mpn_rsh1sub_nc)')
+
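+C mpn_rsh1add_n computes {rp,n} = ({up,n} + {vp,n}) >> 1, shifting the
+C carry out of the addition in at the top, and returns the low bit that
+C is shifted out; mpn_rsh1sub_n does the same for the difference.
+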
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), %r15
+	ADDSUB	(vp), %r15
+	sbb	R32(%rbx), R32(%rbx)
+	xor	R32(%rax), R32(%rax)
+	shr	%r15
+	adc	R32(%rax), R32(%rax)	C return value
+
+	mov	R32(n), R32(%rbp)
+	and	$3, R32(%rbp)
+	jz	L(b0)
+	cmp	$2, R32(%rbp)
+	jae	L(b23)
+
+L(b1):	dec	n
+	jnz	L(gt1)
+	shl	$63, %rbx
+	add	%rbx, %r15
+	mov	%r15, (rp)
+	jmp	L(cj1)
+L(gt1):	lea	24(up), up
+	lea	24(vp), vp
+	mov	-16(up), %r9
+	add	R32(%rbx), R32(%rbx)
+	mov	-8(up), %r10
+	lea	24(rp), rp
+	mov	(up), %r11
+	ADCSBB	-16(vp), %r9
+	ADCSBB	-8(vp), %r10
+	mov	%r15, %r12
+	ADCSBB	(vp), %r11
+	mov	%r9, %r13
+	sbb	R32(%rbx), R32(%rbx)
+	mov	%r11, %r15
+	mov	%r10, %r14
+	shl	$63, %r11
+	shl	$63, %r10
+	shl	$63, %r9
+	or	%r9, %r12
+	shr	%r13
+	mov	8(up), %r8
+	shr	%r14
+	or	%r10, %r13
+	shr	%r15
+	or	%r11, %r14
+	sub	$4, n
+	jz	L(cj5)
+L(gt5):	mov	16(up), %r9
+	add	R32(%rbx), R32(%rbx)
+	mov	24(up), %r10
+	ADCSBB	8(vp), %r8
+	mov	%r15, %rbp
+	mov	32(up), %r11
+	jmp	L(lo1)
+
+L(b23):	jnz	L(b3)
+	mov	8(up), %r8
+	sub	$2, n
+	jnz	L(gt2)
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	8(vp), %r8
+	mov	%r8, %r12
+	jmp	L(cj2)
+L(gt2):	mov	16(up), %r9
+	add	R32(%rbx), R32(%rbx)
+	mov	24(up), %r10
+	ADCSBB	8(vp), %r8
+	mov	%r15, %rbp
+	mov	32(up), %r11
+	ADCSBB	16(vp), %r9
+	lea	32(up), up
+	ADCSBB	24(vp), %r10
+	mov	%r9, %r13
+	ADCSBB	32(vp), %r11
+	mov	%r8, %r12
+	jmp	L(lo2)
+
+L(b3):	lea	40(up), up
+	lea	8(vp), vp
+	mov	%r15, %r14
+	add	R32(%rbx), R32(%rbx)
+	mov	-32(up), %r11
+	ADCSBB	0(vp), %r11
+	lea	8(rp), rp
+	sbb	R32(%rbx), R32(%rbx)
+	mov	%r11, %r15
+	shl	$63, %r11
+	mov	-24(up), %r8
+	shr	%r15
+	or	%r11, %r14
+	sub	$3, n
+	jnz	L(gt3)
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	8(vp), %r8
+	jmp	L(cj3)
+L(gt3):	mov	-16(up), %r9
+	add	R32(%rbx), R32(%rbx)
+	mov	-8(up), %r10
+	ADCSBB	8(vp), %r8
+	mov	%r15, %rbp
+	mov	(up), %r11
+	ADCSBB	16(vp), %r9
+	ADCSBB	24(vp), %r10
+	mov	%r8, %r12
+	jmp	L(lo3)
+
+L(b0):	lea	48(up), up
+	lea	16(vp), vp
+	add	R32(%rbx), R32(%rbx)
+	mov	-40(up), %r10
+	lea	16(rp), rp
+	mov	-32(up), %r11
+	ADCSBB	-8(vp), %r10
+	mov	%r15, %r13
+	ADCSBB	(vp), %r11
+	sbb	R32(%rbx), R32(%rbx)
+	mov	%r11, %r15
+	mov	%r10, %r14
+	shl	$63, %r11
+	shl	$63, %r10
+	mov	-24(up), %r8
+	shr	%r14
+	or	%r10, %r13
+	shr	%r15
+	or	%r11, %r14
+	sub	$4, n
+	jnz	L(gt4)
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	8(vp), %r8
+	jmp	L(cj4)
+L(gt4):	mov	-16(up), %r9
+	add	R32(%rbx), R32(%rbx)
+	mov	-8(up), %r10
+	ADCSBB	8(vp), %r8
+	mov	%r15, %rbp
+	mov	(up), %r11
+	ADCSBB	16(vp), %r9
+	jmp	L(lo0)
+
+	ALIGN(8)
+L(top):	mov	16(up), %r9
+	shr	%r14
+	or	%r10, %r13
+	shr	%r15
+	or	%r11, %r14
+	add	R32(%rbx), R32(%rbx)
+	mov	24(up), %r10
+	mov	%rbp, (rp)
+	ADCSBB	8(vp), %r8
+	mov	%r15, %rbp
+	lea	32(rp), rp
+	mov	32(up), %r11
+L(lo1):	ADCSBB	16(vp), %r9
+	lea	32(up), up
+	mov	%r12, -24(rp)
+L(lo0):	ADCSBB	24(vp), %r10
+	mov	%r8, %r12
+	mov	%r13, -16(rp)
+L(lo3):	ADCSBB	32(vp), %r11
+	mov	%r9, %r13
+	mov	%r14, -8(rp)
+L(lo2):	sbb	R32(%rbx), R32(%rbx)
+	shl	$63, %r8
+	mov	%r11, %r15
+	shr	%r12
+	mov	%r10, %r14
+	shl	$63, %r9
+	lea	32(vp), vp
+	shl	$63, %r10
+	or	%r8, %rbp
+	shl	$63, %r11
+	or	%r9, %r12
+	shr	%r13
+	mov	8(up), %r8
+	sub	$4, n
+	jg	L(top)
+
+L(end):	shr	%r14
+	or	%r10, %r13
+	shr	%r15
+	or	%r11, %r14
+	mov	%rbp, (rp)
+	lea	32(rp), rp
+L(cj5):	add	R32(%rbx), R32(%rbx)
+	ADCSBB	8(vp), %r8
+	mov	%r12, -24(rp)
+L(cj4):	mov	%r13, -16(rp)
+L(cj3):	mov	%r8, %r12
+	mov	%r14, -8(rp)
+L(cj2):	sbb	R32(%rbx), R32(%rbx)
+	shl	$63, %r8
+	shr	%r12
+	or	%r8, %r15
+	shl	$63, %rbx
+	add	%rbx, %r12
+	mov	%r15, (rp)
+	mov	%r12, 8(rp)
+L(cj1):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/rshift.asm b/third_party/gmp/mpn/x86_64/atom/rshift.asm
new file mode 100644
index 0000000..29c027d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/rshift.asm
@@ -0,0 +1,121 @@
+dnl  AMD64 mpn_rshift -- mpn right shift, optimised for Atom.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 4.5
+C VIA nano	 ?
+
+C TODO
+C  * Consider using 4-way unrolling.  We reach 4 c/l, but the code is 2.5 times
+C    larger.
+
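+C mpn_rshift shifts {up,n} right by cnt bits (1 <= cnt < 64), stores the
+C result at {rp,n}, and returns the bits shifted out of the low limb,
+C left-adjusted in the return value.
+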
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_rshift)
+	FUNC_ENTRY(4)
+	shr	R32(n)
+	mov	(up), %rax
+	jnc	L(evn)
+
+	mov	%rax, %r11
+	shr	R8(cnt), %r11
+	neg	R8(cnt)
+	shl	R8(cnt), %rax
+	test	n, n
+	jnz	L(gt1)
+	mov	%r11, (rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	mov	8(up), %r8
+	mov	%r8, %r10
+	shl	R8(cnt), %r8
+	jmp	L(lo1)
+
+L(evn):	mov	%rax, %r10
+	neg	R8(cnt)
+	shl	R8(cnt), %rax
+	mov	8(up), %r9
+	mov	%r9, %r11
+	shl	R8(cnt), %r9
+	neg	R8(cnt)
+	dec	n
+	lea	-8(rp), rp
+	lea	8(up), up
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	shr	R8(cnt), %r10
+	or	%r10, %r9
+	shr	R8(cnt), %r11
+	neg	R8(cnt)
+	mov	8(up), %r8
+	mov	%r8, %r10
+	mov	%r9, 8(rp)
+	shl	R8(cnt), %r8
+	lea	16(rp), rp
+L(lo1):	mov	16(up), %r9
+	or	%r11, %r8
+	mov	%r9, %r11
+	shl	R8(cnt), %r9
+	lea	16(up), up
+	neg	R8(cnt)
+	mov	%r8, (rp)
+	dec	n
+	jg	L(top)
+
+L(end):	shr	R8(cnt), %r10
+	or	%r10, %r9
+	shr	R8(cnt), %r11
+	mov	%r9, 8(rp)
+	mov	%r11, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/atom/sublsh1_n.asm b/third_party/gmp/mpn/x86_64/atom/sublsh1_n.asm
new file mode 100644
index 0000000..1306acd
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/atom/sublsh1_n.asm
@@ -0,0 +1,242 @@
+dnl  AMD64 mpn_sublsh1_n optimised for Intel Atom.
+dnl  Used also for AMD bd1.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * This code is slightly large at 501 bytes.
+C  * aorrlsh1_n.asm and this file use the same basic pattern.
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 2.3
+C AMD bobcat	 ?
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel atom	 5	(4.875 is probably possible)
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp',       `%rdx')
+define(`n',        `%rcx')
+define(`cy',       `%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sublsh1_n)
+	FUNC_ENTRY(4)
+	push	%rbp
+	push	%r15
+	xor	R32(%rbp), R32(%rbp)
+L(ent):	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jz	L(b0)
+	cmp	$2, R32(%rax)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	mov	(vp), %r8
+	add	%r8, %r8
+	lea	8(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	mov	(up), %r15
+	sbb	%r8, %r15
+	mov	%r15, (rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	8(up), up
+	lea	8(rp), rp
+	jmp	L(b0)
+
+L(b2):	mov	(vp), %r8
+	add	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	lea	16(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	mov	(up), %r15
+	sbb	%r8, %r15
+	mov	%r15, (rp)
+	mov	8(up), %r15
+	sbb	%r9, %r15
+	mov	%r15, 8(rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	16(up), up
+	lea	16(rp), rp
+	jmp	L(b0)
+
+L(b3):	mov	(vp), %r8
+	add	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	mov	16(vp), %r10
+	adc	%r10, %r10
+	lea	24(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	mov	(up), %r15
+	sbb	%r8, %r15
+	mov	%r15, (rp)
+	mov	8(up), %r15
+	sbb	%r9, %r15
+	mov	%r15, 8(rp)
+	mov	16(up), %r15
+	sbb	%r10, %r15
+	mov	%r15, 16(rp)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	lea	24(up), up
+	lea	24(rp), rp
+
+L(b0):	test	$4, R8(n)
+	jz	L(skp)
+	add	R32(%rax), R32(%rax)	C restore scy
+	mov	(vp), %r8
+	adc	%r8, %r8
+	mov	8(vp), %r9
+	adc	%r9, %r9
+	mov	16(vp), %r10
+	adc	%r10, %r10
+	mov	24(vp), %r11
+	adc	%r11, %r11
+	lea	32(vp), vp
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+	mov	(up), %r15
+	sbb	%r8, %r15
+	mov	%r15, (rp)
+	mov	8(up), %r15
+	sbb	%r9, %r15
+	mov	%r15, 8(rp)
+	mov	16(up), %r15
+	sbb	%r10, %r15
+	mov	%r15, 16(rp)
+	mov	24(up), %r15
+	sbb	%r11, %r15
+	mov	%r15, 24(rp)
+	lea	32(up), up
+	lea	32(rp), rp
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+
+L(skp):	cmp	$8, n
+	jl	L(rtn)
+
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%rbx
+	lea	-64(rp), rp
+	jmp	L(x)
+
+	ALIGN(16)
+L(top):	mov	(vp), %r8
+	add	R32(%rax), R32(%rax)
+	lea	64(vp), vp
+	adc	%r8, %r8
+	mov	-56(vp), %r9
+	adc	%r9, %r9
+	mov	-48(vp), %r10
+	adc	%r10, %r10
+	mov	-40(vp), %r11
+	adc	%r11, %r11
+	mov	-32(vp), %r12
+	adc	%r12, %r12
+	mov	-24(vp), %r13
+	adc	%r13, %r13
+	mov	-16(vp), %r14
+	adc	%r14, %r14
+	mov	-8(vp), %r15
+	adc	%r15, %r15
+	sbb	R32(%rax), R32(%rax)
+	add	R32(%rbp), R32(%rbp)
+	mov	(up), %rbp
+	lea	64(rp), rp
+	mov	8(up), %rbx
+	sbb	%r8, %rbp
+	mov	32(up), %r8
+	mov	%rbp, (rp)
+	sbb	%r9, %rbx
+	mov	16(up), %rbp
+	mov	%rbx, 8(rp)
+	sbb	%r10, %rbp
+	mov	24(up), %rbx
+	mov	%rbp, 16(rp)
+	sbb	%r11, %rbx
+	mov	%rbx, 24(rp)
+	sbb	%r12, %r8
+	mov	40(up), %r9
+	mov	%r8, 32(rp)
+	sbb	%r13, %r9
+	mov	48(up), %rbp
+	mov	%r9, 40(rp)
+	sbb	%r14, %rbp
+	mov	56(up), %rbx
+	mov	%rbp, 48(rp)
+	sbb	%r15, %rbx
+	lea	64(up), up
+	mov	%rbx, 56(rp)
+	sbb	R32(%rbp), R32(%rbp)
+L(x):	sub	$8, n
+	jge	L(top)
+
+L(end):	pop	%rbx
+	pop	%r14
+	pop	%r13
+	pop	%r12
+L(rtn):
+	add	R32(%rbp), R32(%rax)
+	neg	R32(%rax)
+
+	pop	%r15
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+PROLOGUE(mpn_sublsh1_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbp
+	push	%r15
+	neg	%r8			C set CF
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	jmp	L(ent)
+EPILOGUE()
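
Reviewer note: the routine above keeps two independent carry chains, scy for the
left shift of vp (saved and restored through sbb/add on %rax) and acy for the
subtraction (in %rbp); the final add/neg pair folds them into the return value.
The same bookkeeping in plain C, assuming 64-bit limbs and a hypothetical
ref_sublsh1_n name:

    /* {rp,n} = {up,n} - 2*{vp,n}; returns the borrow, in the range 0..2.
       mpn_sublsh1_nc (the second entry point above) additionally takes a
       borrow-in, which would seed bw below. */
    mp_limb_t ref_sublsh1_n (mp_limb_t *rp, const mp_limb_t *up,
                             const mp_limb_t *vp, size_t n)
    {
      mp_limb_t scy = 0, bw = 0;          /* shift carry, subtract borrow */
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t t = (vp[i] << 1) | scy;   /* 2*vp[i] plus prior top bit */
          scy = vp[i] >> 63;
          mp_limb_t d = up[i] - t;
          mp_limb_t b1 = d > up[i];
          mp_limb_t r = d - bw;
          bw = b1 + (r > d);
          rp[i] = r;
        }
      return scy + bw;
    }
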
diff --git a/third_party/gmp/mpn/x86_64/bd1/README b/third_party/gmp/mpn/x86_64/bd1/README
new file mode 100644
index 0000000..ccd210e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/README
@@ -0,0 +1,11 @@
+This directory contains code for AMD Bulldozer, including its Piledriver update.
+
+We currently make limited use of SIMD instructions, both via the MPN_PATH and
+via inclusion of x86_64/fastsse files.
+
+The bd1 cores share one SIMD/FPU pipeline between two integer units.  This
+probably means that an all-core GMP load (such as an HPC load) might run
+slower if there is significant SIMD dependency.
+
+We should perhaps allow a special 'bd1nosimd' pseudo cpu-name that excludes
+any SIMD code.

diff --git a/third_party/gmp/mpn/x86_64/bd1/addmul_2.asm b/third_party/gmp/mpn/x86_64/bd1/addmul_2.asm
new file mode 100644
index 0000000..b54e91a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/addmul_2.asm
@@ -0,0 +1,235 @@
+dnl  AMD64 mpn_addmul_2 optimised for AMD Bulldozer.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1	 4.2
+C AMD bd2	 4.4
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bt1
+C AMD bt2
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`n',       `%rcx')
+define(`v0',      `%rbx')
+define(`v1',      `%rbp')
+define(`X0',      `%r12')
+define(`X1',      `%r13')
+
+define(`w0',    `%r8')
+define(`w1',    `%r9')
+define(`w2',    `%r10')
+define(`w3',    `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	(up), %rax
+	mov	$0, R32(w2)		C abuse w2
+
+	lea	(up,n_param,8), up
+	lea	(rp,n_param,8), rp
+	sub	n_param, w2
+	mul	v0
+
+	test	$1, R8(w2)
+	jnz	L(bx1)
+
+L(bx0):	mov	%rdx, X0
+	mov	%rax, X1
+	test	$2, R8(w2)
+	jnz	L(b10)
+
+L(b00):	lea	(w2), n			C un = 4, 8, 12, ...
+	mov	(up,w2,8), %rax
+	mov	(rp,w2,8), w3
+	mul	v1
+	mov	%rax, w0
+	mov	8(up,w2,8), %rax
+	mov	%rdx, w1
+	jmp	L(lo0)
+
+L(b10):	lea	2(w2), n		C un = 2, 6, 10, ...
+	mov	(up,w2,8), %rax
+	mov	(rp,w2,8), w1
+	mul	v1
+	mov	%rdx, w3
+	mov	%rax, w2
+	mov	-8(up,n,8), %rax
+	test	n, n
+	jz	L(end)
+	jmp	L(top)
+
+L(bx1):	mov	%rax, X0
+	mov	%rdx, X1
+	test	$2, R8(w2)
+	jz	L(b11)
+
+L(b01):	lea	1(w2), n		C un = 1, 5, 9, ...
+	mov	(up,w2,8), %rax
+	mul	v1
+	mov	(rp,w2,8), w2
+	mov	%rdx, w0
+	mov	%rax, w3
+	jmp	L(lo1)
+
+L(b11):	lea	-1(w2), n		C un = 3, 7, 11, ...
+	mov	(up,w2,8), %rax
+	mul	v1
+	mov	(rp,w2,8), w0
+	mov	%rax, w1
+	mov	8(up,w2,8), %rax
+	mov	%rdx, w2
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):
+L(lo2):	mul	v0
+	add	w1, X1
+	mov	X1, -16(rp,n,8)
+	mov	%rdx, X1
+	adc	%rax, X0
+	adc	$0, X1
+	mov	-8(up,n,8), %rax
+	mul	v1
+	mov	-8(rp,n,8), w1
+	mov	%rdx, w0
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+L(lo1):	mov	(up,n,8), %rax
+	mul	v0
+	add	w2, X0
+	mov	X0, -8(rp,n,8)
+	mov	%rdx, X0
+	adc	%rax, X1
+	mov	(up,n,8), %rax
+	adc	$0, X0
+	mov	(rp,n,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	mov	8(up,n,8), %rax
+	mov	%rdx, w1
+	adc	$0, w1
+L(lo0):	mul	v0
+	add	w3, X1
+	mov	X1, (rp,n,8)
+	adc	%rax, X0
+	mov	8(up,n,8), %rax
+	mov	%rdx, X1
+	adc	$0, X1
+	mov	8(rp,n,8), w3
+	mul	v1
+	add	w3, w0
+	adc	%rax, w1
+	mov	16(up,n,8), %rax
+	mov	%rdx, w2
+	adc	$0, w2
+L(lo3):	mul	v0
+	add	w0, X0
+	mov	X0, 8(rp,n,8)
+	mov	%rdx, X0
+	adc	%rax, X1
+	adc	$0, X0
+	mov	16(up,n,8), %rax
+	mov	16(rp,n,8), w0
+	mul	v1
+	mov	%rdx, w3
+	add	w0, w1
+	adc	%rax, w2
+	adc	$0, w3
+	mov	24(up,n,8), %rax
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v0
+	add	w1, X1
+	mov	X1, -16(rp)
+	mov	%rdx, X1
+	adc	%rax, X0
+	adc	$0, X1
+	mov	-8(up), %rax
+	mul	v1
+	mov	-8(rp), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, %rdx
+	add	w2, X0
+	adc	$0, X1
+	mov	X0, -8(rp)
+	add	w3, X1
+	mov	X1, (rp)
+	adc	$0, %rdx
+	mov	%rdx, %rax
+
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
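
Reviewer note: mpn_addmul_2 is an internal primitive; my reading of the mpn
convention (treat it as an assumption) is that it adds {up,n} * {vp,2} into
{rp,n+1} and returns the most significant limb. A C sketch with hypothetical
ref_* names, using unsigned __int128 to model the asm's 64x64->128 multiplies:

    static mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up,
                                   size_t n, mp_limb_t v)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v + rp[i] + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }

    /* {rp,n+1} += {up,n} * {vp,2}; returns the most significant limb. */
    mp_limb_t ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up,
                            size_t n, const mp_limb_t vp[2])
    {
      mp_limb_t cy = ref_addmul_1 (rp, up, n, vp[0]);
      mp_limb_t t = rp[n] + cy;            /* fold first carry into rp[n] */
      mp_limb_t c = t < cy;
      rp[n] = t;
      return ref_addmul_1 (rp + 1, up, n, vp[1]) + c;
    }

The asm fuses the two passes into one loop so both products of each u limb are
formed while it is still live; the sketch separates them only for clarity.
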
diff --git a/third_party/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm
new file mode 100644
index 0000000..c34a5fa
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_addlsh1_n and mpn_rsblsh1_n
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
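
Reviewer note: this file is a thin wrapper; it only declares the four entry
points and pulls in the Atom implementation. The addlsh1 half of the contract,
sketched under the same assumptions as the earlier snippets (ref_addlsh1_n is a
hypothetical name; mpn_rsblsh1_n computes 2*{vp,n} - {up,n} instead):

    /* {rp,n} = {up,n} + 2*{vp,n}; returns the carry, in the range 0..2. */
    mp_limb_t ref_addlsh1_n (mp_limb_t *rp, const mp_limb_t *up,
                             const mp_limb_t *vp, size_t n)
    {
      mp_limb_t scy = 0, cy = 0;          /* shift carry, add carry */
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t t = (vp[i] << 1) | scy;   /* 2*vp[i] plus prior top bit */
          scy = vp[i] >> 63;
          mp_limb_t s = up[i] + t;
          mp_limb_t c1 = s < t;
          mp_limb_t r = s + cy;
          cy = c1 + (r < s);
          rp[i] = r;
        }
      return scy + cy;
    }
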
diff --git a/third_party/gmp/mpn/x86_64/bd1/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/bd1/aorrlsh_n.asm
new file mode 100644
index 0000000..5516c9d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl  X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/aorrlsh_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd1/aors_n.asm b/third_party/gmp/mpn/x86_64/bd1/aors_n.asm
new file mode 100644
index 0000000..143c42e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/aors_n.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_add_n, mpn_sub_n, optimised for AMD bd1.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreihwl/aors_n.asm')
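
Reviewer note: another thin wrapper, this time borrowing the coreihwl (Haswell)
add/sub loops for bd1. The underlying contract is the public one: mpn_add_n
returns the carry out, mpn_sub_n the borrow, and the _nc variants take a
carry-in. A C model of mpn_add_n (ref_add_n hypothetical, 64-bit limbs):

    /* {rp,n} = {up,n} + {vp,n}; returns the carry out (0 or 1). */
    mp_limb_t ref_add_n (mp_limb_t *rp, const mp_limb_t *up,
                         const mp_limb_t *vp, size_t n)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t t = up[i] + cy;
          mp_limb_t c1 = t < cy;          /* carry from adding cy */
          rp[i] = t + vp[i];
          cy = c1 | (rp[i] < vp[i]);      /* carry from adding vp[i] */
        }
      return cy;
    }
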
diff --git a/third_party/gmp/mpn/x86_64/bd1/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/bd1/aorsmul_1.asm
new file mode 100644
index 0000000..fc0d2fe
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      3.30    3.58
+C AMD K10        3.09
+C AMD bull       4.47    4.72
+C AMD pile       4.66
+C AMD steam
+C AMD excavator
+C AMD bobcat     6.30
+C AMD jaguar     6.29
+C Intel P4      17.3    17.8
+C Intel core2    5.13
+C Intel NHM      4.85
+C Intel SBR      3.83
+C Intel IBR      3.75
+C Intel HWL      3.45
+C Intel BWL      2.56
+C Intel SKL      2.53
+C Intel atom    20.3
+C Intel SLM      9
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+
+define(`n',       `%r11')
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',  `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(`	define(`up', ``%rsi'')	') dnl
+IFDOS(`	define(`rp', ``%rcx'')	') dnl
+IFDOS(`	define(`v0', ``%r9'')	') dnl
+IFDOS(`	define(`r9', ``rdi'')	') dnl
+IFDOS(`	define(`n',  ``%r8'')	') dnl
+IFDOS(`	define(`r8', ``r11'')	') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
+	mov	(up), %rax		C read first u limb early
+	push	%rbx
+IFSTD(`	mov	n_param, %rbx	')	C move away n from rdx, mul uses it
+IFDOS(`	mov	n, %rbx		')
+	mul	v0
+
+IFSTD(`	mov	%rbx, n		')
+
+	and	$3, R32(%rbx)
+	lea	-16(rp,n,8), rp
+	jz	L(b0)
+	cmp	$2, R32(%rbx)
+	jb	L(b1)
+	jz	L(b2)
+
+L(b3):	mov	$0, R32(%r8)
+	mov	%rax, %rbx
+	mov	$0, R32(%r9)
+	mov	8(up), %rax
+	mov	%rdx, %r10
+	lea	(up,n,8), up
+	not	n
+	jmp	L(L3)
+
+L(b0):	mov	$0, R32(%r10)
+	mov	%rax, %r8
+	mov	%rdx, %rbx
+	mov	8(up), %rax
+	lea	(up,n,8), up
+	neg	n
+	jmp	L(L0)
+
+L(b1):	cmp	$1, n
+	jz	L(n1)
+	mov	%rax, %r9
+	mov	8(up), %rax
+	mov	%rdx, %r8
+	mov	$0, R32(%rbx)
+	lea	(up,n,8), up
+	neg	n
+	inc	n
+	jmp	L(L1)
+
+L(b2):	mov	$0, R32(%rbx)
+	mov	%rax, %r10
+	mov	%rdx, %r9
+	mov	8(up), %rax
+	mov	$0, R32(%r8)
+	lea	(up,n,8), up
+	neg	n
+	add	$2, n
+	jns	L(end)
+
+	ALIGN(32)
+L(top):	mul	v0
+	ADDSUB	%r10, (rp,n,8)
+	adc	%rax, %r9
+	mov	(up,n,8), %rax
+	adc	%rdx, %r8
+L(L1):	mul	v0
+	mov	$0, R32(%r10)
+	ADDSUB	%r9, 8(rp,n,8)
+	adc	%rax, %r8
+	adc	%rdx, %rbx
+	mov	8(up,n,8), %rax
+L(L0):	mul	v0
+	ADDSUB	%r8, 16(rp,n,8)
+	mov	$0, R32(%r8)
+	adc	%rax, %rbx
+	mov	$0, R32(%r9)
+	mov	16(up,n,8), %rax
+	adc	%rdx, %r10
+L(L3):	mul	v0
+	ADDSUB	%rbx, 24(rp,n,8)
+	mov	$0, R32(%rbx)
+	adc	%rax, %r10
+	adc	%rdx, %r9
+	mov	24(up,n,8), %rax
+	add	$4, n
+	js	L(top)
+
+L(end):	mul	v0
+	ADDSUB	%r10, (rp)
+	adc	%r9, %rax
+	adc	%r8, %rdx
+L(n1):	ADDSUB	%rax, 8(rp)
+	adc	$0, %rdx
+	mov	%rdx, %rax
+
+	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
+	ret
+EPILOGUE()
+ASM_END()
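
Reviewer note: one body generates both entry points; ADDSUB expands to add or
sub depending on which OPERATION_* symbol is defined. The subtract flavour in C
(ref_submul_1 is a hypothetical name; the add flavour matches the ref_addmul_1
sketch given after addmul_2.asm above):

    /* {rp,n} -= {up,n} * v; returns the most significant limb of the
       product plus the propagated borrow. */
    mp_limb_t ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up,
                            size_t n, mp_limb_t v)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v;
          mp_limb_t lo = (mp_limb_t) p, hi = (mp_limb_t) (p >> 64);
          mp_limb_t t = rp[i] - cy;
          mp_limb_t b1 = t > rp[i];       /* borrow from subtracting cy */
          rp[i] = t - lo;
          cy = hi + b1 + (rp[i] > t);     /* borrow from subtracting lo */
        }
      return cy;
    }
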
diff --git a/third_party/gmp/mpn/x86_64/bd1/com.asm b/third_party/gmp/mpn/x86_64/bd1/com.asm
new file mode 100644
index 0000000..43f3561
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/com.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_com optimised for AMD bd1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd1/copyd.asm b/third_party/gmp/mpn/x86_64/bd1/copyd.asm
new file mode 100644
index 0000000..675cdc3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/copyd.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyd optimised for AMD bd1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd1/copyi.asm b/third_party/gmp/mpn/x86_64/bd1/copyi.asm
new file mode 100644
index 0000000..ceef036
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/copyi.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyi optimised for AMD bd1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd1/gcd_11.asm b/third_party/gmp/mpn/x86_64/bd1/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_11.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd1/gmp-mparam.h b/third_party/gmp/mpn/x86_64/bd1/gmp-mparam.h
new file mode 100644
index 0000000..210f382
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/gmp-mparam.h
@@ -0,0 +1,265 @@
+/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600-3800 MHz Bulldozer Zambezi */
+/* FFT tuning limit = 464,627,200 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        31
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           27
+
+#define DIV_1_VS_MUL_1_PERCENT             275
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD               161
+#define MUL_TOOM6H_THRESHOLD               226
+#define MUL_TOOM8H_THRESHOLD               339
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      61
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     108
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      91
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 85
+#define SQR_TOOM4_THRESHOLD                234
+#define SQR_TOOM6_THRESHOLD                286
+#define SQR_TOOM8_THRESHOLD                466
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             412  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    412, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     99,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {    103,12}, {     31,11}, {     63, 7}, \
+    {   1023, 8}, {    543, 9}, {    303,10}, {    167,11}, \
+    {     95,10}, {    191,12}, {     63,11}, {    127,10}, \
+    {    255,11}, {    143,10}, {    287,11}, {    159,12}, \
+    {     95,11}, {    191,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    767,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,12}, {    287,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    639,10}, {   1279,11}, {    671,12}, {    351,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    447,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    639,11}, {   1279,12}, {    671,11}, \
+    {   1343,10}, {   2687,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    799,11}, \
+    {   1599,12}, {    831,13}, {    447,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1023,11}, {   2047,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,11}, {   2687,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1599,13}, {    831,12}, {   1727,11}, {   3455,13}, \
+    {    895,15}, {    255,14}, {    511,13}, {   1023,12}, \
+    {   2047,13}, {   1087,12}, {   2175,13}, {   1215,12}, \
+    {   2431,11}, {   4863,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1471,12}, {   2943,11}, {   5887,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1727,12}, \
+    {   3455,14}, {    895,13}, {   1919,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,12}, {   5631,13}, {   2943,12}, {   5887,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1791,13}, {   3583,14}, \
+    {   1919,13}, {   3839,16}, {    511,15}, {   1023,14}, \
+    {   2175,13}, {   4479,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,12}, {  11775,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4479,15}, {   2303,14}, {   4863,15}, {   2559,14}, \
+    {   5247,15}, {   2815,14}, {   5887,13}, {  11775,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,13}, {  15359,17}, {   1023,16}, {   2047,15}, \
+    {   4351,14}, {   8959,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,15}, {   7935,17}, \
+    {   2047,16}, {   4095,15}, {   8959,16}, {   4607,15}, \
+    {   9983,14}, {  19967,16}, {   5119,15}, {  10239,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 251
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             364  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    364, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    159,11}, {     95, 7}, \
+    {   1535, 8}, {    799, 7}, {   1599, 8}, {    831, 9}, \
+    {    447,10}, {    239,11}, {    127,10}, {    255,11}, \
+    {    143,10}, {    303,11}, {    159,12}, {     95,11}, \
+    {    191,10}, {    383,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    303,12}, {    159,11}, \
+    {    351,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,12}, {    287,11}, \
+    {    575,10}, {   1151,11}, {    607,12}, {    319,11}, \
+    {    639,10}, {   1279,11}, {    671,12}, {    351,13}, \
+    {    191,12}, {    383,11}, {    767,10}, {   1535,12}, \
+    {    415,11}, {    831,12}, {    447,14}, {    127,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    543,11}, \
+    {   1087,10}, {   2175,12}, {    575,11}, {   1151,12}, \
+    {    607,13}, {    319,12}, {    639,11}, {   1279,12}, \
+    {    671,11}, {   1343,12}, {    703,11}, {   1407,12}, \
+    {    735,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    799,11}, {   1599,12}, {    831,13}, {    447,12}, \
+    {    895,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2175,13}, {    575,12}, \
+    {   1151,11}, {   2303,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1343,13}, {    703,12}, {   1407,14}, \
+    {    383,13}, {    767,12}, {   1599,11}, {   3199,13}, \
+    {    831,12}, {   1727,11}, {   3455,13}, {    895,15}, \
+    {    255,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2175,13}, {   1151,12}, {   2303,13}, \
+    {   1215,12}, {   2431,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1471,12}, {   2943,11}, {   5887,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1727,12}, \
+    {   3455,11}, {   6911,14}, {    895,13}, {   1791,12}, \
+    {   3583,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2943,12}, {   5887,11}, {  11775,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,12}, \
+    {   6911,14}, {   1791,13}, {   3583,14}, {   1919,13}, \
+    {   3839,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4351,12}, {   8703,13}, {   4479,12}, {   8959,14}, \
+    {   2303,13}, {   4607,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2815,13}, {   5631,14}, {   2943,13}, \
+    {   5887,12}, {  11775,15}, {   1535,14}, {   3455,13}, \
+    {   6911,15}, {   1791,14}, {   3839,13}, {   7679,16}, \
+    {   1023,15}, {   2047,14}, {   4351,13}, {   8703,14}, \
+    {   4479,13}, {   8959,15}, {   2303,14}, {   4991,13}, \
+    {   9983,15}, {   2559,14}, {   5119,15}, {   2815,14}, \
+    {   5887,13}, {  11775,16}, {   1535,15}, {   3071,14}, \
+    {   6143,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,13}, {  15359,17}, {   1023,16}, {   2047,15}, \
+    {   4095,14}, {   8191,15}, {   4351,14}, {   8959,15}, \
+    {   4863,14}, {   9983,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,14}, {  15359,15}, {   7935,14}, {  15871,17}, \
+    {   2047,16}, {   4095,15}, {   8959,16}, {   4607,15}, \
+    {   9983,14}, {  19967,16}, {   5119,15}, {  10239,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 275
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  23
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD               6440
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             167
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                 93
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD               197
+#define INV_APPR_THRESHOLD                 179
+
+#define BINV_NEWTON_THRESHOLD              230
+#define REDC_1_TO_REDC_2_THRESHOLD          32
+#define REDC_2_TO_REDC_N_THRESHOLD          55
+
+#define MU_DIV_QR_THRESHOLD               1387
+#define MU_DIVAPPR_Q_THRESHOLD            1387
+#define MUPI_DIV_QR_THRESHOLD               92
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1334
+
+#define POWM_SEC_TABLE  1,22,194,434,452
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               438
+#define SET_STR_PRECOMPUTE_THRESHOLD      1254
+
+#define FAC_DSC_THRESHOLD                  189
+#define FAC_ODD_THRESHOLD                   26
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD2_DIV1_METHOD                    3  /* 2.31% faster than 4 */
+#define HGCD_THRESHOLD                     104
+#define HGCD_APPR_THRESHOLD                 52
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   465
+#define GCDEXT_DC_THRESHOLD                283
+#define JACOBI_BASE_METHOD                   4  /* 5.81% faster than 1 */
+
+/* Tuneup completed successfully, took 554602 seconds */
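
Reviewer note: the thresholds above are consumed by dispatch tests throughout
mpn/generic; each value marks the operand size, in limbs, at which the next
algorithm starts to win on this CPU. A purely illustrative, standalone
dispatcher (hypothetical names; the two cutoffs are copied from the table
above, and real GMP uses its BELOW_THRESHOLD macro instead):

    #include <stdio.h>
    #include <stddef.h>

    #define MUL_TOOM22_THRESHOLD 20   /* bd1 value from the table above */
    #define MUL_TOOM33_THRESHOLD 57

    static const char *pick_mul_algorithm (size_t n)
    {
      if (n < MUL_TOOM22_THRESHOLD) return "basecase (schoolbook)";
      if (n < MUL_TOOM33_THRESHOLD) return "toom22 (Karatsuba-like)";
      return "toom33 or larger";
    }

    int main (void)
    {
      for (size_t n = 10; n <= 80; n += 35)
        printf ("%zu limbs -> %s\n", n, pick_mul_algorithm (n));
      return 0;
    }
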
diff --git a/third_party/gmp/mpn/x86_64/bd1/hamdist.asm b/third_party/gmp/mpn/x86_64/bd1/hamdist.asm
new file mode 100644
index 0000000..29e78a3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/hamdist.asm
@@ -0,0 +1,206 @@
+dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		    cycles/limb	  good for cpu?
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1	     1.51-2.0		y
+C AMD bd2	     1.50-1.9		y
+C AMD bd3		 ?
+C AMD bd4		 ?
+C AMD zen		n/a
+C AMD bobcat		n/a
+C AMD jaguar		n/a
+C Intel P4		n/a
+C Intel PNR		n/a
+C Intel NHM		n/a
+C Intel SBR		n/a
+C Intel IBR		n/a
+C Intel HWL		n/a
+C Intel BWL		n/a
+C Intel SKL		n/a
+C Intel atom		n/a
+C Intel SLM		n/a
+C VIA nano		n/a
+
+C TODO
+C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C    intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX;
+C on such systems we fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/core2/hamdist.asm')
+',`
+
+define(`up',		`%rdi')
+define(`vp',		`%rsi')
+define(`n',		`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)
+	FUNC_ENTRY(3)
+	cmp	$5, n
+	jl	L(sma)
+
+	lea	L(cnsts)(%rip), %r9
+
+	xor	R32(%r10), R32(%r10)
+	test	$8, R8(vp)
+	jz	L(ali)
+	mov	(up), %r8
+	xor	(vp), %r8
+	add	$8, up
+	add	$8, vp
+	dec	n
+	popcnt	%r8, %r10
+L(ali):
+
+ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
+	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
+	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
+	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
+	movdqa	OFF3`'(%r9), %xmm5	C masks
+	pxor	%xmm4, %xmm4
+	pxor	%xmm8, %xmm8		C grand total count
+
+	mov	R32(n), R32(%rax)
+	and	$6, R32(%rax)
+	lea	-64(up,%rax,8), up
+	lea	-64(vp,%rax,8), vp
+ifdef(`PIC',`
+	movslq	(%r9,%rax,2), %r11
+	add	%r9, %r11
+	jmp	*%r11
+',`
+	jmp	*(%r9,%rax,4)
+')
+
+L(0):	add	$64, up
+	add	$64, vp
+	sub	$2, n
+
+	ALIGN(32)
+L(top):	lddqu	(up), %xmm0
+	pxor	(vp), %xmm0
+	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
+	pand	%xmm5, %xmm0
+	pand	%xmm5, %xmm1
+	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(6):	lddqu	16(up), %xmm0
+	pxor	16(vp), %xmm0
+	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
+	pand	%xmm5, %xmm0
+	pand	%xmm5, %xmm1
+	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(4):	lddqu	32(up), %xmm0
+	pxor	32(vp), %xmm0
+	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
+	pand	%xmm5, %xmm0
+	pand	%xmm5, %xmm1
+	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
+	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+	paddb	%xmm2, %xmm3
+	paddb	%xmm2, %xmm4
+	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
+L(2):	mov	48(up), %r8
+	mov	56(up), %r9
+	add	$64, up
+	xor	48(vp), %r8
+	xor	56(vp), %r9
+	add	$64, vp
+	popcnt	%r8, %r8
+	popcnt	%r9, %r9
+	add	%r8, %r10
+	add	%r9, %r10
+	sub	$8, n
+	jg	L(top)
+
+	test	$1, R8(n)
+	jz	L(x)
+	mov	(up), %r8
+	xor	(vp), %r8
+	popcnt	%r8, %r8
+	add	%r8, %r10
+L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
+	paddq	%xmm0, %xmm8
+	pshufd	$14, %xmm8, %xmm0
+	paddq	%xmm8, %xmm0
+	movq	%xmm0, %rax
+	add	%r10, %rax
+	FUNC_EXIT()
+	ret
+
+L(sma):	mov	(up), %r8
+	xor	(vp), %r8
+	popcnt	%r8, %rax
+	dec	n
+	jz	L(ed)
+L(tp):	mov	8(up), %r8
+	add	$8, up
+	xor	8(vp), %r8
+	add	$8, vp
+	popcnt	%r8, %r8
+	add	%r8, %rax
+	dec	n
+	jnz	L(tp)
+L(ed):	FUNC_EXIT()
+	ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+	JMPENT(	L(0), L(cnsts))
+	JMPENT(	L(2), L(cnsts))
+	JMPENT(	L(4), L(cnsts))
+	JMPENT(	L(6), L(cnsts))
+	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+	.byte	-4,-4,-4,-4,-4,-4,-4,-4
+	.byte	-4,-4,-4,-4,-4,-4,-4,-4
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
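
Reviewer note: behind the table-driven XOP path and the scalar popcnt path, the
contract is simply the population count of the XOR of the two operands. In C
(ref_hamdist hypothetical, reusing the mp_limb_t typedef from the earlier
sketches; __builtin_popcountll is the GCC/Clang builtin):

    /* Hamming distance between {up,n} and {vp,n}. */
    unsigned long ref_hamdist (const mp_limb_t *up, const mp_limb_t *vp,
                               size_t n)
    {
      unsigned long cnt = 0;
      for (size_t i = 0; i < n; i++)
        cnt += (unsigned long) __builtin_popcountll (up[i] ^ vp[i]);
      return cnt;
    }
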
diff --git a/third_party/gmp/mpn/x86_64/bd1/mul_1.asm b/third_party/gmp/mpn/x86_64/bd1/mul_1.asm
new file mode 100644
index 0000000..2fb097f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/mul_1.asm
@@ -0,0 +1,193 @@
+dnl  AMD64 mpn_mul_1 optimised for AMD Bulldozer.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      3.65
+C AMD K10        3.30    3.68
+C AMD bull       4.04    4.29
+C AMD pile       4.33
+C AMD steam
+C AMD excavator
+C AMD bobcat     5.73
+C AMD jaguar     5.87
+C Intel P4      12.5
+C Intel core2    4.38
+C Intel NHM      4.28
+C Intel SBR      2.69
+C Intel IBR      2.55
+C Intel HWL      2.41
+C Intel BWL      2.49
+C Intel SKL      2.50
+C Intel atom    20.3
+C Intel SLM      7.8
+C VIA nano       4.25
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Move loop code into feed-in blocks, to save insn for zeroing regs.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+
+define(`n',       `%rbx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(`	define(`up', ``%rsi'')	') dnl
+IFDOS(`	define(`rp', ``%rcx'')	') dnl
+IFDOS(`	define(`v0', ``%r9'')	') dnl
+IFDOS(`	define(`r9', ``rdi'')	') dnl
+IFDOS(`	define(`n',  ``%r8'')	') dnl
+IFDOS(`	define(`r8', ``rbx'')	') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
+	mov	(up), %rax		C read first u limb early
+	push	%rbx
+IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
+IFDOS(`	mov	n, %r11		')
+	mul	v0
+
+IFSTD(` add	%r8, %rax	')
+IFDOS(` add	64(%rsp), %rax	')	C 40 + 3*8  (3 push insns)
+	adc	$0, %rdx
+	jmp	L(common)
+
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
+	mov	(up), %rax		C read first u limb early
+	push	%rbx
+IFSTD(`	mov	n_param, %r11	')	C move away n from rdx, mul uses it
+IFDOS(`	mov	n, %r11		')
+	mul	v0
+
+L(common):
+IFSTD(`	mov	%r11, n		')
+
+	and	$3, R32(%r11)
+	lea	-16(rp,n,8), rp
+	jz	L(b0)
+	cmp	$2, R32(%r11)
+	jb	L(b1)
+	jz	L(b2)
+
+L(b3):	mov	%rax, %r10
+	mov	%rdx, %r11
+	mov	8(up), %rax
+	mul	v0
+	lea	(up,n,8), up
+	not	n
+	jmp	L(L3)
+
+L(b0):	mov	%rax, %r9
+	mov	%rdx, %r10
+	mov	8(up), %rax
+	lea	(up,n,8), up
+	neg	n
+	jmp	L(L0)
+
+L(b1):	mov	%rax, %r8
+	cmp	$1, n
+	jz	L(n1)
+	mov	%rdx, %r9
+	lea	(up,n,8), up
+	neg	n
+	mov	%r8, 16(rp,n,8)
+	inc	n
+	jmp	L(L1)
+
+L(b2):	mov	%rax, %r11
+	mov	%rdx, %r8
+	mov	8(up), %rax
+	lea	(up,n,8), up
+	neg	n
+	add	$2, n
+	jns	L(end)
+
+	ALIGN(16)
+L(top):	mul	v0
+	mov	%rdx, %r9
+	add	%rax, %r8
+	adc	$0, %r9
+	mov	%r8, 8(rp,n,8)
+	mov	%r11, (rp,n,8)
+L(L1):	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, %r9
+	mov	%rdx, %r10
+	mov	8(up,n,8), %rax
+	adc	$0, %r10
+L(L0):	mul	v0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	mov	16(up,n,8), %rax
+	adc	$0, %r11
+	mul	v0
+	mov	%r9, 16(rp,n,8)
+L(L3):	add	%rax, %r11
+	mov	%r10, 24(rp,n,8)
+	mov	%rdx, %r8
+	adc	$0, %r8
+	add	$4, n
+	mov	-8(up,n,8), %rax
+	js	L(top)
+
+L(end):	mul	v0
+	add	%rax, %r8
+	adc	$0, %rdx
+	mov	%r11, (rp)
+L(n1):	mov	%r8, 8(rp)
+	mov	%rdx, %rax
+
+	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
+	ret
+EPILOGUE()
+ASM_END()
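
Reviewer note: the two prologues above differ only in whether a carry-in is
folded into the first product before jumping to L(common). The same structure
in C (ref_mul_1c hypothetical; unsigned __int128 models the mul instruction):

    /* {rp,n} = {up,n} * v + cin; returns the most significant limb.
       mpn_mul_1 is the cin == 0 case. */
    mp_limb_t ref_mul_1c (mp_limb_t *rp, const mp_limb_t *up,
                          size_t n, mp_limb_t v, mp_limb_t cin)
    {
      mp_limb_t cy = cin;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * v + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 64);
        }
      return cy;
    }
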
diff --git a/third_party/gmp/mpn/x86_64/bd1/mul_2.asm b/third_party/gmp/mpn/x86_64/bd1/mul_2.asm
new file mode 100644
index 0000000..85fa7aa
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/mul_2.asm
@@ -0,0 +1,195 @@
+dnl  AMD64 mpn_mul_2 optimised for AMD Bulldozer.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 6.78
+C AMD K10	 6.78
+C AMD bd1	 8.39	 8.65
+C AMD bd2	 8.47
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bt1	12.1
+C AMD bt2	11.5
+C Intel P4	24.0
+C Intel PNR	 8.14
+C Intel NHM	 7.78
+C Intel SBR	 6.34
+C Intel IBR	 6.15
+C Intel HWL	 6.04
+C Intel BWL	 4.33
+C Intel SKL	 4.41
+C Intel atom	39.5
+C Intel SLM	27.8
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %rax
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	(up,n_param,8), up
+	lea	(rp,n_param,8), rp
+
+	mov	n_param, n
+	mul	v0
+	neg	n
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	mov	%rax, w0
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	(up,n,8), %rax
+	jmp	L(lo0)
+
+L(b10):	mov	%rax, w2
+	mov	%rdx, w3
+	mov	(up,n,8), %rax
+	xor	R32(w0), R32(w0)
+	mul	v1
+	add	$-2, n
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b11)
+
+L(b01):	mov	%rax, w3
+	mov	%rdx, w0
+	mov	(up,n,8), %rax
+	mul	v1
+	xor	R32(w1), R32(w1)
+	inc	n
+	jmp	L(lo1)
+
+L(b11):	mov	%rax, w1
+	mov	%rdx, w2
+	mov	(up,n,8), %rax
+	xor	R32(w3), R32(w3)
+	dec	n
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):	mov	-8(up,n,8), %rax
+	mul	v1
+	mov	w2, -16(rp,n,8)
+L(lo1):	add	%rax, w0
+	mov	w3, -8(rp,n,8)
+	adc	%rdx, w1
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	$0, R32(w2)
+	add	%rax, w0
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mov	(up,n,8), %rax
+L(lo0):	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,n,8), %rax
+	mul	v0
+	add	%rax, w1
+	mov	w0, (rp,n,8)
+	mov	$0, R32(w3)
+	mov	8(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(lo3):	mul	v1
+	add	%rax, w2
+	mov	16(up,n,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	add	%rax, w2
+	mov	16(up,n,8), %rax
+	mov	$0, R32(w0)
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	w1, 8(rp,n,8)
+L(lo2):	add	%rax, w3
+	adc	%rdx, w0
+	mov	24(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	$0, R32(w1)
+	adc	$0, R32(w1)
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mov	-8(up), %rax
+	mul	v1
+	mov	w2, -16(rp)
+	add	%rax, w0
+	mov	w3, -8(rp)
+	adc	%rdx, w1
+	mov	w0, (rp)
+	mov	w1, %rax
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
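
Reviewer note: like addmul_2, this is an internal two-limb primitive; my reading
of the convention (an assumption) is that the full n+2 limb product
{up,n} * {vp,2} is produced, with the low n+1 limbs stored at rp and the top
limb returned. Expressed with the hypothetical ref_mul_1c and ref_addmul_1
helpers sketched earlier:

    mp_limb_t ref_mul_2 (mp_limb_t *rp, const mp_limb_t *up,
                         size_t n, const mp_limb_t vp[2])
    {
      rp[n] = ref_mul_1c (rp, up, n, vp[0], 0);     /* first partial product */
      return ref_addmul_1 (rp + 1, up, n, vp[1]);   /* second row, shifted */
    }

The asm instead interleaves both rows in a single four-way unrolled loop, which
is what keeps it near two multiplies per iteration step.
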
diff --git a/third_party/gmp/mpn/x86_64/bd1/mul_basecase.asm b/third_party/gmp/mpn/x86_64/bd1/mul_basecase.asm
new file mode 100644
index 0000000..e47ba58
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/mul_basecase.asm
@@ -0,0 +1,416 @@
+dnl  AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		mul_2		mul_3		addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull	~4.8		~4.55		-		~4.3
+C AMD pile	~4.6		~4.55		-		~4.55
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Merge bull-specific mul_1, if it is not slower in the TOOM22 range.
+C    Alternatively, we could tweak the present code (which was loopmixed for a
+C    different CPU).
+C  * Merge faster mul_2, such as the one in the same directory as this file.
+C  * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp',      `%rcx')
+define(`vn',      `%r8')
+
+define(`un',      `%rbx')
+
+define(`w0',	`%r10')
+define(`w1',	`%r11')
+define(`w2',	`%r12')
+define(`w3',	`%r13')
+define(`n',	`%rbp')
+define(`v0',	`%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx
+	push	%rbp
+	mov	un_param, un		C free up rdx
+	neg	un
+
+	mov	(up), %rax		C shared for mul_1 and mul_2
+	lea	(up,un_param,8), up	C point at operand end
+	lea	(rp,un_param,8), rp	C point at rp[un-1]
+
+	mov	(vp), v0		C shared for mul_1 and mul_2
+	mul	v0			C shared for mul_1 and mul_2
+
+	test	$1, R8(vn)
+	jz	L(do_mul_2)
+
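+C mul_1 first: dispatch on un mod 4 so the 4-way unrolled loop is
+C entered at the point that absorbs the leftover limbs.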
+L(do_mul_1):
+	test	$1, R8(un)
+	jnz	L(m1x1)
+
+L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
+	mov	%rdx, w1
+	mov	8(up,un,8), %rax
+	test	$2, R8(un)
+	jnz	L(m110)
+
+L(m100):lea	2(un), n		C un = 4, 8, 12, ...
+	jmp	L(m1l0)
+
+L(m110):lea	(un), n			C un = 2, 6, 10, ...
+	jmp	L(m1l2)
+
+L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
+	mov	%rdx, w0
+	test	$2, R8(un)
+	jz	L(m111)
+
+L(m101):lea	3(un), n		C un = 1, 5, 9, ...
+	test	n, n
+	js	L(m1l1)
+	mov	%rax, -8(rp)
+	mov	%rdx, (rp)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(m111):lea	1(un), n		C un = 3, 7, 11, ...
+	mov	8(up,un,8), %rax
+	jmp	L(m1l3)
+
+	ALIGN(16)
+L(m1tp):mov	%rdx, w0
+	add	%rax, w1
+L(m1l1):mov	-16(up,n,8), %rax
+	adc	$0, w0
+	mul	v0
+	add	%rax, w0
+	mov	w1, -24(rp,n,8)
+	mov	-8(up,n,8), %rax
+	mov	%rdx, w1
+	adc	$0, w1
+L(m1l0):mul	v0
+	mov	w0, -16(rp,n,8)
+	add	%rax, w1
+	mov	%rdx, w0
+	mov	(up,n,8), %rax
+	adc	$0, w0
+L(m1l3):mul	v0
+	mov	w1, -8(rp,n,8)
+	mov	%rdx, w1
+	add	%rax, w0
+	mov	8(up,n,8), %rax
+	adc	$0, w1
+L(m1l2):mul	v0
+	mov	w0, (rp,n,8)
+	add	$4, n
+	jnc	L(m1tp)
+
+L(m1ed):add	%rax, w1
+	adc	$0, %rdx
+	mov	w1, I(-8(rp),-24(rp,n,8))
+	mov	%rdx, I((rp),-16(rp,n,8))
+
+	dec	R32(vn)
+	jz	L(ret2)
+
+	lea	8(vp), vp
+	lea	8(rp), rp
+	push	%r12
+	push	%r13
+	push	%r14
+	jmp	L(do_addmul)
+
+L(do_mul_2):
+define(`v1',	`%r14')
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	8(vp), v1
+
+	test	$1, R8(un)
+	jnz	L(m2b1)
+
+L(m2b0):lea	(un), n
+	mov	%rax, w2		C 0
+	mov	(up,un,8), %rax
+	mov	%rdx, w1		C 1
+	mul	v1
+	mov	%rax, w0		C 1
+	mov	w2, (rp,un,8)		C 0
+	mov	8(up,un,8), %rax
+	mov	%rdx, w2		C 2
+	jmp	L(m2l0)
+
+L(m2b1):lea	1(un), n
+	mov	%rax, w0		C 1
+	mov	%rdx, w3		C 2
+	mov	(up,un,8), %rax
+	mul	v1
+	mov	w0, (rp,un,8)		C 1
+	mov	%rdx, w0		C 3
+	mov	%rax, w2		C 0
+	mov	8(up,un,8), %rax
+	jmp	L(m2l1)
+
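+C In the loop below, the single-digit comments appear to tag the
+C result column (limb offset) each value contributes to.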
+	ALIGN(32)
+L(m2tp):add	%rax, w2		C 0
+	mov	(up,n,8), %rax
+	adc	$0, w0			C 1
+L(m2l1):mul	v0
+	add	%rax, w2		C 0
+	mov	(up,n,8), %rax
+	mov	%rdx, w1		C 1
+	adc	$0, w1			C 1
+	mul	v1
+	add	w3, w2			C 0
+	adc	$0, w1			C 1
+	add	%rax, w0		C 1
+	mov	w2, (rp,n,8)		C 0
+	mov	8(up,n,8), %rax
+	mov	%rdx, w2		C 2
+	adc	$0, w2			C 2
+L(m2l0):mul	v0
+	add	%rax, w0		C 1
+	mov	%rdx, w3		C 2
+	adc	$0, w3			C 2
+	add	w1, w0			C 1
+	adc	$0, w3			C 2
+	mov	8(up,n,8), %rax
+	mul	v1
+	add	$2, n
+	mov	w0, -8(rp,n,8)		C 1
+	mov	%rdx, w0		C 3
+	jnc	L(m2tp)
+
+L(m2ed):add	%rax, w2
+	adc	$0, %rdx
+	add	w3, w2
+	adc	$0, %rdx
+	mov	w2, I((rp),(rp,n,8))
+	mov	%rdx, I(8(rp),8(rp,n,8))
+
+	add	$-2, R32(vn)
+	jz	L(ret5)
+
+	lea	16(vp), vp
+	lea	16(rp), rp
+
+
+L(do_addmul):
+	push	%r15
+	push	vn			C save vn in new stack slot
+define(`vn',	`(%rsp)')
+define(`X0',	`%r14')
+define(`X1',	`%r15')
+define(`v1',	`%r8')
+
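+C vn now lives in the stack slot pushed above, so the end-of-pass
+C `addl $-2, vn' decrements the count in memory, freeing %r8 for v1.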
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+
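+C Each outer pass folds the products of the next two v limbs into rp;
+C X0/X1 carry the high words across the unrolled inner loop.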
+	mov	(up,un,8), %rax
+	mul	v0
+
+	test	$1, R8(un)
+	jnz	L(bx1)
+
+L(bx0):	mov	%rax, X1
+	mov	(up,un,8), %rax
+	mov	%rdx, X0
+	mul	v1
+	test	$2, R8(un)
+	jnz	L(b10)
+
+L(b00):	lea	(un), n			C un = 4, 8, 12, ...
+	mov	(rp,un,8), w3
+	mov	%rax, w0
+	mov	8(up,un,8), %rax
+	mov	%rdx, w1
+	jmp	L(lo0)
+
+L(b10):	lea	2(un), n		C un = 2, 6, 10, ...
+	mov	(rp,un,8), w1
+	mov	%rdx, w3
+	mov	%rax, w2
+	mov	8(up,un,8), %rax
+	jmp	L(lo2)
+
+L(bx1):	mov	%rax, X0
+	mov	(up,un,8), %rax
+	mov	%rdx, X1
+	mul	v1
+	test	$2, R8(un)
+	jz	L(b11)
+
+L(b01):	lea	1(un), n		C un = 1, 5, 9, ...
+	mov	(rp,un,8), w2
+	mov	%rdx, w0
+	mov	%rax, w3
+	jmp	L(lo1)
+
+L(b11):	lea	-1(un), n		C un = 3, 7, 11, ...
+	mov	(rp,un,8), w0
+	mov	%rax, w1
+	mov	8(up,un,8), %rax
+	mov	%rdx, w2
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):
+L(lo2):	mul	v0
+	add	w1, X1
+	mov	X1, -16(rp,n,8)
+	mov	%rdx, X1
+	adc	%rax, X0
+	adc	$0, X1
+	mov	-8(up,n,8), %rax
+	mul	v1
+	mov	-8(rp,n,8), w1
+	mov	%rdx, w0
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+L(lo1):	mov	(up,n,8), %rax
+	mul	v0
+	add	w2, X0
+	mov	X0, -8(rp,n,8)
+	mov	%rdx, X0
+	adc	%rax, X1
+	mov	(up,n,8), %rax
+	adc	$0, X0
+	mov	(rp,n,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	mov	8(up,n,8), %rax
+	mov	%rdx, w1
+	adc	$0, w1
+L(lo0):	mul	v0
+	add	w3, X1
+	mov	X1, (rp,n,8)
+	adc	%rax, X0
+	mov	8(up,n,8), %rax
+	mov	%rdx, X1
+	adc	$0, X1
+	mov	8(rp,n,8), w3
+	mul	v1
+	add	w3, w0
+	adc	%rax, w1
+	mov	16(up,n,8), %rax
+	mov	%rdx, w2
+	adc	$0, w2
+L(lo3):	mul	v0
+	add	w0, X0
+	mov	X0, 8(rp,n,8)
+	mov	%rdx, X0
+	adc	%rax, X1
+	adc	$0, X0
+	mov	16(up,n,8), %rax
+	mov	16(rp,n,8), w0
+	mul	v1
+	mov	%rdx, w3
+	add	w0, w1
+	adc	%rax, w2
+	adc	$0, w3
+	mov	24(up,n,8), %rax
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v0
+	add	w1, X1
+	mov	X1, I(-16(rp),-16(rp,n,8))
+	mov	%rdx, X1
+	adc	%rax, X0
+	adc	$0, X1
+	mov	I(-8(up),-8(up,n,8)), %rax
+	mul	v1
+	mov	I(-8(rp),-8(rp,n,8)), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, %rdx
+	add	w2, X0
+	adc	$0, X1
+	mov	X0, I(-8(rp),-8(rp,n,8))
+	add	w3, X1
+	mov	X1, I((rp),(rp,n,8))
+	adc	$0, %rdx
+	mov	%rdx, I(8(rp),8(rp,n,8))
+
+
+	addl	$-2, vn
+	lea	16(vp), vp
+	lea	16(rp), rp
+	jnz	L(outer)
+
+	pop	%rax		C deallocate vn slot
+	pop	%r15
+L(ret5):pop	%r14
+	pop	%r13
+	pop	%r12
+L(ret2):pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/bd1/popcount.asm b/third_party/gmp/mpn/x86_64/bd1/popcount.asm
new file mode 100644
index 0000000..28ce461
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/popcount.asm
@@ -0,0 +1,191 @@
+dnl  AMD64 SSSE3/XOP mpn_popcount -- population count.
+
+dnl  Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		    cycles/limb	  good for cpu?
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1		 1.27		y
+C AMD bd2		 1.24		y
+C AMD bd3		 ?
+C AMD bd4		 1.22
+C AMD zen		n/a
+C AMD bobcat		n/a
+C AMD jaguar		n/a
+C Intel P4		n/a
+C Intel CNR		n/a
+C Intel PNR		n/a
+C Intel NHM		n/a
+C Intel SBR		n/a
+C Intel IBR		n/a
+C Intel HWL		n/a
+C Intel BWL		n/a
+C Intel SKL		n/a
+C Intel atom		n/a
+C Intel SLM		n/a
+C VIA nano		n/a
+
+C TODO
+C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C    intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/core2/popcount.asm')
+',`
+
+define(`up',		`%rdi')
+define(`n',		`%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+	FUNC_ENTRY(3)
+	lea	L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+	     `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
+	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
+	movdqa	OFF3`'(%r9), %xmm9	C masks
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5		C 0-reg
+	pxor	%xmm8, %xmm8		C grand total count
+
+	xor	R32(%rdx), R32(%rdx)
+
+	mov	R32(n), R32(%rax)
+	and	$7, R32(%rax)
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
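+C The low three bits of n pick an entry point: L(1)..L(7) dispose of
+C n mod 8 leftover limbs, then the main loop consumes 8 limbs (64
+C bytes) per iteration.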
+
+L(1):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up),%rdx
+	add	$8, up
+	dec	n
+	jnz	L(top)
+	mov	%rdx, %rax
+	FUNC_EXIT()
+	ret
+
+L(2):	add	$-48, up
+	jmp	L(e2)
+
+L(3):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
+	add	$-40, up
+	jmp	L(e2)
+
+L(4):	add	$-32, up
+	jmp	L(e4)
+
+L(5):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
+	add	$-24, up
+	jmp	L(e4)
+
+L(6):	add	$-16, up
+	jmp	L(e6)
+
+L(7):	.byte	0xf3,0x48,0x0f,0xb8,0x17	C popcnt (up), %rdx
+	add	$-8, up
+	jmp	L(e6)
+
+	ALIGN(32)
+L(top):	lddqu	(up), %xmm0
+	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1, %xmm7, %xmm7, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e6):	lddqu	16(up), %xmm0
+	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e4):	lddqu	32(up), %xmm0
+	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb %xmm6, %xmm0, %xmm1
+	pand	%xmm9, %xmm0
+	pand	%xmm9, %xmm1
+	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+	.byte	0x8f,0xe9,0x78,0xd3,0xec	C vphaddubq %xmm4, %xmm5
+	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+	paddb	%xmm2, %xmm4
+L(e2):	popcnt	48(up), %r8
+	popcnt	56(up), %r9
+	add	$64, up
+	paddq	%xmm5, %xmm8			C sum to 2 x 64-bit counts
+	add	%r8, %rdx
+	add	%r9, %rdx
+	sub	$8, n
+	jg	L(top)
+
+	.byte	0x8f,0xe9,0x78,0xd3,0xec	C vphaddubq %xmm4, %xmm5
+	paddq	%xmm5, %xmm8
+	pshufd	$14, %xmm8, %xmm0
+	paddq	%xmm8, %xmm0
+	movq	%xmm0, %rax
+	add	%rdx, %rax
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+	JMPENT(	L(top), L(cnsts))
+	JMPENT(	L(1), L(cnsts))
+	JMPENT(	L(2), L(cnsts))
+	JMPENT(	L(3), L(cnsts))
+	JMPENT(	L(4), L(cnsts))
+	JMPENT(	L(5), L(cnsts))
+	JMPENT(	L(6), L(cnsts))
+	JMPENT(	L(7), L(cnsts))
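+C Table data: nibble popcounts 0..15, per-byte shift counts of -4 (a
+C negative vpshlb count shifts right), and 0x0f masks for the nibbles.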
+	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+	.byte	-4,-4,-4,-4,-4,-4,-4,-4
+	.byte	-4,-4,-4,-4,-4,-4,-4,-4
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')
diff --git a/third_party/gmp/mpn/x86_64/bd1/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/bd1/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sec_tabselect.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd1/sublsh1_n.asm b/third_party/gmp/mpn/x86_64/bd1/sublsh1_n.asm
new file mode 100644
index 0000000..4ba673d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd1/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_sublsh1_n
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd2/gcd_11.asm b/third_party/gmp/mpn/x86_64/bd2/gcd_11.asm
new file mode 100644
index 0000000..b167077
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd2/gcd_11.asm
@@ -0,0 +1,96 @@
+dnl  AMD64 mpn_gcd_11 optimised for AMD BD2, BD3, BT2.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 5.4
+C AMD bd2	 3.72
+C AMD bd3	 ?
+C AMD bd4	 4.12
+C AMD bt1	 9.0
+C AMD bt2	 3.97
+C AMD zn1	 3.36
+C AMD zn2	 3.33
+C Intel P4	 ?
+C Intel CNR	 ?
+C Intel PNR	 ?
+C Intel NHM	 ?
+C Intel WSM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+
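+C Binary-gcd step (operands are odd here, as mpn_gcd_11 callers ensure):
+C replace (u,v) by (|u - v| >> tzcnt(u - v), min(u,v)) until the
+C difference hits zero; v is then the gcd.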
+define(`u0',    `%rdi')
+define(`v0',    `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	FUNC_ENTRY(2)
+	mov	v0, %rdx
+	sub	u0, %rdx
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	rep;bsf	%rdx, %rcx		C tzcnt!
+	mov	u0, %rax
+	sub	v0, u0			C u - v
+	cmovc	%rdx, u0		C u = |u - v|
+	cmovc	%rax, v0		C v = min(u,v)
+	shr	R8(%rcx), u0
+	mov	v0, %rdx
+	sub	u0, %rdx		C v - u
+	jnz	L(top)
+
+L(end):	mov	v0, %rax
+	C rax = result
+	C rdx = 0 for the benefit of internal gcd_22 call
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/bd2/gcd_22.asm b/third_party/gmp/mpn/x86_64/bd2/gcd_22.asm
new file mode 100644
index 0000000..a4f30ea
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd2/gcd_22.asm
@@ -0,0 +1,142 @@
+dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, tzcnt, no shlx.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	12.3
+C AMD K10	 8.0
+C AMD bd1	10.0
+C AMD bd2	 7.2
+C AMD bd3	 ?
+C AMD bd4	 6.7
+C AMD bt1	13.6
+C AMD bt2	 8.9
+C AMD zn1	 5.7
+C AMD zn2	 5.6
+C Intel P4	 ?
+C Intel CNR	 9.7
+C Intel PNR	 9.7
+C Intel NHM	 9.4
+C Intel WSM	 9.5
+C Intel SBR	10.3
+C Intel IBR	 ?
+C Intel HWL	 8.2
+C Intel BWL	 7.4
+C Intel SKL	 7.3
+C Intel atom	26.5
+C Intel SLM	17.4
+C Intel GLM	13.4
+C Intel GLM+	12.4
+C VIA nano	 ?
+
+
+define(`u1',    `%rdi')
+define(`u0',    `%rsi')
+define(`v1',    `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0',    `%rax')
+define(`cnt',   `%rcx')
+
+define(`s0',    `%r8')
+define(`s1',    `%r9')
+define(`t0',    `%r10')
+define(`t1',    `%r11')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+	FUNC_ENTRY(4)
+	mov	v0_param, v0
+
+	ALIGN(16)
+L(top):	mov	v0, t0
+	sub	u0, t0
+	jz	L(lowz)		C	jump when low limb result = 0
+	mov	v1, t1
+	sbb	u1, t1
+
+	rep;bsf	t0, cnt		C tzcnt!
+	mov	u0, s0
+	mov	u1, s1
+
+	sub	v0, u0
+	sbb	v1, u1
+
+L(bck):	cmovc	t0, u0		C u = |u - v|
+	cmovc	t1, u1		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	cmovc	s1, v1		C v = min(u,v)
+
+C Rightshift (u1,u0) into (u1,u0)
+L(shr):	shr	R8(cnt), u0
+	mov	u1, t1
+	shr	R8(cnt), u1
+	neg	cnt
+	shl	R8(cnt), t1
+	or	t1, u0
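+C (shrd is slow on these CPUs, per the header note, so the double-limb
+C shift is synthesised from shr/shl/or, with neg toggling the count
+C between cnt and 64-cnt mod 64)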
+
+	test	v1, v1
+	jnz	L(top)
+	test	u1, u1
+	jnz	L(top)
+
+L(gcd_11):
+	mov	v0, %rdi
+C	mov	u0, %rsi
+	TCALL(	mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	mov	v1, t0
+	sub	u1, t0
+	je	L(end)
+
+	xor	t1, t1
+	rep;bsf	t0, cnt		C tzcnt!
+	mov	u0, s0
+	mov	u1, s1
+	mov	u1, u0
+	xor	u1, u1
+	sub	v1, u0
+	jmp	L(bck)
+
+L(end):	C mov	v0, %rax
+	C mov	v1, %rdx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/bd2/gmp-mparam.h b/third_party/gmp/mpn/x86_64/bd2/gmp-mparam.h
new file mode 100644
index 0000000..61573ea
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd2/gmp-mparam.h
@@ -0,0 +1,263 @@
+/* AMD bd2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 4000-4200 MHz Piledriver Vishera  */
+/* FFT tuning limit = 464,626,631 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        23
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        34
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     12
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define DIV_1_VS_MUL_1_PERCENT             293
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD                57
+#define MUL_TOOM44_THRESHOLD               152
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               309
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     107
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     103
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     142
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 20
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                200
+#define SQR_TOOM6_THRESHOLD                286
+#define SQR_TOOM8_THRESHOLD                430
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             372  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    372, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     39,10}, {     23, 9}, {     55,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     83,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79, 8}, \
+    {    639, 9}, {    335,10}, {    175, 9}, {    351,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255,11}, \
+    {    143,10}, {    287,11}, {    159,12}, {     95,11}, \
+    {    191,13}, {     63,12}, {    127,11}, {    271,10}, \
+    {    543,11}, {    287,12}, {    159,11}, {    351,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,12}, {    287,11}, \
+    {    575,12}, {    319,11}, {    639,10}, {   1279,12}, \
+    {    351,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,10}, {   1663,12}, {    447,14}, \
+    {    127,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    543,11}, {   1087,10}, {   2175,12}, {    575,11}, \
+    {   1151,13}, {    319,12}, {    639,11}, {   1279,12}, \
+    {    671,11}, {   1343,10}, {   2687,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    799,11}, {   1599,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,10}, {   4863,13}, {    639,12}, {   1343,11}, \
+    {   2687,13}, {    703,12}, {   1407,11}, {   2815,14}, \
+    {    383,13}, {    767,12}, {   1599,13}, {    831,12}, \
+    {   1727,11}, {   3455,13}, {    895,15}, {    255,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1215,12}, \
+    {   2431,11}, {   4863,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1407,12}, {   2815,13}, {   1471,12}, \
+    {   2943,11}, {   5887,14}, {    767,13}, {   1599,12}, \
+    {   3199,13}, {   1727,12}, {   3455,14}, {    895,13}, \
+    {   1791,12}, {   3583,13}, {   1919,12}, {   3839,11}, \
+    {   7679,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2303,12}, {   4607,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,12}, {   5631,13}, {   2943,12}, {   5887,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1791,13}, {   3583,14}, \
+    {   1919,13}, {   3839,12}, {   7679,16}, {    511,15}, \
+    {   1023,14}, {   2175,13}, {   4479,14}, {   2303,13}, \
+    {   4607,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2815,13}, {   5631,14}, {   2943,13}, {   5887,12}, \
+    {  11775,15}, {   1535,14}, {   3455,13}, {   6911,15}, \
+    {   1791,14}, {   3839,13}, {   7679,16}, {   1023,15}, \
+    {   2047,14}, {   4479,13}, {   8959,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,13}, {  11775,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,13}, {  15359,17}, {   1023,16}, {   2047,15}, \
+    {   4351,14}, {   8959,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,15}, {   7935,14}, \
+    {  15871,17}, {   2047,16}, {   4095,15}, {   8959,16}, \
+    {   4607,15}, {   9983,14}, {  19967,16}, {   5631,15}, \
+    {  11775,17}, {   3071,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 262
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             344  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    344, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135,11}, {     79,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63, 9}, \
+    {    511,10}, {    271,11}, {    143,10}, {    303,11}, \
+    {    159,12}, {     95,11}, {    191,13}, {     63,12}, \
+    {    127,11}, {    287,10}, {    575,11}, {    303,12}, \
+    {    159,11}, {    351,12}, {    191,11}, {    383,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,12}, {    287,11}, {    575,10}, \
+    {   1151,11}, {    607,12}, {    319,11}, {    639,10}, \
+    {   1279,12}, {    351,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,10}, {   2175,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    671,11}, {   1343,10}, \
+    {   2687,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    799,11}, {   1599,12}, \
+    {    831,11}, {   1663,13}, {    447,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2175,13}, \
+    {    575,12}, {   1215,11}, {   2431,10}, {   4863,13}, \
+    {    639,12}, {   1343,11}, {   2687,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1599,13}, \
+    {    831,12}, {   1727,13}, {    895,15}, {    255,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1151,12}, \
+    {   2303,13}, {   1215,12}, {   2431,11}, {   4863,14}, \
+    {    639,13}, {   1343,12}, {   2687,13}, {   1407,12}, \
+    {   2815,13}, {   1471,12}, {   2943,11}, {   5887,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1727,12}, \
+    {   3455,14}, {    895,13}, {   1791,12}, {   3583,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2303,12}, {   4607,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2687,14}, \
+    {   1407,13}, {   2943,12}, {   5887,11}, {  11775,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1791,13}, {   3583,14}, \
+    {   1919,13}, {   3839,16}, {    511,15}, {   1023,14}, \
+    {   2175,13}, {   4479,14}, {   2303,13}, {   4607,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2815,13}, \
+    {   5631,14}, {   2943,13}, {   5887,12}, {  11775,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4479,13}, {   8959,15}, {   2303,14}, {   4863,15}, \
+    {   2815,14}, {   5887,13}, {  11775,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4351,14}, {   8959,15}, \
+    {   4863,16}, {   2559,15}, {   5887,14}, {  11775,16}, \
+    {   3071,15}, {   6911,16}, {   3583,15}, {   7679,14}, \
+    {  15359,15}, {   7935,14}, {  15871,17}, {   2047,16}, \
+    {   4095,15}, {   8959,16}, {   4607,15}, {   9983,14}, \
+    {  19967,16}, {   5119,15}, {  10239,16}, {   5631,15}, \
+    {  11775,17}, {   3071,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 254
+#define SQR_FFT_THRESHOLD                 2880
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  30
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                  53
+#define SQRLO_SQR_THRESHOLD               5724
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             159
+#define DC_BDIV_QR_THRESHOLD                44
+#define DC_BDIV_Q_THRESHOLD                 79
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               172
+#define INV_APPR_THRESHOLD                 172
+
+#define BINV_NEWTON_THRESHOLD              226
+#define REDC_1_TO_REDC_2_THRESHOLD          40
+#define REDC_2_TO_REDC_N_THRESHOLD          51
+
+#define MU_DIV_QR_THRESHOLD               1308
+#define MU_DIVAPPR_Q_THRESHOLD            1258
+#define MUPI_DIV_QR_THRESHOLD               85
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1210
+
+#define POWM_SEC_TABLE  3,16,129,523,1297
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               228
+#define SET_STR_PRECOMPUTE_THRESHOLD      1033
+
+#define FAC_DSC_THRESHOLD                  172
+#define FAC_ODD_THRESHOLD                   28
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 8.54% faster than 3 */
+#define HGCD_THRESHOLD                     108
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   393
+#define GCDEXT_DC_THRESHOLD                278
+#define JACOBI_BASE_METHOD                   4  /* 13.69% faster than 1 */
+
+/* Tuneup completed successfully, took 463931 seconds */
diff --git a/third_party/gmp/mpn/x86_64/bd4/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/bd4/aorrlsh_n.asm
new file mode 100644
index 0000000..ff0d27b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd4/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl  X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/zen/aorrlsh_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd4/gcd_11.asm b/third_party/gmp/mpn/x86_64/bd4/gcd_11.asm
new file mode 100644
index 0000000..4176b85
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd4/gcd_11.asm
@@ -0,0 +1,96 @@
+dnl  AMD64 mpn_gcd_11 optimised for AMD BD4, ZN1.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 3.73
+C AMD bt1	 -
+C AMD bt2	 -
+C AMD zn1	 3.33
+C AMD zn2	 3.48
+C Intel P4	 -
+C Intel CNR	 -
+C Intel PNR	 -
+C Intel NHM	 -
+C Intel WSM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 -
+C Intel SLM	 -
+C Intel GLM	 -
+C Intel GLM+	 -
+C VIA nano	 -
+
+define(`u0',    `%rdi')
+define(`v0',    `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	FUNC_ENTRY(2)
+	mov	u0, %rax
+	mov	v0, %rdx
+	sub	u0, %rdx		C v - u
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	rep;bsf	%rdx, %rcx		C tzcnt!
+	sub	v0, u0			C u - v
+	cmovc	%rdx, u0		C u = |u - v|
+	cmovc	%rax, v0		C v = min(u,v)
+	shrx(	%rcx, u0, %rax)
+	shrx(	%rcx, u0, u0)
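+C (both shrx targets get the shifted u: %rax feeds the v-u subtraction
+C below and the next iteration's min() cmov, saving a mov)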
+	mov	v0, %rdx
+	sub	%rax, %rdx		C v - u
+	jnz	L(top)
+
+L(end):	C rax = result
+	C rdx = 0 for the benefit of internal gcd_22 call
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/bd4/gcd_22.asm b/third_party/gmp/mpn/x86_64/bd4/gcd_22.asm
new file mode 100644
index 0000000..5dfd9e3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd4/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/coreihwl/gcd_22.asm')
diff --git a/third_party/gmp/mpn/x86_64/bd4/gmp-mparam.h b/third_party/gmp/mpn/x86_64/bd4/gmp-mparam.h
new file mode 100644
index 0000000..9d2038c
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bd4/gmp-mparam.h
@@ -0,0 +1,266 @@
+/* AMD bd4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3800-4200 MHz Excavator/Bristol Ridge  */
+/* FFT tuning limit = 461,179,335 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          6
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        17
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        52
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     13
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           25
+
+#define DIV_1_VS_MUL_1_PERCENT             298
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD                53
+#define MUL_TOOM44_THRESHOLD               142
+#define MUL_TOOM6H_THRESHOLD               206
+#define MUL_TOOM8H_THRESHOLD               292
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      83
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     102
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      98
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      82
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 20
+#define SQR_TOOM3_THRESHOLD                 71
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                298
+#define SQR_TOOM8_THRESHOLD                466
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               14
+
+#define MUL_FFT_MODF_THRESHOLD             316  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    316, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     12, 6}, \
+    {     25, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     99,10}, {     55,11}, {     31,10}, {     87,11}, \
+    {     47,10}, {     95, 9}, {    191,10}, {    103,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    135, 9}, {    271, 5}, {   4351, 6}, {   2303, 7}, \
+    {   1215, 8}, {    639,10}, {    175,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207, 9}, {    415,11}, \
+    {    111,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,10}, {    415, 9}, \
+    {    831,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    447,10}, {    895,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    607,12}, \
+    {    319,11}, {    639,12}, {    351,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    543,11}, {   1087,12}, {    607,13}, \
+    {    319,12}, {    671,11}, {   1343,10}, {   2687,12}, \
+    {    703,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    831,13}, {    447,12}, {    895,11}, {   1791,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1087,13}, \
+    {    575,12}, {   1151,11}, {   2303,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,11}, {   2687,13}, \
+    {    703,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1151,12}, \
+    {   2303,13}, {   1215,12}, {   2431,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,12}, {   2815,13}, \
+    {   1471,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1663,14}, {    895,13}, {   1791,12}, {   3583,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2303,12}, {   4607,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,15}, {    767,14}, {   1535,13}, {   3071,14}, \
+    {   1663,13}, {   3455,12}, {   6911,14}, {   1791,13}, \
+    {   3583,14}, {   1919,16}, {    511,15}, {   1023,14}, \
+    {   2303,13}, {   4607,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,15}, {   1535,14}, \
+    {   3455,13}, {   6911,15}, {   1791,14}, {   3839,13}, \
+    {   7679,16}, {   1023,15}, {   2047,14}, {   4351,15}, \
+    {   2303,14}, {   4863,15}, {   2815,14}, {   5887,16}, \
+    {   1535,15}, {   3071,14}, {   6143,15}, {   3327,14}, \
+    {   6911,15}, {   3839,14}, {   7679,17}, {   1023,16}, \
+    {   2047,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,17}, {   2047,16}, {   4095,15}, {   8191,16}, \
+    {   4607,15}, {   9983,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 253
+#define MUL_FFT_THRESHOLD                 4224
+
+#define SQR_FFT_MODF_THRESHOLD             300  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    300, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     63,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95, 9}, \
+    {    191, 8}, {    383,10}, {    103,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255, 8}, {    511, 9}, \
+    {    271, 8}, {    543,11}, {     79,10}, {    159, 9}, \
+    {    319, 8}, {    639,10}, {    175,11}, {     95,10}, \
+    {    191, 9}, {    383, 5}, {   6399, 6}, {   3327, 7}, \
+    {   1727, 6}, {   3455, 7}, {   1791,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,10}, {    351,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415, 9}, {    831,13}, {     63,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,10}, {    607,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,10}, {    895,11}, \
+    {    479,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    543,11}, {   1087,12}, {    575,11}, {   1151,12}, \
+    {    607,13}, {    319,12}, {    639,11}, {   1279,12}, \
+    {    671,11}, {   1343,12}, {    703,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1151,11}, {   2303,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1343,13}, \
+    {    703,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    895,12}, {   1791,13}, \
+    {    959,15}, {    255,14}, {    511,13}, {   1023,12}, \
+    {   2047,13}, {   1087,12}, {   2175,13}, {   1151,12}, \
+    {   2303,13}, {   1215,12}, {   2431,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,12}, {   2815,13}, \
+    {   1471,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1663,14}, {    895,13}, {   1791,12}, {   3583,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2303,12}, {   4607,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,14}, {   1791,13}, {   3583,14}, {   1919,16}, \
+    {    511,15}, {   1023,14}, {   2303,13}, {   4607,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2815,13}, \
+    {   5631,14}, {   2943,13}, {   5887,15}, {   1535,14}, \
+    {   3455,15}, {   1791,14}, {   3583,13}, {   7167,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4223,15}, {   2303,14}, {   4863,15}, {   2815,14}, \
+    {   5887,16}, {   1535,15}, {   3071,14}, {   6143,15}, \
+    {   3327,14}, {   6911,15}, {   3583,14}, {   7167,15}, \
+    {   3839,14}, {   7679,17}, {   1023,16}, {   2047,15}, \
+    {   4095,14}, {   8191,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,17}, {   2047,16}, \
+    {   4095,15}, {   8447,16}, {   4607,15}, {   9983,16}, \
+    {   5119,15}, {  10239,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 273
+#define SQR_FFT_THRESHOLD                 2752
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  43
+#define MULLO_MUL_N_THRESHOLD             8397
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                  54
+#define SQRLO_SQR_THRESHOLD               5397
+
+#define DC_DIV_QR_THRESHOLD                 39
+#define DC_DIVAPPR_Q_THRESHOLD             165
+#define DC_BDIV_QR_THRESHOLD                39
+#define DC_BDIV_Q_THRESHOLD                 76
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               177
+#define INV_APPR_THRESHOLD                 155
+
+#define BINV_NEWTON_THRESHOLD              230
+#define REDC_1_TO_REDC_2_THRESHOLD          28
+#define REDC_2_TO_REDC_N_THRESHOLD          43
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               66
+#define MU_BDIV_QR_THRESHOLD               998
+#define MU_BDIV_Q_THRESHOLD               1142
+
+#define POWM_SEC_TABLE  1,16,175,269,839,1420
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               552
+#define SET_STR_PRECOMPUTE_THRESHOLD      1038
+
+#define FAC_DSC_THRESHOLD                  151
+#define FAC_ODD_THRESHOLD                   23
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    1  /* 8.11% faster than 3 */
+#define HGCD_THRESHOLD                      87
+#define HGCD_APPR_THRESHOLD                 96
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   327
+#define GCDEXT_DC_THRESHOLD                241
+#define JACOBI_BASE_METHOD                   4  /* 21.40% faster than 1 */
+
+/* Tuneup completed successfully, took 431056 seconds */
diff --git a/third_party/gmp/mpn/x86_64/bdiv_dbm1c.asm b/third_party/gmp/mpn/x86_64/bdiv_dbm1c.asm
new file mode 100644
index 0000000..a53bd52
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bdiv_dbm1c.asm
@@ -0,0 +1,106 @@
+dnl  x86_64 mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.25
+C AMD K10	 2.25
+C Intel P4	12.5
+C Intel core2	 4
+C Intel NHM	 3.75
+C Intel SBR	 3.6
+C Intel atom	20
+C VIA nano	 4
+
+C TODO
+C  * Optimise feed-in code.
+
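+C Loop invariant: c (in %r8) holds the running value; each step forms
+C the 128-bit product u[i]*bd, subtracts its low half from c, stores
+C the result, and folds the high half plus borrow back in via sbb.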
+C INPUT PARAMETERS
+define(`qp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`n_param', `%rdx')
+define(`bd',	  `%rcx')
+define(`cy',	  `%r8')
+
+define(`n',       `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_dbm1c)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	mov	(up), %rax
+	mov	n_param, n
+	mov	R32(n_param), R32(%r11)
+	mul	bd
+	lea	(up,n,8), up
+	lea	(qp,n,8), qp
+	neg	n
+	and	$3, R32(%r11)
+	jz	L(lo0)
+	lea	-4(n,%r11), n
+	cmp	$2, R32(%r11)
+	jc	L(lo1)
+	jz	L(lo2)
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):	mov	(up,n,8), %rax
+	mul	bd
+L(lo0):	sub	%rax, %r8
+	mov	%r8, (qp,n,8)
+	sbb	%rdx, %r8
+	mov	8(up,n,8), %rax
+	mul	bd
+L(lo3):	sub	%rax, %r8
+	mov	%r8, 8(qp,n,8)
+	sbb	%rdx, %r8
+	mov	16(up,n,8), %rax
+	mul	bd
+L(lo2):	sub	%rax, %r8
+	mov	%r8, 16(qp,n,8)
+	sbb	%rdx, %r8
+	mov	24(up,n,8), %rax
+	mul	bd
+L(lo1):	sub	%rax, %r8
+	mov	%r8, 24(qp,n,8)
+	sbb	%rdx, %r8
+	add	$4, n
+	jnz	L(top)
+
+	mov	%r8, %rax
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/bdiv_q_1.asm b/third_party/gmp/mpn/x86_64/bdiv_q_1.asm
new file mode 100644
index 0000000..85538c9
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bdiv_q_1.asm
@@ -0,0 +1,195 @@
+dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Copyright 2001, 2002, 2004-2006, 2010-2012, 2017 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb    cycles/limb
+C	       norm	       unorm
+C AMD K8,K9	11		11
+C AMD K10	11		11
+C AMD bull	13.5		14
+C AMD pile	14		15
+C AMD steam
+C AMD excavator
+C AMD bobcat	14		14
+C AMD jaguar	14.5		15
+C Intel P4	33		33
+C Intel core2	13.5		13.25
+C Intel NHM	14		14
+C Intel SBR	8		8.25
+C Intel IBR	7.75		7.85
+C Intel HWL	8		8
+C Intel BWL	8		8
+C Intel SKL	8		8
+C Intel atom	34		36
+C Intel SLM	13.7		13.5
+C VIA nano	19.25		19.25	needs re-measuring
+
+C INPUT PARAMETERS
+define(`rp',		`%rdi')
+define(`up',		`%rsi')
+define(`n',		`%rdx')
+define(`d',		`%rcx')
+define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
+define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	mov	%rcx, %rax
+	xor	R32(%rcx), R32(%rcx)	C zero the shift count (ncnt)
+	mov	%rdx, %r10
+
+	bt	$0, R32(%rax)
+	jnc	L(evn)			C skip bsf unless divisor is even
+
+L(odd):	mov	%rax, %rbx
+	shr	R32(%rax)
+	and	$127, R32(%rax)		C d/2, 7 bits
+
+	LEA(	binvert_limb_table, %rdx)
+
+	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
+
+	mov	%rbx, %r11		C d without twos
+
+	lea	(%rax,%rax), R32(%rdx)	C 2*inv
+	imul	R32(%rax), R32(%rax)	C inv*inv
+	imul	R32(%rbx), R32(%rax)	C inv*inv*d
+	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
+	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
+
+	lea	(%rax,%rax), %r8	C 2*inv
+	imul	%rax, %rax		C inv*inv
+	imul	%rbx, %rax		C inv*inv*d
+	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
+
+	jmp	L(pi1)
+
+L(evn):	bsf	%rax, %rcx
+	shr	R8(%rcx), %rax
+	jmp	L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	push	%rbx
+
+	mov	%rcx, %r11		C d
+	mov	%rdx, %r10		C n
+	mov	%r9, %rcx		C ncnt
+
+L(pi1):	mov	(up), %rax		C up[0]
+
+	dec	%r10
+	jz	L(one)
+
+	lea	8(up,%r10,8), up	C up end
+	lea	(rp,%r10,8), rp		C rp end
+	neg	%r10			C -n
+
+	test	R32(%rcx), R32(%rcx)
+	jnz	L(unorm)		C branch if count != 0
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(nent)
+
+	ALIGN(8)
+L(ntop):mul	%r11			C carry limb in rdx	0 10
+	mov	-8(up,%r10,8), %rax	C
+	sub	%rbx, %rax		C apply carry bit
+	setc	R8(%rbx)		C
+	sub	%rdx, %rax		C apply carry limb	5
+	adc	$0, R32(%rbx)		C			6
+L(nent):imul	%r8, %rax		C			6
+	mov	%rax, (rp,%r10,8)	C
+	inc	%r10			C
+	jnz	L(ntop)
+
+	mov	-8(up), %r9		C up high limb
+	jmp	L(com)
+
+L(unorm):
+	mov	(up,%r10,8), %r9	C up[1]
+	shr	R8(%rcx), %rax		C
+	neg	R32(%rcx)
+	shl	R8(%rcx), %r9		C
+	neg	R32(%rcx)
+	or	%r9, %rax
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(uent)
+
+	ALIGN(8)
+L(utop):mul	%r11			C carry limb in rdx	0 10
+	mov	(up,%r10,8), %rax	C
+	shl	R8(%rcx), %rax		C
+	neg	R32(%rcx)
+	or	%r9, %rax
+	sub	%rbx, %rax		C apply carry bit
+	setc	R8(%rbx)		C
+	sub	%rdx, %rax		C apply carry limb	5
+	adc	$0, R32(%rbx)		C			6
+L(uent):imul	%r8, %rax		C			6
+	mov	(up,%r10,8), %r9	C
+	shr	R8(%rcx), %r9		C
+	neg	R32(%rcx)
+	mov	%rax, (rp,%r10,8)	C
+	inc	%r10			C
+	jnz	L(utop)
+
+L(com):	mul	%r11			C carry limb in rdx
+	sub	%rbx, %r9		C apply carry bit
+	sub	%rdx, %r9		C apply carry limb
+	imul	%r8, %r9
+	mov	%r9, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(one):	shr	R8(%rcx), %rax
+	imul	%r8, %rax
+	mov	%rax, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
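
The inversion sequence above (the three lea/imul/imul/sub groups) is Newton's
iteration for an inverse modulo a power of two: each inv = 2*inv - inv*inv*d
step doubles the number of correct low bits, going 8 -> 16 -> 32 -> 64 from
the binvert_limb_table seed.  A self-contained C sketch, assuming the classic
(3*d)^2 seed (5 correct bits) in place of GMP's 8-bit table:

#include <stdint.h>

/* Newton inversion of an odd 64-bit d, the scheme the code above runs.
   Seed: for odd d, inv = (3*d) ^ 2 satisfies inv*d == 1 (mod 32). */
static uint64_t binvert_u64(uint64_t d)
{
    uint64_t inv = (3 * d) ^ 2;        /*  5 correct low bits */
    inv = 2 * inv - inv * inv * d;     /* 10 bits */
    inv = 2 * inv - inv * inv * d;     /* 20 bits */
    inv = 2 * inv - inv * inv * d;     /* 40 bits */
    inv = 2 * inv - inv * inv * d;     /* 80 >= 64 bits */
    return inv;                        /* inv * d == 1 (mod 2^64) */
}

This is the precomputation mpn_pi1_bdiv_q_1 expects in its di argument;
Hensel (bdiv) division then multiplies each partial remainder by this inverse
instead of performing a hardware divide.
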
diff --git a/third_party/gmp/mpn/x86_64/bt1/aors_n.asm b/third_party/gmp/mpn/x86_64/bt1/aors_n.asm
new file mode 100644
index 0000000..9b6b5c7
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/aors_n.asm
@@ -0,0 +1,159 @@
+dnl  AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C AMD K8,K9	 1.77
+C AMD K10	 1.76\1.82
+C AMD bd1	 1.67\2.12
+C AMD bd2	 1.62\1.82
+C AMD bd3
+C AMD bd4	 1.55\2.2
+C AMD zen
+C AMD bt1	 2.54
+C AMD bt2	 2
+C Intel P4	11
+C Intel PNR	 4.76
+C Intel NHM	 5.27
+C Intel SBR	 2
+C Intel IBR	 1.94
+C Intel HWL	 1.63
+C Intel BWL	 1.51
+C Intel SKL	 1.51
+C Intel atom	 3.56
+C Intel SLM	 4
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	xor	%r8, %r8
+L(ent):	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	shr	$2, n
+	neg	%r8
+	mov	$3, R32(%rax)
+	mov	(up), %r10
+	mov	8(up), %r11
+	jmp	L(lo0)
+
+L(b10):	shr	$2, n
+	neg	%r8
+	mov	$1, R32(%rax)
+	mov	(up), %r8
+	mov	8(up), %r9
+	jrcxz	L(cj2)
+	jmp	L(top)
+
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
+
+L(b01):	shr	$2, n
+	neg	%r8
+	mov	$0, R32(%rax)
+	mov	(up), %r9
+	jrcxz	L(cj1)
+	mov	8(up), %r10
+	jmp	L(lo1)
+
+	ALIGN(8)
+L(b11):	inc	n
+	shr	$2, n
+	neg	%r8
+	mov	$2, R32(%rax)
+	mov	(up), %r11
+	jmp	L(lo3)
+
+	ALIGN(4)
+L(top):	mov	8(up,%rax,8), %r10
+	ADCSBB	-8(vp,%rax,8), %r8
+	mov	%r8, -8(rp,%rax,8)
+L(lo1):	mov	16(up,%rax,8), %r11
+	ADCSBB	(vp,%rax,8), %r9
+	lea	4(%rax), %rax
+	mov	%r9, -32(rp,%rax,8)
+L(lo0):	ADCSBB	-24(vp,%rax,8), %r10
+	mov	%r10, -24(rp,%rax,8)
+L(lo3):	ADCSBB	-16(vp,%rax,8), %r11
+	dec	n
+	mov	-8(up,%rax,8), %r8
+	mov	%r11, -16(rp,%rax,8)
+L(lo2):	mov	(up,%rax,8), %r9
+	jnz	L(top)
+
+L(cj2):	ADCSBB	-8(vp,%rax,8), %r8
+	mov	%r8, -8(rp,%rax,8)
+L(cj1):	ADCSBB	(vp,%rax,8), %r9
+	mov	%r9, (rp,%rax,8)
+
+	mov	$0, R32(%rax)
+	adc	$0, R32(%rax)
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	jmp	L(ent)
+EPILOGUE()
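
mpn_add_n and mpn_sub_n are part of GMP's public low-level mpn interface:
they combine two n-limb operands stored least-significant limb first and
return the carry- or borrow-out.  A minimal usage example (the printed values
assume 64-bit limbs):

#include <gmp.h>
#include <stdio.h>

int main(void)
{
    mp_limb_t a[2] = { ~(mp_limb_t)0, 5 };   /* low limb first: 5*B + (B-1) */
    mp_limb_t b[2] = { 1, 7 };
    mp_limb_t r[2];
    mp_limb_t carry = mpn_add_n(r, a, b, 2); /* r = a + b, returns carry-out */
    /* expect carry=0, r1=13, r0=0 (low limbs wrap, carry propagates) */
    printf("carry=%lu r1=%lu r0=%lu\n",
           (unsigned long)carry, (unsigned long)r[1], (unsigned long)r[0]);
    return 0;
}
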
diff --git a/third_party/gmp/mpn/x86_64/bt1/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/bt1/aorsmul_1.asm
new file mode 100644
index 0000000..41e1d8a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/aorsmul_1.asm
@@ -0,0 +1,191 @@
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4.52		old measurement
+C AMD K10	 4.51		old measurement
+C AMD bd1	 4.66		old measurement
+C AMD bd2	 4.57		old measurement
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 ?
+C AMD bt1	 5.04
+C AMD bt2	 5.07
+C Intel P4	16.8	18.6	old measurement
+C Intel PNR	 5.59		old measurement
+C Intel NHM	 5.39		old measurement
+C Intel SBR	 3.93		old measurement
+C Intel IBR	 3.59		old measurement
+C Intel HWL	 3.61		old measurement
+C Intel BWL	 2.76		old measurement
+C Intel SKL	 2.77		old measurement
+C Intel atom	23		old measurement
+C Intel SLM	 8		old measurement
+C Intel GLM	 ?
+C VIA nano	 5.63		old measurement
+
+C The ALIGNments here might look completely ad hoc.  They are not.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',  `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+C Standard parameters
+define(`rp',              `%rdi')
+define(`up',              `%rsi')
+define(`n_param',         `%rdx')
+define(`v0',              `%rcx')
+C Standard allocations
+define(`n',               `%rbx')
+define(`w0',              `%r8')
+define(`w1',              `%r9')
+define(`w2',              `%r10')
+define(`w3',              `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp',      `%rcx')    ') dnl
+IFDOS(` define(`up',      `%rsi')    ') dnl
+IFDOS(` define(`n_param', `%r8')     ') dnl
+IFDOS(` define(`v0',      `%r9')     ') dnl
+C DOS64 allocations
+IFDOS(` define(`n',       `%rbx')    ') dnl
+IFDOS(` define(`w0',      `%r8')     ') dnl
+IFDOS(` define(`w1',      `%rdi')    ') dnl
+IFDOS(` define(`w2',      `%r10')    ') dnl
+IFDOS(` define(`w3',      `%r11')    ') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(func)
+IFDOS(`	push	%rsi		')
+IFDOS(`	push	%rdi		')
+IFDOS(`	mov	%rdx, %rsi	')
+
+	push	%rbx
+	mov	(up), %rax
+
+	lea	(rp,n_param,8), rp
+	lea	(up,n_param,8), up
+	mov	n_param, n
+
+	test	$1, R8(n_param)
+	jne	L(bx1)
+
+L(bx0):	mul	v0
+	neg	n
+	mov	%rax, w0
+	mov	%rdx, w1
+	test	$2, R8(n)
+	jne	L(L2)
+
+L(b00):	add	$2, n
+	jmp	L(L0)
+
+	ALIGN(16)
+L(bx1):	mul	v0
+	test	$2, R8(n)
+	je	L(b01)
+
+L(b11):	mov	%rax, w2
+	mov	%rdx, w3
+	neg	n
+	inc	n
+	jmp	L(L3)
+
+	ALIGN(16)
+L(b01):	sub	$3, n
+	jc	L(n1)
+	mov	%rax, w2
+	mov	%rdx, w3
+	neg	n
+
+	ALIGN(16)
+L(top):	mov	-16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	ADDSUB	w2, -24(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(L0):	mov	-8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	ADDSUB	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+L(L3):	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	ADDSUB	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(L2):	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	ADDSUB	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	add	$4, n
+	js	L(top)
+
+L(end):	xor	R32(%rax), R32(%rax)
+	ADDSUB	w2, -8(rp)
+	adc	w3, %rax
+	pop	%rbx
+IFDOS(`	pop	%rdi		')
+IFDOS(`	pop	%rsi		')
+	ret
+
+	ALIGN(32)
+L(n1):	ADDSUB	%rax, -8(rp)
+	mov	$0, R32(%rax)
+	adc	%rdx, %rax
+	pop	%rbx
+IFDOS(`	pop	%rdi		')
+IFDOS(`	pop	%rsi		')
+	ret
+EPILOGUE()
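
The two entry points assemble to mpn_addmul_1 and mpn_submul_1, which fold
v*{up,n} into {rp,n} and return the high limb (carry or borrow) that falls
outside rp.  A small sanity check against the public interface:

#include <gmp.h>
#include <assert.h>

int main(void)
{
    mp_limb_t r[2] = { 100, 0 };
    mp_limb_t u[2] = { 3, 0 };
    mp_limb_t hi = mpn_addmul_1(r, u, 2, 5);   /* r += u*5 */
    assert(hi == 0 && r[0] == 115 && r[1] == 0);
    hi = mpn_submul_1(r, u, 2, 5);             /* r -= u*5 */
    assert(hi == 0 && r[0] == 100 && r[1] == 0);
    return 0;
}
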
diff --git a/third_party/gmp/mpn/x86_64/bt1/copyd.asm b/third_party/gmp/mpn/x86_64/bt1/copyd.asm
new file mode 100644
index 0000000..877714e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/copyd.asm
@@ -0,0 +1,91 @@
+dnl  AMD64 mpn_copyd optimised for AMD bobcat.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 1
+C AMD K10	 1-2  (alignment fluctuations)
+C AMD bd1	 ?
+C AMD bobcat	 1.5
+C Intel P4	 2.8
+C Intel core2	 1
+C Intel NHM	 1-1.25
+C Intel SBR	 1
+C Intel atom	 2.87
+C VIA nano	 2
+
+C INPUT PARAMETERS
+C rp	rdi
+C up	rsi
+C n	rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_copyd)
+	FUNC_ENTRY(3)
+	sub	$4, n
+	jl	L(end)
+	ALIGN(16)
+L(top):	mov	24(up,n,8), %r8
+	mov	%r8, 24(rp,n,8)
+	mov	16(up,n,8), %r8
+	mov	%r8, 16(rp,n,8)
+	mov	8(up,n,8), %r8
+	mov	%r8, 8(rp,n,8)
+	mov	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+L(ent):	sub	$4, n
+	jge	L(top)
+
+L(end):	cmp	$-4, R32(n)
+	jz	L(ret)
+	mov	24(up,n,8), %r8
+	mov	%r8, 24(rp,n,8)
+	cmp	$-3, R32(n)
+	jz	L(ret)
+	mov	16(up,n,8), %r8
+	mov	%r8, 16(rp,n,8)
+	cmp	$-2, R32(n)
+	jz	L(ret)
+	mov	8(up,n,8), %r8
+	mov	%r8, 8(rp,n,8)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/bt1/copyi.asm b/third_party/gmp/mpn/x86_64/bt1/copyi.asm
new file mode 100644
index 0000000..ee0f578
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/copyi.asm
@@ -0,0 +1,94 @@
+dnl  AMD64 mpn_copyi optimised for AMD bobcat.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 1
+C AMD K10	 1-2  (alignment fluctuations)
+C AMD bd1	 ?
+C AMD bobcat	 1.5
+C Intel P4	 2.8
+C Intel core2	 1
+C Intel NHM	 1-1.25
+C Intel SBR	 1
+C Intel atom	 2.87
+C VIA nano	 2
+
+C INPUT PARAMETERS
+C rp	rdi
+C up	rsi
+C n	rdx
+
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_copyi)
+	FUNC_ENTRY(3)
+	lea	-32(up,n,8), up
+	lea	-32(rp,n,8), rp
+	neg	n
+	add	$4, n
+	jg	L(end)
+	ALIGN(16)
+L(top):	mov	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	mov	8(up,n,8), %r8
+	mov	%r8, 8(rp,n,8)
+	mov	16(up,n,8), %r8
+	mov	%r8, 16(rp,n,8)
+	mov	24(up,n,8), %r8
+	mov	%r8, 24(rp,n,8)
+L(ent):	add	$4, n
+	jle	L(top)
+
+L(end):	cmp	$4, R32(n)
+	jz	L(ret)
+	mov	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	cmp	$3, R32(n)
+	jz	L(ret)
+	mov	8(up,n,8), %r8
+	mov	%r8, 8(rp,n,8)
+	cmp	$2, R32(n)
+	jz	L(ret)
+	mov	16(up,n,8), %r8
+	mov	%r8, 16(rp,n,8)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
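
mpn_copyi (this file) copies in increasing address order and mpn_copyd (the
previous file) in decreasing order; the pair exists so callers can pick the
direction that is safe for their overlap.  A plain C sketch of the two
directions (limb_t is a stand-in, and these are not the GMP entry points):

#include <stddef.h>

typedef unsigned long limb_t;   /* stand-in for mp_limb_t */

/* Increasing-order copy: safe when dst starts at or below src. */
static void copy_incr(limb_t *dst, const limb_t *src, size_t n)
{
    for (size_t i = 0; i < n; i++)
        dst[i] = src[i];
}

/* Decreasing-order copy: safe when dst starts at or above src. */
static void copy_decr(limb_t *dst, const limb_t *src, size_t n)
{
    while (n-- > 0)
        dst[n] = src[n];
}
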
diff --git a/third_party/gmp/mpn/x86_64/bt1/gcd_11.asm b/third_party/gmp/mpn/x86_64/bt1/gcd_11.asm
new file mode 100644
index 0000000..ef53392
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/gcd_11.asm
@@ -0,0 +1,119 @@
+dnl  AMD64 mpn_gcd_11 -- 1 x 1 gcd.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 5.4
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4	 ?
+C Intel CNR	 ?
+C Intel PNR	 ?
+C Intel NHM	 ?
+C Intel WSM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+
+
+C ctz_table[n] is the number of trailing zeros of n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 8)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0',    `%rdi')
+define(`v0',    `%rsi')
+
+define(`cnt',   `%rcx')
+define(`s0',    `%rax')
+define(`t0',    `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	FUNC_ENTRY(2)
+	LEA(	ctz_table, %r10)
+	mov	v0, t0
+	sub	u0, t0
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	mov	u0, s0
+	sub	v0, u0
+	cmovc	t0, u0		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	and	$MASK, R32(t0)
+	movzbl	(%r10,t0), R32(cnt)
+	jz	L(count_better)
+L(shr):	shr	R8(cnt), u0
+	mov	v0, t0
+	sub	u0, t0
+	jnz	L(top)
+
+L(end):	mov	v0, %rax
+	C rdx = 0 for the benefit of internal gcd_22 call
+	FUNC_EXIT()
+	ret
+
+L(count_better):
+	bsf	u0, cnt
+	jmp	L(shr)
+EPILOGUE()
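
The loop implements the subtract-and-shift binary gcd step: replace (u, v) by
(|u - v| with its trailing zeros stripped, min(u, v)), keeping both operands
odd.  A C sketch of the same algorithm, using a compiler builtin where the
assembly uses its 256-entry ctz table with a bsf fallback (inputs must be odd
and nonzero):

#include <stdint.h>

static uint64_t gcd_11_ref(uint64_t u, uint64_t v)
{
    while (u != v) {
        uint64_t t = (u > v) ? u - v : v - u;   /* |u - v|, even, nonzero */
        if (v > u)
            v = u;                              /* v = min(u, v) */
        u = t >> __builtin_ctzll(t);            /* strip factors of 2 */
    }
    return u;
}
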
diff --git a/third_party/gmp/mpn/x86_64/bt1/gcd_22.asm b/third_party/gmp/mpn/x86_64/bt1/gcd_22.asm
new file mode 100644
index 0000000..c9f221e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/gcd_22.asm')
diff --git a/third_party/gmp/mpn/x86_64/bt1/gmp-mparam.h b/third_party/gmp/mpn/x86_64/bt1/gmp-mparam.h
new file mode 100644
index 0000000..977a209
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/gmp-mparam.h
@@ -0,0 +1,230 @@
+/* AMD Bobcat gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 1600 MHz AMD Bobcat/Zacate */
+/* FFT tuning limit = 110,472,704 */
+/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        31
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        71
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     14
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define DIV_1_VS_MUL_1_PERCENT             270
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                66
+#define MUL_TOOM44_THRESHOLD               190
+#define MUL_TOOM6H_THRESHOLD               274
+#define MUL_TOOM8H_THRESHOLD               381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     129
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     127
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     131
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     100
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                101
+#define SQR_TOOM4_THRESHOLD                278
+#define SQR_TOOM6_THRESHOLD                372
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             22
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             444  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    444, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83, 5}, {   1343, 4}, \
+    {   2687, 5}, {   1407, 6}, {    735, 7}, {    415, 8}, \
+    {    223,10}, {     79,11}, {     47,10}, {    103,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    167,11}, {     95,10}, {    191,11}, {    111,12}, \
+    {     63,11}, {    127,10}, {    255,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,10}, {    415,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    607,12}, {    319,11}, \
+    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    607,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1023,13}, {    575,12}, {   1151,13}, {    703,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    959,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1151,14}, {    639,13}, {   1343,12}, {   2687,13}, \
+    {   1407,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1663,14}, {    895,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,16}, \
+    {    511,15}, {   1023,14}, {   2175,13}, {   4479,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,15}, {   1535,14}, {   3455,13}, {   6911,15}, \
+    {   1791,14}, {   3839,16}, {   1023,15}, {   2047,14}, \
+    {   4479,15}, {   2303,14}, {   4991,15}, {   2559,14}, \
+    {   5247,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3327,14}, {  16384,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             380  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    380, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     25, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     31, 8}, \
+    {     63, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     63, 6}, {   1087, 7}, {    575, 8}, \
+    {    303, 9}, {    159,10}, {    103,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    135,11}, \
+    {     79,10}, {    159, 9}, {    319,11}, {     95,10}, \
+    {    191, 9}, {    383,11}, {    111,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,10}, {    575,11}, {    303,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    335,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,12}, {    223,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    607,13}, {    319,12}, {    703,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1023,13}, {    703,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    895,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1151,14}, {    639,13}, {   1343,12}, {   2687,13}, \
+    {   1407,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1663,14}, {    895,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,16}, \
+    {    511,15}, {   1023,14}, {   2175,13}, {   4351,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,15}, {   1535,14}, {   3455,15}, {   1791,14}, \
+    {   3839,16}, {   1023,15}, {   2047,14}, {   4479,15}, \
+    {   2303,14}, {   4863,15}, {   2559,14}, {   5247,15}, \
+    {   2815,14}, {   5887,16}, {   1535,15}, {   3327,14}, \
+    {  16384,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 186
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  42
+#define MULLO_MUL_N_THRESHOLD            10950
+#define SQRLO_BASECASE_THRESHOLD             7
+#define SQRLO_DC_THRESHOLD                 100
+#define SQRLO_SQR_THRESHOLD               7293
+
+#define DC_DIV_QR_THRESHOLD                 70
+#define DC_DIVAPPR_Q_THRESHOLD             204
+#define DC_BDIV_QR_THRESHOLD                59
+#define DC_BDIV_Q_THRESHOLD                148
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               246
+#define INV_APPR_THRESHOLD                 236
+
+#define BINV_NEWTON_THRESHOLD              252
+#define REDC_1_TO_REDC_2_THRESHOLD          67
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD              108
+#define MU_BDIV_QR_THRESHOLD              1442
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  1,16,194,960,1603,1811,2499
+
+#define GET_STR_DC_THRESHOLD                20
+#define GET_STR_PRECOMPUTE_THRESHOLD        34
+#define SET_STR_DC_THRESHOLD               345
+#define SET_STR_PRECOMPUTE_THRESHOLD      1787
+
+#define FAC_DSC_THRESHOLD                  781
+#define FAC_ODD_THRESHOLD                  104
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    3  /* 3.20% faster than 5 */
+#define HGCD_THRESHOLD                     110
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   474
+#define GCDEXT_DC_THRESHOLD                293
+#define JACOBI_BASE_METHOD                   2  /* 9.38% faster than 1 */
+
+/* Tuneup completed successfully, took 358881 seconds */
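
These thresholds are operand sizes in limbs at which the next algorithm
starts to win on this CPU; GMP's generic code compares n against them to
pick a routine.  An illustrative dispatcher using two of the values above --
not GMP's actual selection code, which lives in mpn/generic/:

#include <stddef.h>

#define MUL_TOOM22_THRESHOLD 24   /* from the table above */
#define MUL_TOOM33_THRESHOLD 66

typedef enum { BASECASE, TOOM22, TOOM33_OR_HIGHER } mul_algo;

/* Sketch of threshold-gated algorithm selection. */
static mul_algo choose_mul(size_t n_limbs)
{
    if (n_limbs < MUL_TOOM22_THRESHOLD)
        return BASECASE;            /* schoolbook */
    if (n_limbs < MUL_TOOM33_THRESHOLD)
        return TOOM22;              /* Karatsuba */
    return TOOM33_OR_HIGHER;        /* Toom-3 and up; FFT past 5760 limbs */
}
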
diff --git a/third_party/gmp/mpn/x86_64/bt1/mul_1.asm b/third_party/gmp/mpn/x86_64/bt1/mul_1.asm
new file mode 100644
index 0000000..4394d6e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/mul_1.asm
@@ -0,0 +1,241 @@
+dnl  AMD64 mpn_mul_1 optimised for AMD bt1/bt2.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4.53		old measurement
+C AMD K10	 4.53		old measurement
+C AMD bd1	 4.56		old measurement
+C AMD bd2	 4.47		old measurement
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 ?
+C AMD bt1	 5.12
+C AMD bt2	 5.17
+C Intel P4	12.6		old measurement
+C Intel PNR	 4.53		old measurement
+C Intel NHM	 4.36		old measurement
+C Intel SBR	 3.0		old measurement
+C Intel IBR	 2.55		old measurement
+C Intel HWL	 2.28		old measurement
+C Intel BWL	 2.36		old measurement
+C Intel SKL	 2.39		old measurement
+C Intel atom	21.0		old measurement
+C Intel SLM	 9		old measurement
+C Intel GLM	 ?
+C VIA nano	 ?
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp',              `%rdi')
+define(`up',              `%rsi')
+define(`n_param',         `%rdx')
+define(`v0',              `%rcx')
+define(`cy',              `%r8')
+C Standard allocations
+define(`n',               `%rbx')
+define(`w0',              `%r8')
+define(`w1',              `%r9')
+define(`w2',              `%r10')
+define(`w3',              `%r11')
+
+C DOS64 parameters
+IFDOS(` define(`rp',      `%rcx')    ') dnl
+IFDOS(` define(`up',      `%rsi')    ') dnl
+IFDOS(` define(`n_param', `%r8')     ') dnl
+IFDOS(` define(`v0',      `%r9')     ') dnl
+IFDOS(` define(`cy',      `56(%rsp)')') dnl
+C DOS64 allocations
+IFDOS(` define(`n',       `%rbx')    ') dnl
+IFDOS(` define(`w0',      `%r8')     ') dnl
+IFDOS(` define(`w1',      `%rdi')    ') dnl
+IFDOS(` define(`w2',      `%r10')    ') dnl
+IFDOS(` define(`w3',      `%r11')    ') dnl
+
+	ALIGN(64)
+PROLOGUE(mpn_mul_1)
+IFDOS(`	push	%rsi		')
+IFDOS(`	push	%rdi		')
+IFDOS(`	mov	%rdx, %rsi	')
+
+	push	%rbx
+	mov	(up), %rax
+
+	lea	(rp,n_param,8), rp
+	lea	(up,n_param,8), up
+	mov	n_param, n
+
+	test	$1, R8(n_param)
+	jne	L(bx1)
+
+L(bx0):	mul	v0
+	neg	n
+	mov	%rax, w0
+	mov	%rdx, w1
+	test	$2, R8(n)
+	jne	L(L2)
+
+L(b00):	add	$2, n
+	jmp	L(L0)
+
+	ALIGN(16)
+L(b11):	mov	%rax, w2
+	mov	%rdx, w3
+	neg	n
+	inc	n
+	jmp	L(L3)
+
+	ALIGN(16)
+L(bx1):	mul	v0
+	test	$2, R8(n)
+	jne	L(b11)
+
+L(b01):	sub	$3, n
+	jc	L(n1)
+	mov	%rax, w2
+	mov	%rdx, w3
+	neg	n
+
+	ALIGN(16)
+L(top):	mov	-16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, -24(rp,n,8)
+	add	w3, w0
+	adc	$0, w1
+L(L0):	mov	-8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, -16(rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+L(L3):	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, -8(rp,n,8)
+	add	w3, w0
+	adc	$0, w1
+L(L2):	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, (rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+	add	$4, n
+	js	L(top)
+
+L(end):	mov	w2, -8(rp)
+	mov	w3, %rax
+	pop	%rbx
+IFDOS(`	pop	%rdi		')
+IFDOS(`	pop	%rsi		')
+	ret
+
+	ALIGN(32)
+L(n1):	mov	%rax, -8(rp)
+	mov	%rdx, %rax
+	pop	%rbx
+IFDOS(`	pop	%rdi		')
+IFDOS(`	pop	%rsi		')
+	ret
+EPILOGUE()
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_mul_1c)
+IFDOS(`	push	%rsi		')
+IFDOS(`	push	%rdi		')
+IFDOS(`	mov	%rdx, %rsi	')
+	mov	cy, w2
+	push	%rbx
+	mov	(up), %rax
+
+	lea	(rp,n_param,8), rp
+	lea	(up,n_param,8), up
+	mov	n_param, n
+
+	test	$1, R8(n_param)
+	jne	L(cx1)
+
+L(cx0):	mul	v0
+	neg	n
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, w0
+	adc	$0, w1
+	test	$2, R8(n)
+	jne	L(L2)
+
+L(c00):	add	$2, n
+	jmp	L(L0)
+
+	ALIGN(16)
+L(cx1):	mul	v0
+	test	$2, R8(n)
+	je	L(c01)
+
+L(c11):	neg	n
+	inc	n
+	add	%rax, w2
+	mov	%rdx, w3
+	adc	$0, w3
+	jmp	L(L3)
+
+L(c01):	cmp	$1, n
+	jz	L(m1)
+	neg	n
+	add	$3, n
+	add	%rax, w2
+	mov	%rdx, w3
+	adc	$0, w3
+	jmp	L(top)
+
+	ALIGN(32)
+L(m1):	add	%rax, w2
+	mov	%rdx, %rax
+	mov	w2, -8(rp)
+	adc	$0, %rax
+	pop	%rbx
+IFDOS(`	pop	%rdi		')
+IFDOS(`	pop	%rsi		')
+	ret
+EPILOGUE()
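
mpn_mul_1(rp, up, n, v) sets {rp,n} = {up,n} * v and returns the high limb
that does not fit; the mpn_mul_1c entry point above additionally folds in a
carry limb.  A minimal usage example against the public interface (printed
values assume 64-bit limbs):

#include <gmp.h>
#include <stdio.h>

int main(void)
{
    mp_limb_t u[2] = { 10, 3 };          /* value 3*B + 10, B = 2^64 */
    mp_limb_t r[2];
    mp_limb_t hi = mpn_mul_1(r, u, 2, 7);
    /* expect hi=0, r1=21, r0=70 */
    printf("hi=%lu r1=%lu r0=%lu\n",
           (unsigned long)hi, (unsigned long)r[1], (unsigned long)r[0]);
    return 0;
}
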
diff --git a/third_party/gmp/mpn/x86_64/bt1/mul_basecase.asm b/third_party/gmp/mpn/x86_64/bt1/mul_basecase.asm
new file mode 100644
index 0000000..e7d46bf
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/mul_basecase.asm
@@ -0,0 +1,486 @@
+dnl  AMD64 mpn_mul_basecase optimised for AMD bobcat.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4.5
+C AMD K10	 4.5
+C AMD bd1	 4.75
+C AMD bobcat	 5
+C Intel P4	17.7
+C Intel core2	 5.5
+C Intel NHM	 5.43
+C Intel SBR	 3.92
+C Intel atom	23
+C VIA nano	 5.63
+
+C This mul_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations.  Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C  * Tune un < 3 code.
+C  * Fix slowdown for un=vn=3 (67->71) compared to default code.
+C  * This is 1263 bytes, compared to 1099 bytes for default code.  Consider
+C    combining addmul loops like that code.  Tolerable slowdown?
+C  * Lots of space could be saved by replacing the "switch" code by gradual
+C    jumps out from mul_1 winddown code, perhaps with no added overhead.
+C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp',              `%rdi')
+define(`up',              `%rsi')
+define(`un_param',        `%rdx')
+define(`vp',              `%rcx')
+define(`vn',              `%r8')
+C Standard allocations
+define(`un',              `%rbx')
+define(`w0',              `%r10')
+define(`w1',              `%r11')
+define(`w2',              `%r12')
+define(`w3',              `%r13')
+define(`n',               `%rbp')
+define(`v0',              `%r9')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	mov	(up), %rax
+	mov	(vp), v0
+
+	cmp	$2, un_param
+	ja	L(ge3)
+	jz	L(u2)
+
+	mul	v0			C u0 x v0
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(u2):	mul	v0			C u0 x v0
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	mov	%rdx, w0
+	mul	v0
+	add	%rax, w0
+	mov	%rdx, w1
+	adc	$0, w1
+	cmp	$1, R32(vn)
+	jnz	L(u2v2)
+	mov	w0, 8(rp)
+	mov	w1, 16(rp)
+	FUNC_EXIT()
+	ret
+
+L(u2v2):mov	8(vp), v0
+	mov	(up), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, 8(rp)
+	mov	%rdx, %r8		C CAUTION: r8 realloc
+	adc	$0, %r8
+	mov	8(up), %rax
+	mul	v0
+	add	w1, %r8
+	adc	$0, %rdx
+	add	%r8, %rax
+	adc	$0, %rdx
+	mov	%rax, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+
+L(ge3):	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	lea	8(vp), vp
+
+	lea	-24(rp,un_param,8), rp
+	lea	-24(up,un_param,8), up
+	xor	R32(un), R32(un)
+	mov	$2, R32(n)
+	sub	un_param, un
+	sub	un_param, n
+
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	jmp	L(L3)
+
+	ALIGN(16)
+L(top):	mov	w0, -16(rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, -8(rp,n,8)
+	add	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, (rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+L(L3):	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, 8(rp,n,8)
+	add	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(top)
+
+	mov	w0, -16(rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+
+C Switch on n into the right addmul_1 loop
+	test	n, n
+	jz	L(r2)
+	cmp	$2, R32(n)
+	ja	L(r3)
+	jz	L(r0)
+	jmp	L(r1)
+
+
+L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
+	mov	w3, X((rp,n,8),24(rp))
+	add	$2, un
+
+C outer loop(3)
+L(to3):	dec	vn
+	jz	L(ret)
+	mov	(vp), v0
+	mov	8(up,un,8), %rax
+	lea	8(vp), vp
+	lea	8(rp), rp
+	mov	un, n
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	jmp	L(al3)
+
+	ALIGN(16)
+L(ta3):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+L(al3):	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta3)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+	jmp	L(to3)
+
+
+L(r2):	mov	X(0(up,n,8),(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(-8(rp,n,8),-8(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	X(8(up,n,8),8(up)), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, X((rp,n,8),(rp))
+	add	w1, w2
+	adc	$0, w3
+	mov	X(16(up,n,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(8(rp,n,8),8(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	w0, X(16(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w1, X(24(rp,n,8),24(rp))
+	inc	un
+
+C outer loop(2)
+L(to2):	dec	vn
+	jz	L(ret)
+	mov	(vp), v0
+	mov	16(up,un,8), %rax
+	lea	8(vp), vp
+	lea	8(rp), rp
+	mov	un, n
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	jmp	L(al2)
+
+	ALIGN(16)
+L(ta2):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(al2):	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta2)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+	jmp	L(to2)
+
+
+L(r1):	mov	X(0(up,n,8),8(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(-8(rp,n,8),(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	X(8(up,n,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, X((rp,n,8),8(rp))
+	add	w1, w2
+	adc	$0, w3
+	mov	w2, X(8(rp,n,8),16(rp))
+	mov	w3, X(16(rp,n,8),24(rp))
+	add	$4, un
+
+C outer loop(1)
+L(to1):	dec	vn
+	jz	L(ret)
+	mov	(vp), v0
+	mov	-8(up,un,8), %rax
+	lea	8(vp), vp
+	lea	8(rp), rp
+	mov	un, n
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	jmp	L(al1)
+
+	ALIGN(16)
+L(ta1):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+L(al1):	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta1)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+	jmp	L(to1)
+
+
+L(r0):	mov	X((up,n,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(-8(rp,n,8),8(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	w0, X((rp,n,8),16(rp))
+	mov	w1, X(8(rp,n,8),24(rp))
+	add	$3, un
+
+C outer loop(0)
+L(to0):	dec	vn
+	jz	L(ret)
+	mov	(vp), v0
+	mov	(up,un,8), %rax
+	lea	8(vp), vp
+	lea	8(rp), rp
+	mov	un, n
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	jmp	L(al0)
+
+	ALIGN(16)
+L(ta0):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(al0):	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta0)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+	jmp	L(to0)
+
+
+L(ret):	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
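
Structurally, mpn_mul_basecase is one mul_1 row followed by vn-1 addmul_1
rows, as the comment at the top of the file says.  A portable reference
built from GMP's public kernels -- a sketch of the same schoolbook structure,
not the tuned code above (rp needs room for un+vn limbs, and un >= vn >= 1):

#include <gmp.h>

static void mul_basecase_ref(mp_limb_t *rp,
                             const mp_limb_t *up, mp_size_t un,
                             const mp_limb_t *vp, mp_size_t vn)
{
    /* First row writes the product outright... */
    rp[un] = mpn_mul_1(rp, up, un, vp[0]);
    /* ...each further multiplier limb is folded in one limb higher. */
    for (mp_size_t i = 1; i < vn; i++)
        rp[un + i] = mpn_addmul_1(rp + i, up, un, vp[i]);
}
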
diff --git a/third_party/gmp/mpn/x86_64/bt1/redc_1.asm b/third_party/gmp/mpn/x86_64/bt1/redc_1.asm
new file mode 100644
index 0000000..d55b1e5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/redc_1.asm
@@ -0,0 +1,507 @@
+dnl  X86-64 mpn_redc_1 optimised for AMD bobcat.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 5.0
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C TODO
+C  * Micro-optimise, none performed thus far.
+C  * Consider inlining mpn_add_n.
+C  * Single out the basecases before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r14')
+define(`j',           `%r15')
+define(`mp',          `%r12')
+define(`q0',          `%r13')
+define(`w0',          `%rbp')
+define(`w1',          `%r9')
+define(`w2',          `%r10')
+define(`w3',          `%r11')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), q0
+	mov	n, j			C outer loop induction var
+	lea	(mp_param,n,8), mp
+	lea	(up,n,8), up
+	neg	n
+	imul	u0inv, q0		C first iteration q0
+
+	test	$1, R8(n)
+	jz	L(bx0)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b3)
+
+L(b1):	cmp	$-1, R32(n)
+	jz	L(n1)
+
+L(otp1):lea	1(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	8(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, w1
+	add	(up,n,8), w2
+	adc	w3, %rbx
+	adc	$0, w1
+	mov	16(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e1)
+
+	ALIGNx
+L(tp1):	add	w0, -16(up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+L(e1):	mov	16(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, i
+	js	L(tp1)
+
+L(ed1):	add	w0, I(-16(up),-16(up,i,8))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, I(-8(up),-8(up,i,8))
+	adc	$0, w3
+	mov	w3, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp1)
+	jmp	L(cj)
+
+L(b3):	cmp	$-3, R32(n)
+	jz	L(n3)
+
+L(otp3):lea	3(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	8(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, w1
+	add	(up,n,8), w2
+	adc	w3, %rbx
+	adc	$0, w1
+	mov	16(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e3)
+
+	ALIGNx
+L(tp3):	add	w0, -16(up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+L(e3):	mov	(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, i
+	js	L(tp3)
+
+L(ed3):	add	w0, I(-16(up),-16(up,i,8))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, I(-8(up),-8(up,i,8))
+	adc	$0, w3
+	mov	w3, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp3)
+C	jmp	L(cj)
+
+L(cj):
+IFSTD(`	lea	(up,n,8), up		C param 2: up
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
+	lea	(%rdx,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	rp, %rcx	')	C param 1: rp
+
+IFSTD(`	sub	$8, %rsp	')
+IFDOS(`	sub	$40, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b2)
+
+L(b0):
+L(otp0):lea	(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	8(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, w3
+	add	(up,n,8), w0
+	adc	w1, %rbx
+	adc	$0, w3
+	mov	16(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e0)
+
+	ALIGNx
+L(tp0):	add	w0, -16(up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+L(e0):	mov	24(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, i
+	js	L(tp0)
+
+L(ed0):	add	w0, I(-16(up),-16(up,i,8))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, I(-8(up),-8(up,i,8))
+	adc	$0, w3
+	mov	w3, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp0)
+	jmp	L(cj)
+
+L(b2):	cmp	$-2, R32(n)
+	jz	L(n2)
+
+L(otp2):lea	2(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	8(mp,n,8), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, w3
+	add	(up,n,8), w0
+	adc	w1, %rbx
+	adc	$0, w3
+	mov	16(mp,n,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	8(up,n,8), %rbx
+	mov	%rbx, 8(up,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e2)
+
+	ALIGNx
+L(tp2):	add	w0, -16(up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+L(e2):	mov	8(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (up,i,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(up,i,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(mp,i,8), %rax
+	mul	q0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, i
+	js	L(tp2)
+
+L(ed2):	add	w0, I(-16(up),-16(up,i,8))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, I(-8(up),-8(up,i,8))
+	adc	$0, w3
+	mov	w3, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp2)
+	jmp	L(cj)
+
+L(n1):	mov	(mp_param), %rax
+	mul	q0
+	add	-8(up), %rax
+	adc	(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n2):	mov	(mp_param), %rax
+	mov	-16(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	-8(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, q0
+	imul	u0inv, q0		C next q0
+	mov	-16(mp), %rax
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	(up), %r14
+	mul	q0
+	add	%rax, %r14
+	adc	$0, %rdx
+	add	%r9, %r14
+	adc	$0, %rdx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	8(up), %rdx
+	mov	%r14, (rp)
+	mov	%rdx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n3):	mov	-24(mp), %rax
+	mov	-24(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	-16(mp), %rax
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	-16(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	add	%r11, %rbp
+	mov	-8(up), %r10
+	adc	$0, %r9
+	mul	q0
+	mov	%rbp, q0
+	imul	u0inv, q0		C next q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	%rbp, -16(up)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, -8(up)
+	mov	%r11, -24(up)		C up[0]
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n3)
+
+	mov	-48(up), %rdx
+	mov	-40(up), %rbx
+	xor	R32(%rax), R32(%rax)
+	add	%rbp, %rdx
+	adc	%r10, %rbx
+	adc	-8(up), %r11
+	mov	%rdx, (rp)
+	mov	%rbx, 8(rp)
+	mov	%r11, 16(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+EPILOGUE()
+ASM_END()
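Since this hunk begins mid-file, a note on what the q0/u0inv dance above is doing: multiplying the current low limb by the precomputed inverse u0inv yields the next quotient limb q (the `imul u0inv, %rbx  C next q limb` lines), and each outer pass adds q times the modulus so that the bottom limb cancels — the classic word-by-word Montgomery reduction, with the trailing CALL(mpn_add_n) apparently folding the high half back in. As orientation only, here is a plain-C sketch of that reduction — not GMP's code; the name redc_sketch and the (2n+1)-limb buffer convention are inventions of the sketch, and it assumes GCC/Clang-style unsigned __int128:

```c
#include <stdint.h>

typedef uint64_t limb;

/* Word-by-word Montgomery reduction, sketched: u[] holds 2n limbs of
   input plus one spare top limb u[2n] == 0 to absorb carries; m[] is
   the n-limb modulus and m0inv = -m[0]^-1 mod 2^64.  Each pass picks
   q so that adding q*m zeroes limb i (cf. `imul u0inv' above); after
   n passes the result sits in u[n..2n], up to one final subtraction
   of m that this sketch omits.  Reference code, not GMP's. */
static void redc_sketch(limb *u, const limb *m, int n, limb m0inv)
{
    for (int i = 0; i < n; i++) {
        limb q = u[i] * m0inv;              /* next quotient limb */
        unsigned __int128 cy = 0;
        for (int j = 0; j < n; j++) {       /* u += q*m << (64*i) */
            unsigned __int128 t = (unsigned __int128)q * m[j] + u[i + j] + cy;
            u[i + j] = (limb)t;
            cy = t >> 64;
        }
        for (int k = i + n; cy != 0; k++) { /* ripple carry upward */
            unsigned __int128 t = (unsigned __int128)u[k] + cy;
            u[k] = (limb)t;
            cy = t >> 64;
        }
    }
}
```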
diff --git a/third_party/gmp/mpn/x86_64/bt1/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/bt1/sqr_basecase.asm
new file mode 100644
index 0000000..0e417a1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt1/sqr_basecase.asm
@@ -0,0 +1,565 @@
+dnl  AMD64 mpn_sqr_basecase optimised for AMD bobcat.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4.5
+C AMD K10	 4.5
+C AMD bd1	 4.75
+C AMD bobcat	 5
+C Intel P4	17.7
+C Intel core2	 5.5
+C Intel NHM	 5.43
+C Intel SBR	 3.92
+C Intel atom	23
+C VIA nano	 5.63
+
+C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
+C multiply insn bandwidth, without any apparent loop branch exit pipeline
+C replays experienced on K8.  The structure is unusual: it falls into mul_1 in
+C the same way for all n, then it splits into 4 different wind-down blocks and
+C 4 separate addmul_1 loops.
+C
+C We have not tried using the same addmul_1 loops with a switch into feed-in
+C code, as we do in other basecase implementations.  Doing that could save
+C substantial code volume, but would also probably add some overhead.
+
+C TODO
+C  * Tune un < 4 code.
+C  * Perhaps implement a larger final corner (it is now 2 x 1).
+C  * Lots of space could be saved by replacing the "switch" code by gradual
+C    jumps out from mul_1 winddown code, perhaps with no added overhead.
+C  * Are the ALIGN(16) really necessary?  They add about 25 bytes of padding.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C Standard parameters
+define(`rp',              `%rdi')
+define(`up',              `%rsi')
+define(`un_param',        `%rdx')
+C Standard allocations
+define(`un',              `%rbx')
+define(`w0',              `%r8')
+define(`w1',              `%r9')
+define(`w2',              `%r10')
+define(`w3',              `%r11')
+define(`n',               `%rbp')
+define(`v0',              `%rcx')
+
+C Temp macro for allowing control over indexing.
+C Define to return $1 for more conservative ptr handling.
+define(`X',`$2')
+dnl define(`X',`$1')
+
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	mov	(up), %rax
+
+	cmp	$2, R32(un_param)
+	jae	L(ge2)
+
+	mul	%rax
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(ge2):	mov	(up), v0
+	jnz	L(g2)
+
+	mul	%rax
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	mov	%rdx, w0
+	mul	v0
+	add	%rax, w0
+	mov	%rdx, w1
+	adc	$0, w1
+	mov	8(up), v0
+	mov	(up), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, 8(rp)
+	mov	%rdx, w0		C CAUTION: r8 realloc
+	adc	$0, w0
+	mov	8(up), %rax
+	mul	v0
+	add	w1, w0
+	adc	$0, %rdx
+	add	w0, %rax
+	adc	$0, %rdx
+	mov	%rax, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(g2):	cmp	$3, R32(un_param)
+	ja	L(g3)
+	mul	%rax
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	mov	8(up), %rax
+	mul	%rax
+	mov	%rax, 16(rp)
+	mov	%rdx, 24(rp)
+	mov	16(up), %rax
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+
+	mov	(up), v0
+	mov	8(up), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	16(up), %rax
+	mul	v0
+	xor	R32(w2), R32(w2)
+	add	%rax, w1
+	adc	%rdx, w2
+
+	mov	8(up), v0
+	mov	16(up), %rax
+	mul	v0
+	xor	R32(w3), R32(w3)
+	add	%rax, w2
+	adc	%rdx, w3
+	add	w0, w0
+	adc	w1, w1
+	adc	w2, w2
+	adc	w3, w3
+	mov	$0, R32(v0)
+	adc	v0, v0
+	add	w0, 8(rp)
+	adc	w1, 16(rp)
+	adc	w2, 24(rp)
+	adc	w3, 32(rp)
+	adc	v0, 40(rp)
+	FUNC_EXIT()
+	ret
+
+L(g3):	push	%rbx
+	push	%rbp
+
+	mov	8(up), %rax
+	lea	-24(rp,un_param,8), rp
+	lea	-24(up,un_param,8), up
+	neg	un_param
+	push	un_param		C for sqr_diag_addlsh1
+	lea	(un_param), un
+	lea	3(un_param), n
+
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	jmp	L(L3)
+
+	ALIGN(16)
+L(top):	mov	w0, -16(rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, -8(rp,n,8)
+	add	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, (rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+L(L3):	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, 8(rp,n,8)
+	add	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(top)
+
+	mov	w0, -16(rp,n,8)
+	add	w1, w2
+	adc	$0, w3
+
+	test	n, n
+	jz	L(r2)
+	cmp	$2, R32(n)
+	ja	L(r3)
+	jz	L(r0)
+
+
+L(r1):	mov	X((up,n,8),8(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(-8(rp,n,8),(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	X(8(up,n,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, X((rp,n,8),8(rp))
+	add	w1, w2
+	adc	$0, w3
+	mov	w2, X(8(rp,n,8),16(rp))
+	mov	w3, X(16(rp,n,8),24(rp))
+	add	$5, un
+	jmp	L(to0)
+
+L(r2):	mov	X((up,n,8),(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(-8(rp,n,8),-8(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	X(8(up,n,8),8(up)), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	w0, X((rp,n,8),(rp))
+	add	w1, w2
+	adc	$0, w3
+	mov	X(16(up,n,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(8(rp,n,8),8(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	w0, X(16(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w1, X(24(rp,n,8),24(rp))
+	add	$6, un
+	jmp	L(to1)
+
+L(r3):	mov	w2, X(-8(rp,n,8),16(rp))
+	mov	w3, X((rp,n,8),24(rp))
+	add	$3, un
+	jmp	L(to2)
+
+L(r0):	mov	X((up,n,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	w2, X(-8(rp,n,8),8(rp))
+	add	w3, w0
+	adc	$0, w1
+	mov	w0, X((rp,n,8),16(rp))
+	mov	w1, X(8(rp,n,8),24(rp))
+	add	$4, un
+C	jmp	L(to3)
+C fall through into main loop
+
+
+L(outer):
+	mov	un, n
+	mov	(up,un,8), v0
+	mov	8(up,un,8), %rax
+	lea	8(rp), rp
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	jmp	L(al3)
+
+	ALIGN(16)
+L(ta3):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+L(al3):	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta3)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+
+
+L(to2):	mov	un, n
+	cmp	$-4, R32(un)
+	jnc	L(end)
+	add	$4, un
+	mov	8(up,n,8), v0
+	mov	16(up,n,8), %rax
+	lea	8(rp), rp
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	jmp	L(al2)
+
+	ALIGN(16)
+L(ta2):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(al2):	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta2)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+
+
+L(to1):	mov	un, n
+	mov	-16(up,un,8), v0
+	mov	-8(up,un,8), %rax
+	lea	8(rp), rp
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	jmp	L(al1)
+
+	ALIGN(16)
+L(ta1):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+L(al1):	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta1)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+
+
+L(to0):	mov	un, n
+	mov	-8(up,un,8), v0
+	mov	(up,un,8), %rax
+	lea	8(rp), rp
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	jmp	L(al0)
+
+	ALIGN(16)
+L(ta0):	add	w0, -16(rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, -8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(al0):	mov	8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, (rp,n,8)
+	adc	w1, w2
+	adc	$0, w3
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	add	w2, 8(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	$4, n
+	js	L(ta0)
+
+	add	w0, X(-16(rp,n,8),8(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(-8(rp,n,8),16(rp))
+	adc	$0, w3
+	mov	w3, X((rp,n,8),24(rp))
+	jmp	L(outer)
+
+
+L(end):	mov	X(8(up,un,8),(up)), v0
+	mov	X(16(up,un,8),8(up)), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	X(24(up,un,8),16(up)), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	add	w0, X(24(rp,un,8),16(rp))
+	adc	w1, w2
+	adc	$0, w3
+	add	w2, X(32(rp,un,8),24(rp))
+	adc	$0, w3
+	mov	X(16(up,un,8),8(up)), v0
+	mov	X(24(up,un,8),16(up)), %rax
+	mul	v0
+	add	%rax, w3
+	mov	w3, X(40(rp,un,8),32(rp))
+	adc	$0, %rdx
+	mov	%rdx, X(48(rp,un,8),40(rp))
+
+
+C sqr_diag_addlsh1
+
+	lea	16(up), up
+	lea	40(rp), rp
+	pop	n
+	lea	2(n,n), n
+
+	mov	(up,n,4), %rax
+	mul	%rax
+	xor	R32(w2), R32(w2)
+
+	mov	8(rp,n,8), w0
+	mov	%rax, (rp,n,8)
+	jmp	L(lm)
+
+	ALIGN(8)
+L(tsd):	add	%rbx, w0
+	adc	%rax, w1
+	mov	w0, -8(rp,n,8)
+	mov	8(rp,n,8), w0
+	mov	w1, (rp,n,8)
+L(lm):	mov	16(rp,n,8), w1
+	adc	w0, w0
+	adc	w1, w1
+	lea	(%rdx,w2), %rbx
+	mov	8(up,n,4), %rax
+	setc	R8(w2)
+	mul	%rax
+	add	$2, n
+	js	L(tsd)
+
+L(esd):	add	%rbx, w0
+	adc	%rax, w1
+	mov	w0, X(-8(rp,n,8),-8(rp))
+	mov	w1, X((rp,n,8),(rp))
+	adc	w2, %rdx
+	mov	%rdx, X(8(rp,n,8),8(rp))
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
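The comment block near the top of this file describes the structure in prose; concretely, basecase squaring splits into an off-diagonal triangle of cross products (the mul_1/addmul_1 passes with their four wind-down/feed-in variants) followed by sqr_diag_addlsh1, which doubles the triangle and adds the diagonal squares. A plain-C sketch of that decomposition — reference semantics only, not the asm's evaluation order; assumes unsigned __int128:

```c
#include <stdint.h>

typedef uint64_t limb;

/* Schoolbook squaring, split the way the asm above is: first the
   off-diagonal triangle T = SUM_{i<j} up[i]*up[j]*B^(i+j), then
   rp = 2*T + diagonal squares (the sqr_diag_addlsh1 phase). */
static void sqr_sketch(limb *rp, const limb *up, int n)
{
    for (int i = 0; i < 2 * n; i++)
        rp[i] = 0;

    for (int i = 0; i < n; i++) {           /* off-diagonal triangle */
        limb cy = 0;
        for (int j = i + 1; j < n; j++) {
            unsigned __int128 t =
                (unsigned __int128)up[i] * up[j] + rp[i + j] + cy;
            rp[i + j] = (limb)t;
            cy = (limb)(t >> 64);
        }
        rp[i + n] = cy;                     /* fresh limb each row */
    }

    limb cy = 0;                            /* rp <<= 1, the "lsh1" */
    for (int i = 0; i < 2 * n; i++) {
        limb hi = rp[i] >> 63;
        rp[i] = (rp[i] << 1) | cy;
        cy = hi;
    }

    unsigned __int128 acc = 0;              /* add diagonal squares */
    for (int i = 0; i < n; i++) {
        unsigned __int128 sq = (unsigned __int128)up[i] * up[i];
        acc += (unsigned __int128)rp[2 * i] + (limb)sq;
        rp[2 * i] = (limb)acc;
        acc >>= 64;
        acc += (unsigned __int128)rp[2 * i + 1] + (limb)(sq >> 64);
        rp[2 * i + 1] = (limb)acc;
        acc >>= 64;
    }
}
```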
diff --git a/third_party/gmp/mpn/x86_64/bt2/com.asm b/third_party/gmp/mpn/x86_64/bt2/com.asm
new file mode 100644
index 0000000..87085ea
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt2/com.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_com.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com.asm')
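The bt2 (AMD Jaguar) directory that starts here holds no loop code of its own for these entry points: each file is a stub whose MULFUNC_PROLOGUE declares the symbols and whose include_mpn line splices in another CPU directory's implementation (fastsse for com/copyd/copyi, bd2 for gcd_11/gcd_22), so per-CPU selection happens at build time without duplicating source. The operation itself is trivial — mpn_com is a limb-wise complement, sketched in C below (the sketch name is hypothetical):

```c
#include <stdint.h>
#include <stddef.h>

/* mpn_com semantics in plain C: the one's complement of an n-limb
   number.  The per-CPU asm variants differ only in unrolling and in
   whether they use SSE stores. */
static void com_sketch(uint64_t *rp, const uint64_t *up, size_t n)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = ~up[i];
}
```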
diff --git a/third_party/gmp/mpn/x86_64/bt2/copyd.asm b/third_party/gmp/mpn/x86_64/bt2/copyd.asm
new file mode 100644
index 0000000..83c0618
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt2/copyd.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyd.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd.asm')
diff --git a/third_party/gmp/mpn/x86_64/bt2/copyi.asm b/third_party/gmp/mpn/x86_64/bt2/copyi.asm
new file mode 100644
index 0000000..148d0e5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt2/copyi.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyi.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi.asm')
diff --git a/third_party/gmp/mpn/x86_64/bt2/gcd_11.asm b/third_party/gmp/mpn/x86_64/bt2/gcd_11.asm
new file mode 100644
index 0000000..0ffb6ca
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt2/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_11.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/bd2/gcd_11.asm')
diff --git a/third_party/gmp/mpn/x86_64/bt2/gcd_22.asm b/third_party/gmp/mpn/x86_64/bt2/gcd_22.asm
new file mode 100644
index 0000000..d693628
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt2/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/bd2/gcd_22.asm')
diff --git a/third_party/gmp/mpn/x86_64/bt2/gmp-mparam.h b/third_party/gmp/mpn/x86_64/bt2/gmp-mparam.h
new file mode 100644
index 0000000..3e26726
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/bt2/gmp-mparam.h
@@ -0,0 +1,240 @@
+/* AMD Jaguar gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 2050 MHz AMD Jaguar/Kabini */
+/* FFT tuning limit = 225,381,546 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        65
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     10
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              4
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           15
+
+#define DIV_1_VS_MUL_1_PERCENT             267
+
+#define MUL_TOOM22_THRESHOLD                25
+#define MUL_TOOM33_THRESHOLD                32
+#define MUL_TOOM44_THRESHOLD                93
+#define MUL_TOOM6H_THRESHOLD               366
+#define MUL_TOOM8H_THRESHOLD               537
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      63
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     172
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      63
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      67
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      91
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 20
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                220
+#define SQR_TOOM6_THRESHOLD                318
+#define SQR_TOOM8_THRESHOLD                434
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             348  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    348, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     23, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     55,11}, \
+    {     31,10}, {     63, 6}, {   1087, 8}, {    303, 9}, \
+    {    159,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    167,11}, {     95,10}, {    191, 9}, \
+    {    383,10}, {    207, 9}, {    415,11}, {    111,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271,11}, {    143,10}, {    287, 9}, {    575,10}, \
+    {    303,11}, {    159,10}, {    319,12}, {     95,11}, \
+    {    191,10}, {    383,11}, {    207,10}, {    415,11}, \
+    {    223,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,12}, {    191,11}, {    415,12}, \
+    {    223,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    607,12}, {    319,11}, \
+    {    639,12}, {    351,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,12}, {    479,14}, \
+    {    127,13}, {    255,12}, {    543,11}, {   1087,12}, \
+    {    607,13}, {    319,12}, {    703,13}, {    383,12}, \
+    {    831,13}, {    447,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1023,13}, {    575,12}, {   1151,13}, \
+    {    639,12}, {   1279,13}, {    703,14}, {    383,13}, \
+    {    831,12}, {   1663,13}, {    895,15}, {    255,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1151,14}, \
+    {    639,13}, {   1343,12}, {   2687,14}, {    767,13}, \
+    {   1663,14}, {    895,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,14}, {   1279,13}, \
+    {   2687,15}, {    767,14}, {   1663,13}, {   3327,16}, \
+    {    511,15}, {   1023,14}, {   2175,13}, {   4351,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,15}, {   1535,14}, {   3455,13}, {   6911,15}, \
+    {   1791,14}, {   3839,13}, {   7679,16}, {   1023,15}, \
+    {   2047,14}, {   4223,13}, {   8447,14}, {   4479,15}, \
+    {   2303,14}, {   4863,15}, {   2559,14}, {   5247,15}, \
+    {   2815,14}, {   5887,16}, {   1535,15}, {   3071,14}, \
+    {   6271,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,17}, {   1023,16}, {   2047,15}, {   4095,14}, \
+    {   8447,15}, {   4351,14}, {   8959,15}, {   4863,16}, \
+    {   2559,15}, {   5887,14}, {  11775,16}, {   3071,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 201
+#define MUL_FFT_THRESHOLD                 3200
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     63,10}, {     39, 9}, {     79,10}, {     47,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95, 6}, \
+    {   1663, 7}, {    895, 9}, {    239, 8}, {    479,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    135, 9}, \
+    {    271,11}, {     79, 9}, {    319,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207,11}, {    111,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,10}, {    287, 9}, {    575,10}, \
+    {    303, 9}, {    607,10}, {    319, 9}, {    639,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,10}, {    607,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,11}, {    479,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    607,12}, {    319,11}, \
+    {    639,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1023,13}, {    575,12}, {   1151,13}, {    703,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    895,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1151,14}, {    639,13}, {   1343,12}, {   2687,13}, \
+    {   1407,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1663,14}, {    895,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,14}, {   1279,13}, \
+    {   2687,14}, {   1407,15}, {    767,14}, {   1535,13}, \
+    {   3199,14}, {   1663,13}, {   3455,16}, {    511,15}, \
+    {   1023,14}, {   2175,13}, {   4479,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4479,15}, {   2303,14}, {   4991,15}, {   2815,14}, \
+    {   5887,16}, {   1535,15}, {   3071,14}, {   6143,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4095,14}, {   8191,15}, \
+    {   4351,14}, {   8959,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 208
+#define SQR_FFT_THRESHOLD                 2880
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  63
+#define MULLO_MUL_N_THRESHOLD             6253
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                  54
+#define SQRLO_SQR_THRESHOLD               5558
+
+#define DC_DIV_QR_THRESHOLD                 72
+#define DC_DIVAPPR_Q_THRESHOLD             195
+#define DC_BDIV_QR_THRESHOLD                50
+#define DC_BDIV_Q_THRESHOLD                 90
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               195
+#define INV_APPR_THRESHOLD                 197
+
+#define BINV_NEWTON_THRESHOLD              230
+#define REDC_1_TO_REDC_2_THRESHOLD          67
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1334
+#define MU_DIVAPPR_Q_THRESHOLD            1334
+#define MUPI_DIV_QR_THRESHOLD              104
+#define MU_BDIV_QR_THRESHOLD              1017
+#define MU_BDIV_Q_THRESHOLD               1187
+
+#define POWM_SEC_TABLE  1,16,194,712,779,2387
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        29
+#define SET_STR_DC_THRESHOLD               216
+#define SET_STR_PRECOMPUTE_THRESHOLD       994
+
+#define FAC_DSC_THRESHOLD                  153
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    1  /* 9.38% faster than 3 */
+#define HGCD_THRESHOLD                      77
+#define HGCD_APPR_THRESHOLD                 50
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   440
+#define GCDEXT_DC_THRESHOLD                273
+#define JACOBI_BASE_METHOD                   1  /* 7.74% faster than 4 */
+
+/* Tuneup completed successfully, took 495910 seconds */
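All of the *_THRESHOLD values above are operand sizes in 64-bit limbs at which GMP's generic code switches algorithms, measured by tuneup on one 2050 MHz Jaguar box — a per-microarchitecture calibration, not universal constants. Schematically, a threshold is consumed like this (hypothetical helper names; GMP's real dispatch lives in mpn/generic/):

```c
#include <stdint.h>
#include <stddef.h>

#define MUL_TOOM22_THRESHOLD 25   /* value from the header above */

/* Stand-ins for GMP's internal routines -- hypothetical prototypes,
   shown only so the dispatch shape compiles. */
void mul_basecase(uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n);
void mul_toom22(uint64_t *rp, const uint64_t *up, const uint64_t *vp, size_t n);

/* Below the threshold the O(n^2) schoolbook loop wins on this chip;
   from 25 limbs up, Karatsuba's ~O(n^1.585) recursion pays off. */
static void mul_dispatch_sketch(uint64_t *rp, const uint64_t *up,
                                const uint64_t *vp, size_t n)
{
    if (n < MUL_TOOM22_THRESHOLD)
        mul_basecase(rp, up, vp, n);
    else
        mul_toom22(rp, up, vp, n);
}
```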
diff --git a/third_party/gmp/mpn/x86_64/cnd_aors_n.asm b/third_party/gmp/mpn/x86_64/cnd_aors_n.asm
new file mode 100644
index 0000000..13a2ab3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/cnd_aors_n.asm
@@ -0,0 +1,183 @@
+dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2
+C AMD K10	 2
+C AMD bd1	 2.32
+C AMD bobcat	 3
+C Intel P4	13
+C Intel core2	 2.9
+C Intel NHM	 2.8
+C Intel SBR	 2.4
+C Intel atom	 5.33
+C VIA nano	 3
+
+C NOTES
+C  * It might seem natural to use the cmov insn here, but since this function
+C    is supposed to have the exact same execution pattern for cnd true and
+C    false, and since cmov's documentation is not clear about whether it
+C    actually reads both source operands and writes the register for a false
+C    condition, we cannot use it.
+C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
+C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could use
+C    ADCSBB-to-memory, again saving 1 insn/limb.
+C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
+C    for any other processor.
+
+C INPUT PARAMETERS
+define(`cnd',	`%rdi')	dnl rcx
+define(`rp',	`%rsi')	dnl rdx
+define(`up',	`%rdx')	dnl r8
+define(`vp',	`%rcx')	dnl r9
+define(`n',	`%r8')	dnl rsp+40
+
+ifdef(`OPERATION_cnd_add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), R32(%r8)')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	neg	cnd
+	sbb	cnd, cnd		C make cnd mask
+
+	lea	(vp,n,8), vp
+	lea	(up,n,8), up
+	lea	(rp,n,8), rp
+
+	mov	R32(n), R32(%rax)
+	neg	n
+	and	$3, R32(%rax)
+	jz	L(top)			C carry-save reg rax = 0 in this arc
+	cmp	$2, R32(%rax)
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	mov	(vp,n,8), %r12
+	mov	8(vp,n,8), %r13
+	mov	16(vp,n,8), %r14
+	and	cnd, %r12
+	mov	(up,n,8), %r10
+	and	cnd, %r13
+	mov	8(up,n,8), %rbx
+	and	cnd, %r14
+	mov	16(up,n,8), %rbp
+	ADDSUB	%r12, %r10
+	mov	%r10, (rp,n,8)
+	ADCSBB	%r13, %rbx
+	mov	%rbx, 8(rp,n,8)
+	ADCSBB	%r14, %rbp
+	mov	%rbp, 16(rp,n,8)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	add	$3, n
+	js	L(top)
+	jmp	L(end)
+
+L(b2):	mov	(vp,n,8), %r12
+	mov	8(vp,n,8), %r13
+	mov	(up,n,8), %r10
+	and	cnd, %r12
+	mov	8(up,n,8), %rbx
+	and	cnd, %r13
+	ADDSUB	%r12, %r10
+	mov	%r10, (rp,n,8)
+	ADCSBB	%r13, %rbx
+	mov	%rbx, 8(rp,n,8)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	add	$2, n
+	js	L(top)
+	jmp	L(end)
+
+L(b1):	mov	(vp,n,8), %r12
+	mov	(up,n,8), %r10
+	and	cnd, %r12
+	ADDSUB	%r12, %r10
+	mov	%r10, (rp,n,8)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	add	$1, n
+	jns	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp,n,8), %r12
+	mov	8(vp,n,8), %r13
+	mov	16(vp,n,8), %r14
+	mov	24(vp,n,8), %r11
+	and	cnd, %r12
+	mov	(up,n,8), %r10
+	and	cnd, %r13
+	mov	8(up,n,8), %rbx
+	and	cnd, %r14
+	mov	16(up,n,8), %rbp
+	and	cnd, %r11
+	mov	24(up,n,8), %r9
+	add	R32(%rax), R32(%rax)	C restore carry
+	ADCSBB	%r12, %r10
+	mov	%r10, (rp,n,8)
+	ADCSBB	%r13, %rbx
+	mov	%rbx, 8(rp,n,8)
+	ADCSBB	%r14, %rbp
+	mov	%rbp, 16(rp,n,8)
+	ADCSBB	%r11, %r9
+	mov	%r9, 24(rp,n,8)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	add	$4, n
+	js	L(top)
+
+L(end):	neg	R32(%rax)
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
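The NOTES block above explains why cmov is avoided; the actual mechanism is the `neg cnd; sbb cnd, cnd` pair, which turns any nonzero cnd into an all-ones mask and zero into zero, after which every vp limb is ANDed with the mask — the executed instruction stream is therefore identical for both condition values. A branch-free C sketch of the same idea (reference semantics, not GMP's code; a compiler is likely, though not guaranteed, to keep it branchless):

```c
#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] + (cnd ? vp[] : 0) over n limbs, returning the carry.
   The mask makes both condition values do the same arithmetic, which
   is the point for side-channel-sensitive callers. */
static uint64_t cnd_add_sketch(uint64_t cnd, uint64_t *rp,
                               const uint64_t *up, const uint64_t *vp,
                               size_t n)
{
    uint64_t mask = -(uint64_t)(cnd != 0);   /* cf. neg; sbb cnd,cnd */
    unsigned __int128 acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc += (unsigned __int128)up[i] + (vp[i] & mask);
        rp[i] = (uint64_t)acc;
        acc >>= 64;
    }
    return (uint64_t)acc;
}
```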
diff --git a/third_party/gmp/mpn/x86_64/com.asm b/third_party/gmp/mpn/x86_64/com.asm
new file mode 100644
index 0000000..006acaf
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/com.asm
@@ -0,0 +1,95 @@
+dnl  AMD64 mpn_com.
+
+dnl  Copyright 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	    cycles/limb
+C AMD K8,K9	 1.25
+C AMD K10	 1.25
+C Intel P4	 2.78
+C Intel core2	 1.1
+C Intel corei	 1.5
+C Intel atom	 ?
+C VIA nano	 2
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_com)
+	FUNC_ENTRY(3)
+	movq	(up), %r8
+	movl	R32(%rdx), R32(%rax)
+	leaq	(up,n,8), up
+	leaq	(rp,n,8), rp
+	negq	n
+	andl	$3, R32(%rax)
+	je	L(b00)
+	cmpl	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	notq	%r8
+	movq	%r8, (rp,n,8)
+	decq	n
+	jmp	L(e11)
+L(b10):	addq	$-2, n
+	jmp	L(e10)
+	.byte	0x90,0x90,0x90,0x90,0x90,0x90
+L(b01):	notq	%r8
+	movq	%r8, (rp,n,8)
+	incq	n
+	jz	L(ret)
+
+L(oop):	movq	(up,n,8), %r8
+L(b00):	movq	8(up,n,8), %r9
+	notq	%r8
+	notq	%r9
+	movq	%r8, (rp,n,8)
+	movq	%r9, 8(rp,n,8)
+L(e11):	movq	16(up,n,8), %r8
+L(e10):	movq	24(up,n,8), %r9
+	notq	%r8
+	notq	%r9
+	movq	%r8, 16(rp,n,8)
+	movq	%r9, 24(rp,n,8)
+	addq	$4, n
+	jnc	L(oop)
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
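Two idioms in the file above are worth naming: the raw `.byte 0x90,...` run is hand-placed nop padding to align the following label, and the b00/b01/b10/b11 block is a computed entry into a 4x-unrolled loop — n mod 4 limbs are handled up front, then control falls into the rotated loop body. The usual C analogue of that entry trick is switch-based peeling, sketched below (hypothetical name):

```c
#include <stdint.h>
#include <stddef.h>

/* 4x-unrolled complement with remainder peeling, mirroring the asm's
   entry-by-(n mod 4) structure in portable form. */
static void com_unrolled_sketch(uint64_t *rp, const uint64_t *up, size_t n)
{
    size_t i = 0;
    switch (n & 3) {                 /* peel n mod 4 limbs */
    case 3: rp[i] = ~up[i]; i++;     /* fall through */
    case 2: rp[i] = ~up[i]; i++;     /* fall through */
    case 1: rp[i] = ~up[i]; i++;     /* fall through */
    case 0: break;
    }
    for (; i < n; i += 4) {          /* main 4x-unrolled body */
        rp[i + 0] = ~up[i + 0];
        rp[i + 1] = ~up[i + 1];
        rp[i + 2] = ~up[i + 2];
        rp[i + 3] = ~up[i + 3];
    }
}
```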
diff --git a/third_party/gmp/mpn/x86_64/copyd.asm b/third_party/gmp/mpn/x86_64/copyd.asm
new file mode 100644
index 0000000..a5e6e59
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/copyd.asm
@@ -0,0 +1,93 @@
+dnl  AMD64 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 1
+C AMD K10	 1
+C AMD bd1	 1.36
+C AMD bobcat	 1.71
+C Intel P4	 2-3
+C Intel core2	 1
+C Intel NHM	 1
+C Intel SBR	 1
+C Intel atom	 2
+C VIA nano	 2
+
+
+IFSTD(`define(`rp',`%rdi')')
+IFSTD(`define(`up',`%rsi')')
+IFSTD(`define(`n', `%rdx')')
+
+IFDOS(`define(`rp',`%rcx')')
+IFDOS(`define(`up',`%rdx')')
+IFDOS(`define(`n', `%r8')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_copyd)
+	lea	-8(up,n,8), up
+	lea	(rp,n,8), rp
+	sub	$4, n
+	jc	L(end)
+	nop
+
+L(top):	mov	(up), %rax
+	mov	-8(up), %r9
+	lea	-32(rp), rp
+	mov	-16(up), %r10
+	mov	-24(up), %r11
+	lea	-32(up), up
+	mov	%rax, 24(rp)
+	mov	%r9, 16(rp)
+	sub	$4, n
+	mov	%r10, 8(rp)
+	mov	%r11, (rp)
+	jnc	L(top)
+
+L(end):	shr	R32(n)
+	jnc	1f
+	mov	(up), %rax
+	mov	%rax, -8(rp)
+	lea	-8(rp), rp
+	lea	-8(up), up
+1:	shr	R32(n)
+	jnc	1f
+	mov	(up), %rax
+	mov	-8(up), %r9
+	mov	%rax, -8(rp)
+	mov	%r9, -16(rp)
+1:	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/copyi.asm b/third_party/gmp/mpn/x86_64/copyi.asm
new file mode 100644
index 0000000..bafce7a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/copyi.asm
@@ -0,0 +1,92 @@
+dnl  AMD64 mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 1
+C AMD K10	 1
+C AMD bd1	 1.36
+C AMD bobcat	 1.71
+C Intel P4	 2-3
+C Intel core2	 1
+C Intel NHM	 1
+C Intel SBR	 1
+C Intel atom	 2
+C VIA nano	 2
+
+
+IFSTD(`define(`rp',`%rdi')')
+IFSTD(`define(`up',`%rsi')')
+IFSTD(`define(`n', `%rdx')')
+
+IFDOS(`define(`rp',`%rcx')')
+IFDOS(`define(`up',`%rdx')')
+IFDOS(`define(`n', `%r8')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+	.byte	0,0,0,0,0,0
+PROLOGUE(mpn_copyi)
+	lea	-8(rp), rp
+	sub	$4, n
+	jc	L(end)
+
+L(top):	mov	(up), %rax
+	mov	8(up), %r9
+	lea	32(rp), rp
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	mov	%rax, -24(rp)
+	mov	%r9, -16(rp)
+	sub	$4, n
+	mov	%r10, -8(rp)
+	mov	%r11, (rp)
+	jnc	L(top)
+
+L(end):	shr	R32(n)
+	jnc	1f
+	mov	(up), %rax
+	mov	%rax, 8(rp)
+	lea	8(rp), rp
+	lea	8(up), up
+1:	shr	R32(n)
+	jnc	1f
+	mov	(up), %rax
+	mov	8(up), %r9
+	mov	%rax, 8(rp)
+	mov	%r9, 16(rp)
+1:	ret
+EPILOGUE()
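copyi and copyd implement the same limb copy in opposite directions, and the direction is the point: as with memmove, a caller with overlapping operands picks the variant that reads each limb before it is overwritten. Plain-C semantics (sketch names hypothetical):

```c
#include <stdint.h>
#include <stddef.h>

/* Incrementing copy: safe for overlap when rp <= up. */
static void copyi_sketch(uint64_t *rp, const uint64_t *up, size_t n)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = up[i];
}

/* Decrementing copy: safe for overlap when rp >= up. */
static void copyd_sketch(uint64_t *rp, const uint64_t *up, size_t n)
{
    for (size_t i = n; i-- > 0; )
        rp[i] = up[i];
}
```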
diff --git a/third_party/gmp/mpn/x86_64/core2/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
new file mode 100644
index 0000000..7066bb4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/aorrlsh1_n.asm
@@ -0,0 +1,53 @@
+dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func,	mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func,	mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
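These core2 wrappers only pin down LSH/RSH (with RSH = 64 - LSH) and the add/sub flavour before including the shared aorrlshC_n.asm body; the shifted operand stream is formed by combining vp[i] << LSH with the bits shifted out of vp[i-1]. A C sketch of the LSH = 1 add case — reference semantics only, not the included asm:

```c
#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] + (vp[] << 1) over n limbs (addlsh1_n semantics).
   The shifted stream is vp[i]<<1 | vp[i-1]>>63, i.e. the LSH/RSH
   pair the wrapper defines.  Returns the carry (0..2): the add's
   carry-out plus the bit shifted off the top of vp. */
static uint64_t addlsh1_sketch(uint64_t *rp, const uint64_t *up,
                               const uint64_t *vp, size_t n)
{
    uint64_t prev = 0;
    unsigned __int128 acc = 0;
    for (size_t i = 0; i < n; i++) {
        uint64_t sh = (vp[i] << 1) | (prev >> 63);
        acc += (unsigned __int128)up[i] + sh;
        rp[i] = (uint64_t)acc;
        acc >>= 64;
        prev = vp[i];
    }
    return (uint64_t)acc + (prev >> 63);
}
```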
diff --git a/third_party/gmp/mpn/x86_64/core2/aorrlsh2_n.asm b/third_party/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
new file mode 100644
index 0000000..5065120
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/aorrlsh2_n.asm
@@ -0,0 +1,53 @@
+dnl  AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func,	mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func,	mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/core2/aorrlsh_n.asm
new file mode 100644
index 0000000..57abf31
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/coreinhm/aorrlsh_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/aors_err1_n.asm b/third_party/gmp/mpn/x86_64/core2/aors_err1_n.asm
new file mode 100644
index 0000000..3f875ae
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/aors_err1_n.asm
@@ -0,0 +1,225 @@
+dnl  Core 2 mpn_add_err1_n, mpn_sub_err1_n
+
+dnl  Contributed by David Harvey.
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 4.14
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`ep',	`%rcx')
+define(`yp',	`%r8')
+define(`n',	`%r9')
+define(`cy_param',	`8(%rsp)')
+
+define(`el',	`%rbx')
+define(`eh',	`%rbp')
+define(`t0',	`%r10')
+define(`t1',	`%r11')
+define(`t2',	`%r12')
+define(`t3',	`%r13')
+define(`w0',	`%r14')
+define(`w1',	`%r15')
+
+ifdef(`OPERATION_add_err1_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_err1_n)')
+ifdef(`OPERATION_sub_err1_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_err1_n)')
+
+MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	mov	cy_param, %rax
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	lea	(rp,n,8), rp
+
+	mov	R32(n), R32(%r10)
+	and	$3, R32(%r10)
+	jz	L(0mod4)
+	cmp	$2, R32(%r10)
+	jc	L(1mod4)
+	jz	L(2mod4)
+L(3mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	xor	R32(t0), R32(t0)
+	xor	R32(t1), R32(t1)
+	lea	-24(yp,n,8), yp
+	neg	n
+
+	shr	$1, %al		   C restore carry
+	mov	(up,n,8), w0
+	mov	8(up,n,8), w1
+	ADCSBB	(vp,n,8), w0
+	mov	w0, (rp,n,8)
+	cmovc	16(yp), el
+	ADCSBB	8(vp,n,8), w1
+	mov	w1, 8(rp,n,8)
+	cmovc	8(yp), t0
+	mov	16(up,n,8), w0
+	ADCSBB	16(vp,n,8), w0
+	mov	w0, 16(rp,n,8)
+	cmovc	(yp), t1
+	setc	%al		   C save carry
+	add	t0, el
+	adc	$0, eh
+	add	t1, el
+	adc	$0, eh
+
+	add	$3, n
+	jnz	L(loop)
+	jmp	L(end)
+
+	ALIGN(16)
+L(0mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	lea	(yp,n,8), yp
+	neg	n
+	jmp	L(loop)
+
+	ALIGN(16)
+L(1mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	lea	-8(yp,n,8), yp
+	neg	n
+
+	shr	$1, %al		   C restore carry
+	mov	(up,n,8), w0
+	ADCSBB	(vp,n,8), w0
+	mov	w0, (rp,n,8)
+	cmovc	(yp), el
+	setc	%al		   C save carry
+
+	add	$1, n
+	jnz	L(loop)
+	jmp	L(end)
+
+	ALIGN(16)
+L(2mod4):
+	xor	R32(el), R32(el)
+	xor	R32(eh), R32(eh)
+	xor	R32(t0), R32(t0)
+	lea	-16(yp,n,8), yp
+	neg	n
+
+	shr	$1, %al		   C restore carry
+	mov	(up,n,8), w0
+	mov	8(up,n,8), w1
+	ADCSBB	(vp,n,8), w0
+	mov	w0, (rp,n,8)
+	cmovc	8(yp), el
+	ADCSBB	8(vp,n,8), w1
+	mov	w1, 8(rp,n,8)
+	cmovc	(yp), t0
+	setc	%al		   C save carry
+	add	t0, el
+	adc	$0, eh
+
+	add	$2, n
+	jnz	L(loop)
+	jmp	L(end)
+
+	ALIGN(32)
+L(loop):
+	mov	(up,n,8), w0
+	shr	$1, %al		   C restore carry
+	mov	-8(yp), t0
+	mov	$0, R32(t3)
+	ADCSBB	(vp,n,8), w0
+	cmovnc	t3, t0
+	mov	w0, (rp,n,8)
+	mov	8(up,n,8), w1
+	mov	16(up,n,8), w0
+	ADCSBB	8(vp,n,8), w1
+	mov	-16(yp), t1
+	cmovnc	t3, t1
+	mov	-24(yp), t2
+	mov	w1, 8(rp,n,8)
+	ADCSBB	16(vp,n,8), w0
+	cmovnc	t3, t2
+	mov	24(up,n,8), w1
+	ADCSBB	24(vp,n,8), w1
+	cmovc	-32(yp), t3
+	setc	%al		   C save carry
+	add	t0, el
+	adc	$0, eh
+	add	t1, el
+	adc	$0, eh
+	add	t2, el
+	adc	$0, eh
+	lea	-32(yp), yp
+	mov	w0, 16(rp,n,8)
+	add	t3, el
+	adc	$0, eh
+	add	$4, n
+	mov	w1, -8(rp,n,8)
+	jnz	L(loop)
+
+L(end):
+	mov	el, (ep)
+	mov	eh, 8(ep)
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
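Reading the cmovc chain above: alongside rp = up + vp, the code accumulates a two-limb error sum — whenever limb i produces a carry, the yp limb at the mirrored position n-1-i is added into (el, eh), and the pair is stored through ep at the end. A C sketch of those semantics as read off the asm (add flavour only; not authoritative — GMP's own documentation of mpn_add_err1_n is the reference, and the asm is branch-free where this sketch uses an if):

```c
#include <stdint.h>
#include <stddef.h>

/* rp = up + vp + cy over n limbs; ep[0..1] receives the two-limb sum
   e = SUM carry_i * yp[n-1-i], where carry_i is the carry out of
   limb i.  Returns the final carry. */
static uint64_t add_err1_sketch(uint64_t *rp, const uint64_t *up,
                                const uint64_t *vp, uint64_t *ep,
                                const uint64_t *yp, size_t n, uint64_t cy)
{
    uint64_t el = 0, eh = 0;
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)up[i] + vp[i] + cy;
        rp[i] = (uint64_t)t;
        cy = (uint64_t)(t >> 64);         /* carry out of limb i */
        if (cy) {                          /* the asm uses cmovc here */
            uint64_t old = el;
            el += yp[n - 1 - i];
            eh += (el < old);              /* ripple into the high limb */
        }
    }
    ep[0] = el;
    ep[1] = eh;
    return cy;
}
```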
diff --git a/third_party/gmp/mpn/x86_64/core2/aors_n.asm b/third_party/gmp/mpn/x86_64/core2/aors_n.asm
new file mode 100644
index 0000000..f9e0039
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/aors_n.asm
@@ -0,0 +1,150 @@
+dnl  Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
+
+dnl  Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C AMD K8,K9	 2
+C AMD K10	 1.93\2
+C AMD bull	 1.62\2.1
+C AMD pile	 1.6\1.7
+C AMD steam
+C AMD excavator
+C AMD bobcat	 2.79
+C AMD jaguar	 2.54
+C Intel P4	10
+C Intel core2	 2
+C Intel NHM	 2
+C Intel SBR	 2
+C Intel IBR	 1.95
+C Intel HWL	 1.72
+C Intel BWL	 1.54
+C Intel SKL	 1.52
+C Intel atom	 9
+C Intel SLM	 6.5
+C VIA nano	 3
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cy',	`%r8')
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	xor	%r8, %r8
+L(start):
+	mov	(up), %r10
+	mov	(vp), %r11
+
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	lea	(rp,n,8), rp
+	mov	R32(n), R32(%rax)
+	neg	n
+	and	$3, R32(%rax)
+	je	L(b00)
+	add	%rax, n			C clear low rcx bits for jrcxz
+	cmp	$2, R32(%rax)
+	jl	L(b01)
+	je	L(b10)
+
+L(b11):	neg	%r8			C set cy
+	jmp	L(e11)
+
+L(b00):	neg	%r8			C set cy
+	mov	%r10, %r8
+	mov	%r11, %r9
+	lea	4(n), n
+	jmp	L(e00)
+
+	nop
+	nop
+	nop
+L(b01):	neg	%r8			C set cy
+	jmp	L(top)
+
+L(b10):	neg	%r8			C set cy
+	mov	%r10, %r8
+	mov	%r11, %r9
+	jmp	L(e10)
+
+L(end):	ADCSBB	%r11, %r10
+	mov	%r10, -8(rp)
+	mov	R32(%rcx), R32(%rax)	C clear eax, ecx contains 0
+	adc	R32(%rax), R32(%rax)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(top):	jrcxz	L(end)
+	mov	(up,n,8), %r8
+	mov	(vp,n,8), %r9
+	lea	4(n), n
+	ADCSBB	%r11, %r10
+	mov	%r10, -40(rp,n,8)
+L(e00):	mov	-24(up,n,8), %r10
+	mov	-24(vp,n,8), %r11
+	ADCSBB	%r9, %r8
+	mov	%r8, -32(rp,n,8)
+L(e11):	mov	-16(up,n,8), %r8
+	mov	-16(vp,n,8), %r9
+	ADCSBB	%r11, %r10
+	mov	%r10, -24(rp,n,8)
+L(e10):	mov	-8(up,n,8), %r10
+	mov	-8(vp,n,8), %r11
+	ADCSBB	%r9, %r8
+	mov	%r8, -16(rp,n,8)
+	jmp	L(top)
+EPILOGUE()
+
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	jmp	L(start)
+EPILOGUE()
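The loop above keeps one hardware carry chain alive across iterations: lea updates the index without touching flags, and jrcxz tests rcx without disturbing carry, so the adc chain never has to save and restore carry the way the bobcat code earlier in this patch does with sbb/add on %rax. Note also that mpn_add_n is just mpn_add_nc with a zero incoming carry — the asm zeroes %r8 and shares L(start). The same relationship in C (sketch only, hypothetical names):

```c
#include <stdint.h>
#include <stddef.h>

/* mpn_add_nc semantics: rp = up + vp + cy over n limbs, returning
   the carry out.  Assumes unsigned __int128. */
static uint64_t add_nc_sketch(uint64_t *rp, const uint64_t *up,
                              const uint64_t *vp, size_t n, uint64_t cy)
{
    unsigned __int128 acc = cy;            /* incoming carry, cf. %r8 */
    for (size_t i = 0; i < n; i++) {
        acc += (unsigned __int128)up[i] + vp[i];
        rp[i] = (uint64_t)acc;
        acc >>= 64;                        /* the carry the adc chain keeps */
    }
    return (uint64_t)acc;
}

/* mpn_add_n is the cy = 0 case, mirroring the asm's shared L(start). */
static uint64_t add_n_sketch(uint64_t *rp, const uint64_t *up,
                             const uint64_t *vp, size_t n)
{
    return add_nc_sketch(rp, up, vp, n, 0);
}
```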
diff --git a/third_party/gmp/mpn/x86_64/core2/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/core2/aorsmul_1.asm
new file mode 100644
index 0000000..a7a5d6e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/aorsmul_1.asm
@@ -0,0 +1,188 @@
+dnl  x86-64 mpn_addmul_1 and mpn_submul_1, optimized for "Core 2".
+
+dnl  Copyright 2003-2005, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      4.52
+C AMD K10        4.01
+C AMD bull       4.98
+C AMD pile       4.83
+C AMD steam
+C AMD excavator
+C AMD bobcat     5.56
+C AMD jaguar     5.54
+C Intel P4      16.3    17.3
+C Intel core2    4.32    4.61
+C Intel NHM      5.08
+C Intel SBR      4.04
+C Intel IBR      3.95
+C Intel HWL      3.66
+C Intel BWL      2.87
+C Intel SKL      2.79
+C Intel atom    20.6
+C Intel SLM      7.6
+C VIA nano       5.25
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`v0',	`%rcx')
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',     `mpn_addmul_1')
+      define(`func_1c',  `mpn_addmul_1c')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',     `mpn_submul_1')
+      define(`func_1c',  `mpn_submul_1c')
+')
+
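+C  What addmul_1 computes, as a C sketch (illustrative only; assumes a
+C  compiler with unsigned __int128 and 64-bit limbs; `ref_addmul_1' is a
+C  made-up name):
+C
+C    mp_limb_t ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up,
+C                            mp_size_t n, mp_limb_t v0)
+C    {
+C      mp_limb_t cy = 0;
+C      for (mp_size_t i = 0; i < n; i++)
+C        {
+C          unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
+C          rp[i] = (mp_limb_t) t;
+C          cy = (mp_limb_t) (t >> 64);
+C        }
+C      return cy;                        /* high limb, not stored in rp */
+C    }
+C
+C  submul_1 instead subtracts the product from {rp,n} and returns the borrow.
+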
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+	C For DOS, on the stack we have four saved registers, return address,
+	C space for four register arguments, and finally the carry input.
+
+IFDOS(` define(`carry_in', `72(%rsp)')') dnl
+IFSTD(` define(`carry_in', `%r8')') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_1c)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	lea	(%rdx), %rbx
+	neg	%rbx
+
+	mov	(up), %rax
+	mov	(rp), %r10
+
+	lea	-16(rp,%rdx,8), rp
+	lea	(up,%rdx,8), up
+	mul	%rcx
+	add	carry_in, %rax
+	adc	$0, %rdx
+	jmp	L(start_nc)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	lea	(%rdx), %rbx
+	neg	%rbx
+
+	mov	(up), %rax
+	mov	(rp), %r10
+
+	lea	-16(rp,%rdx,8), rp
+	lea	(up,%rdx,8), up
+	mul	%rcx
+
+L(start_nc):
+	test	$1, R8(%rbx)
+	jnz	L(odd)
+
+	lea	(%rax), %r11
+	mov	8(up,%rbx,8), %rax
+	lea	(%rdx), %rbp
+	mul	%rcx
+	add	$2, %rbx
+	jz	L(n2)
+
+	lea	(%rax), %r8
+	mov	(up,%rbx,8), %rax
+	lea	(%rdx), %r9
+	jmp	L(mid)
+
+	ALIGN(8)
+L(odd):	inc	%rbx
+	jz	L(n1)
+
+	lea	(%rax), %r8
+	mov	(up,%rbx,8), %rax
+	lea	(%rdx), %r9
+	mul	%rcx
+	lea	(%rax), %r11
+	mov	8(up,%rbx,8), %rax
+	lea	(%rdx), %rbp
+	jmp	L(e)
+
+	ALIGN(16)
+L(top):	mul	%rcx
+	ADDSUB	%r8, %r10
+	lea	(%rax), %r8
+	mov	(up,%rbx,8), %rax
+	adc	%r9, %r11
+	mov	%r10, -8(rp,%rbx,8)
+	mov	(rp,%rbx,8), %r10
+	lea	(%rdx), %r9
+	adc	$0, %rbp
+L(mid):	mul	%rcx
+	ADDSUB	%r11, %r10
+	lea	(%rax), %r11
+	mov	8(up,%rbx,8), %rax
+	adc	%rbp, %r8
+	mov	%r10, (rp,%rbx,8)
+	mov	8(rp,%rbx,8), %r10
+	lea	(%rdx), %rbp
+	adc	$0, %r9
+L(e):	add	$2, %rbx
+	js	L(top)
+
+	mul	%rcx
+	ADDSUB	%r8, %r10
+	adc	%r9, %r11
+	mov	%r10, -8(rp)
+	adc	%rbx, %rbp		C rbx = 0
+L(n2):	mov	(rp), %r10
+	ADDSUB	%r11, %r10
+	adc	%rbp, %rax
+	mov	%r10, (rp)
+	adc	%rbx, %rdx		C rbx = 0
+L(n1):	mov	8(rp), %r10
+	ADDSUB	%rax, %r10
+	mov	%r10, 8(rp)
+	mov	R32(%rbx), R32(%rax)	C rbx = 0
+	adc	%rdx, %rax
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/com.asm b/third_party/gmp/mpn/x86_64/core2/com.asm
new file mode 100644
index 0000000..d7d9f79
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/com.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_com.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/copyd.asm b/third_party/gmp/mpn/x86_64/core2/copyd.asm
new file mode 100644
index 0000000..57ea0e5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/copyd.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyd.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/copyi.asm b/third_party/gmp/mpn/x86_64/core2/copyi.asm
new file mode 100644
index 0000000..f0c7607
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/copyi.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyi.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/divrem_1.asm b/third_party/gmp/mpn/x86_64/core2/divrem_1.asm
new file mode 100644
index 0000000..1b3f139
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/divrem_1.asm
@@ -0,0 +1,243 @@
+dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 2004, 2005, 2007-2010, 2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		norm	unorm	frac
+C AMD K8,K9	15	15	12
+C AMD K10	15	15	12
+C Intel P4	44	44	43
+C Intel core2	24	24	19.5
+C Intel corei	19	19	18
+C Intel atom	51	51	36
+C VIA nano	46	44	22.5
+
+C mp_limb_t
+C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+C mp_limb_t
+C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
+C                      mp_limb_t dinv, int cnt)
+
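+C  Example call (illustrative; assumes <gmp.h>, d != 0, and fn = 0 so no
+C  low "fraction" quotient limbs are produced):
+C
+C    mp_limb_t np[3] = { 5, 7, 9 };     /* least significant limb first */
+C    mp_limb_t qp[3];
+C    mp_limb_t d = 10;
+C    mp_limb_t r = mpn_divrem_1 (qp, 0, np, 3, d);
+C    /* {qp,3} is now the quotient of {np,3} / d, and r the remainder */
+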
+C INPUT PARAMETERS
+define(`qp',		`%rdi')
+define(`fn_param',	`%rsi')
+define(`up_param',	`%rdx')
+define(`un_param',	`%rcx')
+define(`d',		`%r8')
+define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
+C       shift passed on stack		C only for mpn_preinv_divrem_1
+
+define(`cnt',		`%rcx')
+define(`up',		`%rsi')
+define(`fn',		`%r12')
+define(`un',		`%rbx')
+
+
+C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
+C         cnt         qp      d  dinv
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFSTD(`define(`CNTOFF',		`40($1)')')
+IFDOS(`define(`CNTOFF',		`104($1)')')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	xor	R32(%rax), R32(%rax)
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	fn_param, fn
+	mov	un_param, un
+	add	fn_param, un_param
+	mov	up_param, up
+
+	lea	-8(qp,un_param,8), qp
+
+	mov	CNTOFF(%rsp), R8(cnt)
+	shl	R8(cnt), d
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	xor	R32(%rax), R32(%rax)
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	fn_param, fn
+	mov	un_param, un
+	add	fn_param, un_param
+	mov	up_param, up
+	je	L(ret)
+
+	lea	-8(qp,un_param,8), qp
+	xor	R32(%rbp), R32(%rbp)
+
+L(unnormalized):
+	test	un, un
+	je	L(44)
+	mov	-8(up,un,8), %rax
+	cmp	d, %rax
+	jae	L(44)
+	mov	%rbp, (qp)
+	mov	%rax, %rbp
+	lea	-8(qp), qp
+	je	L(ret)
+	dec	un
+L(44):
+	bsr	d, %rcx
+	not	R32(%rcx)
+	sal	R8(%rcx), d
+	sal	R8(%rcx), %rbp
+
+	push	%rcx
+IFSTD(`	push	%rdi		')
+IFSTD(`	push	%rsi		')
+	push	%r8
+IFSTD(`	sub	$8, %rsp	')
+IFSTD(`	mov	d, %rdi		')
+IFDOS(`	sub	$40, %rsp	')
+IFDOS(`	mov	d, %rcx		')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+	pop	%r8
+IFSTD(`	pop	%rsi		')
+IFSTD(`	pop	%rdi		')
+	pop	%rcx
+
+	mov	%rax, dinv
+	mov	%rbp, %rax
+	test	un, un
+	je	L(frac)
+
+L(ent):	mov	-8(up,un,8), %rbp
+	shr	R8(%rcx), %rax
+	shld	R8(%rcx), %rbp, %rax
+	sub	$2, un
+	js	L(end)
+
+	ALIGN(16)
+L(top):	lea	1(%rax), %r11
+	mul	dinv
+	mov	(up,un,8), %r10
+	shld	R8(%rcx), %r10, %rbp
+	mov	%rbp, %r13
+	add	%rax, %r13
+	adc	%r11, %rdx
+	mov	%rdx, %r11
+	imul	d, %rdx
+	sub	%rdx, %rbp
+	lea	(d,%rbp), %rax
+	sub	$8, qp
+	cmp	%r13, %rbp
+	cmovc	%rbp, %rax
+	adc	$-1, %r11
+	cmp	d, %rax
+	jae	L(ufx)
+L(uok):	dec	un
+	mov	%r11, 8(qp)
+	mov	%r10, %rbp
+	jns	L(top)
+
+L(end):	lea	1(%rax), %r11
+	sal	R8(%rcx), %rbp
+	mul	dinv
+	add	%rbp, %rax
+	adc	%r11, %rdx
+	mov	%rax, %r11
+	mov	%rdx, %r13
+	imul	d, %rdx
+	sub	%rdx, %rbp
+	mov	d, %rax
+	add	%rbp, %rax
+	cmp	%r11, %rbp
+	cmovc	%rbp, %rax
+	adc	$-1, %r13
+	cmp	d, %rax
+	jae	L(efx)
+L(eok):	mov	%r13, (qp)
+	sub	$8, qp
+	jmp	L(frac)
+
+L(ufx):	sub	d, %rax
+	inc	%r11
+	jmp	L(uok)
+L(efx):	sub	d, %rax
+	inc	%r13
+	jmp	L(eok)
+
+L(frac):mov	d, %rbp
+	neg	%rbp
+	jmp	L(fent)
+
+	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
+L(ftop):mul	dinv			C	      0,12   0,17   0,17
+	add	%r11, %rdx		C	      5      8     10
+	mov	%rax, %r11		C	      4      8      3
+	mov	%rdx, %r13		C	      6      9     11
+	imul	%rbp, %rdx		C	      6      9     11
+	mov	d, %rax			C
+	add	%rdx, %rax		C	     10     14     14
+	cmp	%r11, %rdx		C	     10     14     14
+	cmovc	%rdx, %rax		C	     11     15     15
+	adc	$-1, %r13		C
+	mov	%r13, (qp)		C
+	sub	$8, qp			C
+L(fent):lea	1(%rax), %r11		C
+	dec	fn			C
+	jns	L(ftop)			C
+
+	shr	R8(%rcx), %rax
+L(ret):	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/gcd_11.asm b/third_party/gmp/mpn/x86_64/core2/gcd_11.asm
new file mode 100644
index 0000000..b00451f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/gcd_11.asm
@@ -0,0 +1,93 @@
+dnl  AMD64 mpn_gcd_11 optimised for Intel CNR, PNR, SBR, IBR.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit (approx)
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 ?
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4	 ?
+C Intel CNR	 4.22  *
+C Intel PNR	 4.22  *
+C Intel NHM	 4.97
+C Intel WSM	 5.17
+C Intel SBR	 4.83  *
+C Intel IBR	 4.16  *
+C Intel HWL	 3.84
+C Intel BWL	 3.76
+C Intel SKL	 3.83
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+
+define(`u0',    `%rdi')
+define(`v0',    `%rsi')
+
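+C  The algorithm, as a C sketch (illustrative; both operands must be odd
+C  and nonzero; __builtin_ctzll is the GCC/Clang counterpart of bsf):
+C
+C    mp_limb_t gcd_11 (mp_limb_t u, mp_limb_t v)
+C    {
+C      while (u != v)
+C        {
+C          mp_limb_t lo = (u < v) ? u : v;
+C          mp_limb_t d  = (u < v) ? v - u : u - v;  /* |u - v|, even */
+C          u = d >> __builtin_ctzll (d);            /* strip factors of 2 */
+C          v = lo;
+C        }
+C      return u;
+C    }
+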
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	FUNC_ENTRY(2)
+	jmp	L(odd)
+
+	ALIGN(16)
+L(top):	cmovc	%rdx, u0		C u = |u - v|
+	cmovc	%rax, v0		C v = min(u,v)
+	shr	R8(%rcx), u0
+L(odd):	mov	v0, %rdx
+	sub	u0, %rdx		C v - u
+	bsf	%rdx, %rcx
+	mov	u0, %rax
+	sub	v0, u0			C u - v
+	jnz	L(top)
+
+L(end):	C rax = result
+	C rdx = 0 for the benefit of internal gcd_22 call
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/gcd_22.asm b/third_party/gmp/mpn/x86_64/core2/gcd_22.asm
new file mode 100644
index 0000000..b5aa73b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/gcd_22.asm
@@ -0,0 +1,137 @@
+dnl  AMD64 mpn_gcd_22.  Assumes useful bsf, useful shrd, no tzcnt, no shlx.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 ?
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4	 ?
+C Intel CNR	 8.7
+C Intel PNR	 8.7
+C Intel NHM	 9.2
+C Intel WSM	 9.2
+C Intel SBR	 9.1
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+
+
+define(`u1',    `%rdi')
+define(`u0',    `%rsi')
+define(`v1',    `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0',    `%rax')
+define(`cnt',   `%rcx')
+
+define(`s0',    `%r8')
+define(`s1',    `%r9')
+define(`t0',    `%r10')
+define(`t1',    `%r11')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+	FUNC_ENTRY(4)
+	mov	v0_param, v0
+
+	ALIGN(16)
+L(top):	mov	v0, t0
+	sub	u0, t0
+	jz	L(lowz)		C	jump when low limb result = 0
+	mov	v1, t1
+	sbb	u1, t1
+
+	mov	u0, s0
+	mov	u1, s1
+
+	bsf	t0, cnt
+
+	sub	v0, u0
+	sbb	v1, u1
+
+L(bck):	cmovc	t0, u0		C u = |u - v|
+	cmovc	t1, u1		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	cmovc	s1, v1		C v = min(u,v)
+
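+	C double-limb right shift: shrd feeds u1's low bits into u0's top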
+	shrd	R8(cnt), u1, u0
+	shr	R8(cnt), u1
+
+	mov	v1, t1
+	or	u1, t1
+	jnz	L(top)
+
+L(gcd_11):
+	mov	v0, %rdi
+C	mov	u0, %rsi
+	TCALL(	mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	mov	v1, t0
+	sub	u1, t0
+	je	L(end)
+
+	xor	t1, t1
+	mov	u0, s0
+	mov	u1, s1
+	bsf	t0, cnt
+	mov	u1, u0
+	xor	u1, u1
+	sub	v1, u0
+	jmp	L(bck)
+
+L(end):	C mov	v0, %rax
+	C mov	v1, %rdx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/gmp-mparam.h b/third_party/gmp/mpn/x86_64/core2/gmp-mparam.h
new file mode 100644
index 0000000..44f1494
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/gmp-mparam.h
@@ -0,0 +1,222 @@
+/* Core 2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3000 MHz Penryn */
+/* FFT tuning limit = 116,220,984 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              16
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           26
+
+#define DIV_1_VS_MUL_1_PERCENT             284
+
+#define MUL_TOOM22_THRESHOLD                24
+#define MUL_TOOM33_THRESHOLD                65
+#define MUL_TOOM44_THRESHOLD               184
+#define MUL_TOOM6H_THRESHOLD               256
+#define MUL_TOOM8H_THRESHOLD               381
+
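+/* Illustrative sketch of how such thresholds are consumed (the real
+   dispatch lives in GMP's generic multiply code, not in this header):
+
+     if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+       mpn_mul_basecase (pp, ap, n, bp, n);
+     else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+       mpn_toom22_mul (pp, ap, n, bp, n, scratch);
+     ...
+*/
+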
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      79
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     106
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                102
+#define SQR_TOOM4_THRESHOLD                160
+#define SQR_TOOM6_THRESHOLD                366
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             32
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    368, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     12, 6}, \
+    {     25, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     19, 7}, {     39, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     83,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     79,11}, {     47,10}, {     95,12}, {     31, 9}, \
+    {    255,10}, {    135,11}, {     79,10}, {    159, 9}, \
+    {    319,11}, {     95,10}, {    191, 9}, {    383,11}, \
+    {    111,12}, {     63,11}, {    127,10}, {    271,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,10}, {    415,13}, {     63,12}, \
+    {    127,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    319,10}, {    639,11}, {    351,12}, \
+    {    191,11}, {    415,12}, {    223,11}, {    479,13}, \
+    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    575,13}, {    319,12}, {    703,13}, \
+    {    383,12}, {    799,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1023,13}, {    575,12}, \
+    {   1151,13}, {    703,14}, {    383,13}, {    831,12}, \
+    {   1663,13}, {    959,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,14}, {    767,13}, \
+    {   1663,14}, {    895,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2559,14}, {   1407,13}, \
+    {   2815,15}, {    767,14}, {   1663,13}, {   3455,12}, \
+    {   6911,14}, {   1791,16}, {    511,15}, {   1023,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,12}, {  11775,15}, {   1535,14}, {   3455,13}, \
+    {   6911,15}, {   1791,14}, {   3839,13}, {   7679,16}, \
+    {   1023,15}, {   2047,14}, {   4223,15}, {   2303,14}, \
+    {   4991,15}, {   2815,14}, {   5887,13}, {  11775,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 176
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    308, 5}, {     17, 6}, {     23, 7}, {     12, 6}, \
+    {     25, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     63,10}, {     39, 9}, {     79,10}, {     47,11}, \
+    {     31,10}, {     79,11}, {     47,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255,11}, {     79,10}, \
+    {    159, 6}, {   2559, 7}, {   1343, 6}, {   2687, 7}, \
+    {   1407, 9}, {    383,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,11}, \
+    {    143,10}, {    287, 9}, {    575,11}, {    159,10}, \
+    {    319,11}, {    175,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,10}, {    415,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    351,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,12}, {    223,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    543,12}, \
+    {    287,11}, {    575,12}, {    319,11}, {    639,12}, \
+    {    351,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    575,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    799,13}, \
+    {    447,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1023,13}, {    575,12}, {   1151,13}, {    639,12}, \
+    {   1279,13}, {    703,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    959,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,14}, {    767,13}, \
+    {   1599,12}, {   3199,13}, {   1663,14}, {    895,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2303,12}, {   4607,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,12}, \
+    {   6911,16}, {    511,15}, {   1023,14}, {   2303,13}, \
+    {   4607,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2943,13}, {   5887,12}, {  11775,15}, {   1535,14}, \
+    {   3455,15}, {   1791,14}, {   3583,13}, {   7167,14}, \
+    {   3839,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2815,14}, {   5887,13}, \
+    {  11775,16}, {   1535,15}, {   3071,14}, {   6143,15}, \
+    {   3327,14}, {   6911,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 183
+#define SQR_FFT_THRESHOLD                 3520
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  67
+#define MULLO_MUL_N_THRESHOLD             9174
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  11
+#define SQRLO_SQR_THRESHOLD               7035
+
+#define DC_DIV_QR_THRESHOLD                 53
+#define DC_DIVAPPR_Q_THRESHOLD             163
+#define DC_BDIV_QR_THRESHOLD                46
+#define DC_BDIV_Q_THRESHOLD                 76
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               158
+#define INV_APPR_THRESHOLD                 167
+
+#define BINV_NEWTON_THRESHOLD              248
+#define REDC_1_TO_REDC_N_THRESHOLD          44
+
+#define MU_DIV_QR_THRESHOLD               1187
+#define MU_DIVAPPR_Q_THRESHOLD            1210
+#define MUPI_DIV_QR_THRESHOLD               73
+#define MU_BDIV_QR_THRESHOLD              1017
+#define MU_BDIV_Q_THRESHOLD               1187
+
+#define POWM_SEC_TABLE  1,64,105,579,1486
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        17
+#define SET_STR_DC_THRESHOLD               134
+#define SET_STR_PRECOMPUTE_THRESHOLD      1752
+
+#define FAC_DSC_THRESHOLD                  351
+#define FAC_ODD_THRESHOLD                   27
+
+#define MATRIX22_STRASSEN_THRESHOLD         18
+#define HGCD2_DIV1_METHOD                    3  /* 2.14% faster than 5 */
+#define HGCD_THRESHOLD                     118
+#define HGCD_APPR_THRESHOLD                161
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   416
+#define GCDEXT_DC_THRESHOLD                351
+#define JACOBI_BASE_METHOD                   4  /* 3.56% faster than 1 */
+
+/* Tuneup completed successfully, took 132491 seconds */
diff --git a/third_party/gmp/mpn/x86_64/core2/hamdist.asm b/third_party/gmp/mpn/x86_64/core2/hamdist.asm
new file mode 100644
index 0000000..a78753d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/hamdist.asm
@@ -0,0 +1,210 @@
+dnl  AMD64 SSSE3 mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		    cycles/limb	  good for cpu?
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1		 ?
+C AMD bd2		 ?
+C AMD bd3		 ?
+C AMD bd4		 ?
+C AMD zen		 ?
+C AMD bobcat		 ?
+C AMD jaguar		 ?
+C Intel P4		n/a
+C Intel CNR		 4.50		y
+C Intel PNR		 3.28		y
+C Intel NHM		 ?
+C Intel SBR		 ?
+C Intel IBR		 ?
+C Intel HWL		 ?
+C Intel BWL		 ?
+C Intel SKL		 ?
+C Intel atom		 ?
+C Intel SLM		 ?
+C VIA nano		 ?
+
+C TODO
+C  * This was hand-written without too much thought about optimal insn
+C    selection; check to see if it can be improved.
+C  * Consider doing some instruction scheduling.
+
+define(`up',		`%rdi')
+define(`vp',		`%rsi')
+define(`n',		`%rdx')
+
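+C  The counting idea in C (illustrative): hamming distance is the popcount
+C  of the xor, and each byte's popcount comes from two 4-bit table lookups,
+C  which pshufb performs 16 bytes at a time.  The table matches the first
+C  two .byte rows at the end of this file.
+C
+C    static const unsigned char tab[16] =
+C      { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
+C    unsigned byte_popcount (unsigned char b)
+C    {
+C      return tab[b & 0xf] + tab[b >> 4];
+C    }
+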
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)
+	lea	L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+	     `define(`OFF1',64) define(`OFF2',80)')
+	movdqa	OFF1`'(%r9), %xmm7
+	movdqa	OFF2`'(%r9), %xmm6
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5
+	pxor	%xmm8, %xmm8
+
+	mov	R32(n), R32(%rax)
+	and	$7, R32(%rax)
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
+L(1):	movq	(up), %xmm1
+	add	$8, up
+	movq	(vp), %xmm10
+	add	$8, vp
+	pxor	%xmm10, %xmm1
+	jmp	L(e1)
+
+L(2):	add	$-48, up
+	add	$-48, vp
+	jmp	L(e2)
+
+L(3):	movq	(up), %xmm1
+	add	$-40, up
+	movq	(vp), %xmm10
+	add	$-40, vp
+	pxor	%xmm10, %xmm1
+	jmp	L(e3)
+
+L(4):	add	$-32, up
+	add	$-32, vp
+	jmp	L(e4)
+
+L(5):	movq	(up), %xmm1
+	add	$-24, up
+	movq	(vp), %xmm10
+	add	$-24, vp
+	pxor	%xmm10, %xmm1
+	jmp	L(e5)
+
+L(6):	add	$-16, up
+	add	$-16, vp
+	jmp	L(e6)
+
+L(7):	movq	(up), %xmm1
+	add	$-8, up
+	movq	(vp), %xmm10
+	add	$-8, vp
+	pxor	%xmm10, %xmm1
+	jmp	L(e7)
+
+	ALIGN(32)
+L(top):	lddqu	(up), %xmm1
+	lddqu	(vp), %xmm10
+	pxor	%xmm10, %xmm1
+L(e7):	movdqa	%xmm6, %xmm0		C copy mask register
+	movdqa	%xmm7, %xmm2		C copy count register
+	movdqa	%xmm7, %xmm3		C copy count register
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e6):	lddqu	16(up), %xmm1
+	lddqu	16(vp), %xmm10
+	pxor	%xmm10, %xmm1
+L(e5):	movdqa	%xmm6, %xmm0
+	movdqa	%xmm7, %xmm2
+	movdqa	%xmm7, %xmm3
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e4):	lddqu	32(up), %xmm1
+	lddqu	32(vp), %xmm10
+	pxor	%xmm10, %xmm1
+L(e3):	movdqa	%xmm6, %xmm0
+	movdqa	%xmm7, %xmm2
+	movdqa	%xmm7, %xmm3
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e2):	lddqu	48(up), %xmm1
+	add	$64, up
+	lddqu	48(vp), %xmm10
+	add	$64, vp
+	pxor	%xmm10, %xmm1
+L(e1):	movdqa	%xmm6, %xmm0
+	movdqa	%xmm7, %xmm2
+	movdqa	%xmm7, %xmm3
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	psadbw	%xmm5, %xmm4		C sum to 8 x 16-bit counts
+	paddb	%xmm2, %xmm3
+	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
+	movdqa	%xmm3, %xmm4
+	sub	$8, n
+	jg	L(top)
+
+	psadbw	%xmm5, %xmm4
+	paddq	%xmm4, %xmm8
+	pshufd	$14, %xmm8, %xmm0
+	paddq	%xmm8, %xmm0
+	movq	%xmm0, %rax
+	ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+	JMPENT(	L(top), L(cnsts))
+	JMPENT(	L(1), L(cnsts))
+	JMPENT(	L(2), L(cnsts))
+	JMPENT(	L(3), L(cnsts))
+	JMPENT(	L(4), L(cnsts))
+	JMPENT(	L(5), L(cnsts))
+	JMPENT(	L(6), L(cnsts))
+	JMPENT(	L(7), L(cnsts))
+	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
diff --git a/third_party/gmp/mpn/x86_64/core2/logops_n.asm b/third_party/gmp/mpn/x86_64/core2/logops_n.asm
new file mode 100644
index 0000000..5ff174c
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/logops_n.asm
@@ -0,0 +1,285 @@
+dnl  AMD64 logops.
+
+dnl  Copyright 2004-2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l	c/l	c/l	good
+C	       var-1   var-2   var-3  for cpu?
+C AMD K8,K9
+C AMD K10	 1.52	 1.75	 1.75	 n
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD bt1	 2.67	~2.79	~2.79	 =
+C AMD bt2	 2.15	 2.65	 2.65	 n
+C AMD zen	 1.5	 1.5	 1.5	 =
+C Intel P4
+C Intel PNR	 2.0	 2.0	 2.0	 =
+C Intel NHM	 2.0	 2.0	 2.0	 =
+C Intel SBR	 1.5	 1.5	 1.5	 y
+C Intel IBR	 1.47	 1.48	 1.48	 y
+C Intel HWL	 1.11	 1.35	 1.35	 y
+C Intel BWL	 1.09	 1.30	 1.30	 y
+C Intel SKL	 1.21	 1.27	 1.27	 y
+C Intel atom	 3.31	 3.57	 3.57	 y
+C Intel SLM	 3.0	 3.0	 3.0	 =
+C VIA nano
+
+ifdef(`OPERATION_and_n',`
+  define(`func',`mpn_and_n')
+  define(`VARIANT_1')
+  define(`LOGOP',`and')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',`mpn_andn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`and')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',`mpn_nand_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`and')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',`mpn_ior_n')
+  define(`VARIANT_1')
+  define(`LOGOP',`or')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',`mpn_iorn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`or')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',`mpn_nior_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`or')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',`mpn_xor_n')
+  define(`VARIANT_1')
+  define(`LOGOP',`xor')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',`mpn_xnor_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`xor')')
+
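+C  Variant semantics, spelled out (illustrative; up/vp are the s1/s2
+C  operands):
+C    VARIANT_1:  rp[i] = up[i] LOGOP vp[i]        (and, ior, xor)
+C    VARIANT_2:  rp[i] = up[i] LOGOP ~vp[i]       (andn, iorn, xnor)
+C    VARIANT_3:  rp[i] = ~(up[i] LOGOP vp[i])     (nand, nior)
+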
+define(`addptr', `lea	$1($2), $2')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+
+ifdef(`VARIANT_1',`
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(vp), %r8
+	mov	R32(%rcx), R32(%rax)
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	LOGOP	(up), %r8
+	mov	%r8, (rp)
+	inc	n
+	addptr(	-8, up)
+	addptr(	-8, vp)
+	addptr(	-8, rp)
+	jmp	L(e11)
+L(b10):	add	$2, n
+	addptr(	-16, up)
+	addptr(	-16, vp)
+	addptr(	-16, rp)
+	jmp	L(e10)
+L(b01):	LOGOP	(up), %r8
+	mov	%r8, (rp)
+	dec	n
+	jz	L(ret)
+	addptr(	8, up)
+	addptr(	8, vp)
+	addptr(	8, rp)
+
+	ALIGN(16)
+L(top):	mov	(vp), %r8
+L(b00):	mov	8(vp), %r9
+	LOGOP	(up), %r8
+	LOGOP	8(up), %r9
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+L(e11):	mov	16(vp), %r8
+L(e10):	mov	24(vp), %r9
+	addptr(	32, vp)
+	LOGOP	16(up), %r8
+	LOGOP	24(up), %r9
+	addptr(	32, up)
+	mov	%r8, 16(rp)
+	mov	%r9, 24(rp)
+	addptr(	32, rp)
+	sub	$4, n
+	jnz	L(top)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_2',`
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(vp), %r8
+	not	%r8
+	mov	R32(%rcx), R32(%rax)
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	LOGOP	(up), %r8
+	mov	%r8, (rp)
+	inc	n
+	addptr(	-8, up)
+	addptr(	-8, vp)
+	addptr(	-8, rp)
+	jmp	L(e11)
+L(b10):	add	$2, n
+	addptr(	-16, up)
+	addptr(	-16, vp)
+	addptr(	-16, rp)
+	jmp	L(e10)
+L(b01):	LOGOP	(up), %r8
+	mov	%r8, (rp)
+	dec	n
+	jz	L(ret)
+	addptr(	8, up)
+	addptr(	8, vp)
+	addptr(	8, rp)
+
+	ALIGN(16)
+L(top):	mov	(vp), %r8
+	not	%r8
+L(b00):	mov	8(vp), %r9
+	not	%r9
+	LOGOP	(up), %r8
+	LOGOP	8(up), %r9
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+L(e11):	mov	16(vp), %r8
+	not	%r8
+L(e10):	mov	24(vp), %r9
+	not	%r9
+	addptr(	32, vp)
+	LOGOP	16(up), %r8
+	LOGOP	24(up), %r9
+	addptr(	32, up)
+	mov	%r8, 16(rp)
+	mov	%r9, 24(rp)
+	addptr(	32, rp)
+	sub	$4, n
+	jnz	L(top)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_3',`
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(vp), %r8
+	mov	R32(%rcx), R32(%rax)
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	LOGOP	(up), %r8
+	not	%r8
+	mov	%r8, (rp)
+	inc	n
+	addptr(	-8, up)
+	addptr(	-8, vp)
+	addptr(	-8, rp)
+	jmp	L(e11)
+L(b10):	add	$2, n
+	addptr(	-16, up)
+	addptr(	-16, vp)
+	addptr(	-16, rp)
+	jmp	L(e10)
+L(b01):	LOGOP	(up), %r8
+	not	%r8
+	mov	%r8, (rp)
+	dec	n
+	jz	L(ret)
+	addptr(	8, up)
+	addptr(	8, vp)
+	addptr(	8, rp)
+
+	ALIGN(16)
+L(top):	mov	(vp), %r8
+L(b00):	mov	8(vp), %r9
+	LOGOP	(up), %r8
+	not	%r8
+	LOGOP	8(up), %r9
+	not	%r9
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+L(e11):	mov	16(vp), %r8
+L(e10):	mov	24(vp), %r9
+	addptr(	32, vp)
+	LOGOP	16(up), %r8
+	not	%r8
+	LOGOP	24(up), %r9
+	addptr(	32, up)
+	not	%r9
+	mov	%r8, 16(rp)
+	mov	%r9, 24(rp)
+	addptr(	32, rp)
+	sub	$4, n
+	jnz	L(top)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
+')
diff --git a/third_party/gmp/mpn/x86_64/core2/lshift.asm b/third_party/gmp/mpn/x86_64/core2/lshift.asm
new file mode 100644
index 0000000..9016a71
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/lshift.asm
@@ -0,0 +1,145 @@
+dnl  x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem.
+
+dnl  Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2	 1.32
+C Intel NHM	 1.30	(drops to 2.5 for n > 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
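+C  Reference semantics in C (illustrative; requires 1 <= cnt <= 63;
+C  `ref_lshift' is a made-up name):
+C
+C    mp_limb_t ref_lshift (mp_limb_t *rp, const mp_limb_t *up,
+C                          mp_size_t n, unsigned cnt)
+C    {
+C      mp_limb_t ret = up[n-1] >> (64 - cnt);     /* bits shifted out */
+C      for (mp_size_t i = n - 1; i > 0; i--)
+C        rp[i] = (up[i] << cnt) | (up[i-1] >> (64 - cnt));
+C      rp[0] = up[0] << cnt;
+C      return ret;
+C    }
+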
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshift)
+	FUNC_ENTRY(4)
+
+	xor	R32(%rax), R32(%rax)
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	lea	-8(up,n,8), up
+	lea	16(rp,n,8), rp
+	mov	(up), %r10
+	mov	-8(up), %r11
+	shld	R8(cnt), %r10, %rax
+	mov	-16(up), %r8
+	shr	$2, n
+	jmp	L(00)
+
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
+
+L(b01):	lea	-16(up,n,8), up
+	lea	8(rp,n,8), rp
+	mov	8(up), %r9
+	shld	R8(cnt), %r9, %rax
+	shr	$2, n
+	jz	L(1)
+	mov	(up), %r10
+	mov	-8(up), %r11
+	jmp	L(01)
+
+L(b10):	lea	-24(up,n,8), up
+	lea	(rp,n,8), rp
+	mov	16(up), %r8
+	mov	8(up), %r9
+	shld	R8(cnt), %r8, %rax
+	shr	$2, n
+	jz	L(2)
+	mov	(up), %r10
+	jmp	L(10)
+
+	ALIGN(16)
+L(b11):	lea	-32(up,n,8), up
+	lea	-8(rp,n,8), rp
+	mov	24(up), %r11
+	mov	16(up), %r8
+	mov	8(up), %r9
+	shld	R8(cnt), %r11, %rax
+	shr	$2, n
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	shld	R8(cnt), %r8, %r11
+	mov	(up), %r10
+	mov	%r11, (rp)
+L(10):	shld	R8(cnt), %r9, %r8
+	mov	-8(up), %r11
+	mov	%r8, -8(rp)
+L(01):	shld	R8(cnt), %r10, %r9
+	mov	-16(up), %r8
+	mov	%r9, -16(rp)
+L(00):	shld	R8(cnt), %r11, %r10
+	mov	-24(up), %r9
+	add	$-32, up
+	mov	%r10, -24(rp)
+	add	$-32, rp
+	dec	n
+	jnz	L(top)
+
+L(end):	shld	R8(cnt), %r8, %r11
+	mov	%r11, (rp)
+L(2):	shld	R8(cnt), %r9, %r8
+	mov	%r8, -8(rp)
+L(1):	shl	R8(cnt), %r9
+	mov	%r9, -16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/lshiftc.asm b/third_party/gmp/mpn/x86_64/core2/lshiftc.asm
new file mode 100644
index 0000000..c428f13
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/lshiftc.asm
@@ -0,0 +1,159 @@
+dnl  x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem.
+
+dnl  Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2	 1.52
+C Intel NHM	 1.78	(just 2.15 for n < 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+C TODO
+C  * This runs poorly on Nehalem compared to plain lshift, in particular for
+C    n < 256.
+
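+C  Semantics relative to plain lshift (illustrative): each stored limb is
+C  complemented, rp[i] = ~((up[i] << cnt) | (up[i-1] >> (64 - cnt))),
+C  while the returned out-shifted bits are not complemented.
+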
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+	FUNC_ENTRY(4)
+
+	xor	R32(%rax), R32(%rax)
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	lea	-8(up,n,8), up
+	lea	16(rp,n,8), rp
+	mov	(up), %r10
+	mov	-8(up), %r11
+	shld	R8(cnt), %r10, %rax
+	mov	-16(up), %r8
+	shr	$2, n
+	shld	R8(cnt), %r11, %r10
+	jmp	L(00)
+
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
+
+L(b01):	lea	-16(up,n,8), up
+	lea	8(rp,n,8), rp
+	mov	8(up), %r9
+	shld	R8(cnt), %r9, %rax
+	shr	$2, n
+	jz	L(1)
+	mov	(up), %r10
+	mov	-8(up), %r11
+	shld	R8(cnt), %r10, %r9
+	jmp	L(01)
+
+L(b10):	lea	-24(up,n,8), up
+	lea	(rp,n,8), rp
+	mov	16(up), %r8
+	mov	8(up), %r9
+	shld	R8(cnt), %r8, %rax
+	shr	$2, n
+	jz	L(2)
+	mov	(up), %r10
+	shld	R8(cnt), %r9, %r8
+	jmp	L(10)
+
+	ALIGN(16)
+L(b11):	lea	-32(up,n,8), up
+	lea	-8(rp,n,8), rp
+	mov	24(up), %r11
+	mov	16(up), %r8
+	mov	8(up), %r9
+	shld	R8(cnt), %r11, %rax
+	shr	$2, n
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	shld	R8(cnt), %r8, %r11
+	mov	(up), %r10
+	not	%r11
+	shld	R8(cnt), %r9, %r8
+	mov	%r11, (rp)
+L(10):	mov	-8(up), %r11
+	not	%r8
+	shld	R8(cnt), %r10, %r9
+	mov	%r8, -8(rp)
+L(01):	mov	-16(up), %r8
+	not	%r9
+	shld	R8(cnt), %r11, %r10
+	mov	%r9, -16(rp)
+L(00):	mov	-24(up), %r9
+	not	%r10
+	add	$-32, up
+	mov	%r10, -24(rp)
+	add	$-32, rp
+	dec	n
+	jnz	L(top)
+
+L(end):	shld	R8(cnt), %r8, %r11
+	not	%r11
+	mov	%r11, (rp)
+L(2):	shld	R8(cnt), %r9, %r8
+	not	%r8
+	mov	%r8, -8(rp)
+L(1):	shl	R8(cnt), %r9
+	not	%r9
+	mov	%r9, -16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/mul_basecase.asm b/third_party/gmp/mpn/x86_64/core2/mul_basecase.asm
new file mode 100644
index 0000000..d16be85
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/mul_basecase.asm
@@ -0,0 +1,975 @@
+dnl  X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
+dnl  It also seems good for Conroe/Wolfdale.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		mul_2		mul_3		addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core	 4.0		 4.0		 -		4.18-4.25
+C Intel NHM	 3.75		 3.8		 -		4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C               m_1(0m4)        m_1(1m4)        m_1(2m4)        m_1(3m4)
+C                  |               |               |               |
+C        m_2(0m4)  |     m_2(1m4)  |     m_2(2m4)  |     m_2(3m4)  |
+C           |      /        |      /        |      /        |      /
+C           |     /         |     /         |     /         |     /
+C           |    /          |    /          |    /          |    /
+C          \|/ |/_         \|/ |/_         \|/ |/_         \|/ |/_
+C             _____           _____           _____           _____
+C            /     \         /     \         /     \         /     \
+C          \|/      |      \|/      |      \|/      |      \|/      |
+C        am_2(0m4)  |    am_2(1m4)  |    am_2(2m4)  |    am_2(3m4)  |
+C           \      /|\      \      /|\      \      /|\      \      /|\
+C            \_____/         \_____/         \_____/         \_____/
+
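+C  The overall decomposition, as a C sketch built from public mpn calls
+C  (illustrative only; requires un >= vn >= 1; `ref_mul_basecase' is a
+C  made-up name):
+C
+C    void ref_mul_basecase (mp_limb_t *rp,
+C                           const mp_limb_t *up, mp_size_t un,
+C                           const mp_limb_t *vp, mp_size_t vn)
+C    {
+C      rp[un] = mpn_mul_1 (rp, up, un, vp[0]);    /* first row */
+C      for (mp_size_t i = 1; i < vn; i++)         /* accumulate rows */
+C        rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
+C    }
+C
+C  The code below fuses these passes into mul_1/mul_2 front ends and
+C  addmul_2 inner loops, selected by the parities of un and vn.
+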
+C TODO
+C  * Tune.  None done so far.
+C  * Currently 2687 bytes; making it smaller would be nice.
+C  * Implement some basecases, say for un < 4.
+C  * Try zeroing with xor in m2 loops.
+C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C    between loop header and wind-down code.
+C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx')	C FIXME reallocate vp to rcx but watch performance!
+define(`vn_param', `%r8')
+
+define(`un',       `%r9')
+define(`vn',       `(%rsp)')
+
+define(`v0',       `%r10')
+define(`v1',       `%r11')
+define(`w0',       `%rbx')
+define(`w1',       `%rcx')
+define(`w2',       `%rbp')
+define(`w3',       `%r12')
+define(`i',        `%r13')
+define(`vp',       `%r14')
+
+define(`X0',       `%r8')
+define(`X1',       `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	mov	(up), %rax		C shared for mul_1 and mul_2
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	(vp_param), v0		C shared for mul_1 and mul_2
+
+	xor	un, un
+	sub	un_param, un		C un = -un_param
+
+	lea	(up,un_param,8), up
+	lea	(rp,un_param,8), rp
+
+	mul	v0			C shared for mul_1 and mul_2
+
+	test	$1, R8(vn_param)
+	jz	L(m2)
+
+	lea	8(vp_param), vp		C FIXME: delay until known needed
+
+	test	$1, R8(un)
+	jnz	L(m1x1)
+
+L(m1x0):test	$2, R8(un)
+	jnz	L(m1s2)
+
+L(m1s0):
+	lea	(un), i
+	mov	%rax, (rp,un,8)
+	mov	8(up,un,8), %rax
+	mov	%rdx, w0		C FIXME: Use lea?
+	lea	L(do_am0)(%rip), %rbp
+	jmp	L(m1e0)
+
+L(m1s2):
+	lea	2(un), i
+	mov	%rax, (rp,un,8)
+	mov	8(up,un,8), %rax
+	mov	%rdx, w0		C FIXME: Use lea?
+	mul	v0
+	lea	L(do_am2)(%rip), %rbp
+	test	i, i
+	jnz	L(m1e2)
+	add	%rax, w0
+	adc	$0, %rdx
+	mov	w0, I(-8(rp),8(rp,un,8))
+	mov	%rdx, I((rp),16(rp,un,8))
+	jmp	L(ret2)
+
+L(m1x1):test	$2, R8(un)
+	jz	L(m1s3)
+
+L(m1s1):
+	lea	1(un), i
+	mov	%rax, (rp,un,8)
+	test	i, i
+	jz	L(1)
+	mov	8(up,un,8), %rax
+	mov	%rdx, w1		C FIXME: Use lea?
+	lea	L(do_am1)(%rip), %rbp
+	jmp	L(m1e1)
+L(1):	mov	%rdx, I((rp),8(rp,un,8))
+	jmp	L(ret2)
+
+L(m1s3):
+	lea	-1(un), i
+	mov	%rax, (rp,un,8)
+	mov	8(up,un,8), %rax
+	mov	%rdx, w1		C FIXME: Use lea?
+	lea	L(do_am3)(%rip), %rbp
+	jmp	L(m1e3)
+
+	ALIGNx
+L(m1top):
+	mul	v0
+	mov	w1, -16(rp,i,8)
+L(m1e2):xor	R32(w1), R32(w1)
+	add	%rax, w0
+	mov	(up,i,8), %rax
+	adc	%rdx, w1
+	mov	w0, -8(rp,i,8)
+L(m1e1):xor	R32(w0), R32(w0)
+	mul	v0
+	add	%rax, w1
+	mov	8(up,i,8), %rax
+	adc	%rdx, w0
+	mov	w1, (rp,i,8)
+L(m1e0):xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,i,8), %rax
+	adc	%rdx, w1
+	mov	w0, 8(rp,i,8)
+L(m1e3):xor	R32(w0), R32(w0)
+	mul	v0
+	add	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m1top)
+
+	mul	v0
+	mov	w1, I(-16(rp),-16(rp,i,8))
+	add	%rax, w0
+	adc	$0, %rdx
+	mov	w0, I(-8(rp),-8(rp,i,8))
+	mov	%rdx, I((rp),(rp,i,8))
+
+	dec	vn_param
+	jz	L(ret2)
+	lea	-8(rp), rp
+	jmp	*%rbp
+
+L(m2):
+	mov	8(vp_param), v1
+	lea	16(vp_param), vp	C FIXME: delay until known needed
+
+	test	$1, R8(un)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(un)
+	jnz	L(b10)
+
+L(b00):	lea	(un), i
+	mov	%rax, (rp,un,8)
+	mov	%rdx, w1		C FIXME: Use lea?
+	mov	(up,un,8), %rax
+	mov	$0, R32(w2)
+	jmp	L(m2e0)
+
+L(b10):	lea	-2(un), i
+	mov	%rax, w2		C FIXME: Use lea?
+	mov	(up,un,8), %rax
+	mov	%rdx, w3		C FIXME: Use lea?
+	mov	$0, R32(w0)
+	jmp	L(m2e2)
+
+L(bx1):	test	$2, R8(un)
+	jz	L(b11)
+
+L(b01):	lea	1(un), i
+	mov	%rax, (rp,un,8)
+	mov	(up,un,8), %rax
+	mov	%rdx, w0		C FIXME: Use lea?
+	mov	$0, R32(w1)
+	jmp	L(m2e1)
+
+L(b11):	lea	-1(un), i
+	mov	%rax, w1		C FIXME: Use lea?
+	mov	(up,un,8), %rax
+	mov	%rdx, w2		C FIXME: Use lea?
+	mov	$0, R32(w3)
+	jmp	L(m2e3)
+
+	ALIGNx
+L(m2top0):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+L(m2e0):mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top0)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, I((rp),(rp,i,8))
+	mov	w1, I(8(rp),8(rp,i,8))
+
+	add	$-2, vn_param
+	jz	L(ret2)
+
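+C addmul_2 outer loop (four alignment-specific copies): each pass loads
+C the next two V limbs into v0/v1, advances vp and rp by two limbs, and
+C accumulates U x {v1,v0} into R.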
+L(do_am0):
+	push	%r15
+	push	vn_param
+
+L(olo0):
+	mov	(vp), v0
+	mov	8(vp), v1
+	lea	16(vp), vp
+	lea	16(rp), rp
+	mov	(up,un,8), %rax
+C	lea	0(un), i
+	mov	un, i
+	mul	v0
+	mov	%rax, X0
+	mov	(up,un,8), %rax
+	MOV(	%rdx, X1, 2)
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,un,8), w2
+	mov	%rax, w3
+	jmp	L(lo0)
+
+	ALIGNx
+L(am2top0):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+L(lo0):	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top0)
+
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	w2, X0
+	mov	X0, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	addl	$-2, vn
+	jnz	L(olo0)
+
+L(ret):	pop	%rax
+	pop	%r15
+L(ret2):pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+
+	ALIGNx
+L(m2top1):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(m2e1):mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top1)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, I((rp),(rp,i,8))
+	mov	w1, I(8(rp),8(rp,i,8))
+
+	add	$-2, vn_param
+	jz	L(ret2)
+
+L(do_am1):
+	push	%r15
+	push	vn_param
+
+L(olo1):
+	mov	(vp), v0
+	mov	8(vp), v1
+	lea	16(vp), vp
+	lea	16(rp), rp
+	mov	(up,un,8), %rax
+	lea	1(un), i
+	mul	v0
+	mov	%rax, X1
+	MOV(	%rdx, X0, 128)
+	mov	(up,un,8), %rax
+	mov	(rp,un,8), w1
+	mul	v1
+	mov	%rax, w2
+	mov	8(up,un,8), %rax
+	MOV(	%rdx, w3, 1)
+	jmp	L(lo1)
+
+	ALIGNx
+L(am2top1):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+L(lo1):	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top1)
+
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	w2, X0
+	mov	X0, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	addl	$-2, vn
+	jnz	L(olo1)
+
+	pop	%rax
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+
+	ALIGNx
+L(m2top2):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+L(m2e2):mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top2)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, I((rp),(rp,i,8))
+	mov	w1, I(8(rp),8(rp,i,8))
+
+	add	$-2, vn_param
+	jz	L(ret2)
+
+L(do_am2):
+	push	%r15
+	push	vn_param
+
+L(olo2):
+	mov	(vp), v0
+	mov	8(vp), v1
+	lea	16(vp), vp
+	lea	16(rp), rp
+	mov	(up,un,8), %rax
+	lea	-2(un), i
+	mul	v0
+	mov	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	(up,un,8), %rax
+	mov	(rp,un,8), w0
+	mul	v1
+	mov	%rax, w1
+	lea	(%rdx), w2
+	mov	8(up,un,8), %rax
+	jmp	L(lo2)
+
+	ALIGNx
+L(am2top2):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+L(lo2):	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top2)
+
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	w2, X0
+	mov	X0, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	addl	$-2, vn
+	jnz	L(olo2)
+
+	pop	%rax
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+
+	ALIGNx
+L(m2top3):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+L(m2e3):mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top3)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, I((rp),(rp,i,8))
+	mov	w1, I(8(rp),8(rp,i,8))
+
+	add	$-2, vn_param
+	jz	L(ret2)
+
+L(do_am3):
+	push	%r15
+	push	vn_param
+
+L(olo3):
+	mov	(vp), v0
+	mov	8(vp), v1
+	lea	16(vp), vp
+	lea	16(rp), rp
+	mov	(up,un,8), %rax
+	lea	-1(un), i
+	mul	v0
+	mov	%rax, X1
+	MOV(	%rdx, X0, 8)
+	mov	(up,un,8), %rax
+	mov	(rp,un,8), w3
+	mul	v1
+	mov	%rax, w0
+	MOV(	%rdx, w1, 16)
+	mov	8(up,un,8), %rax
+	jmp	L(lo3)
+
+	ALIGNx
+L(am2top3):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+L(lo3):	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top3)
+
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	w2, X0
+	mov	X0, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	addl	$-2, vn
+	jnz	L(olo3)
+
+	pop	%rax
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/core2/mullo_basecase.asm
new file mode 100644
index 0000000..0f03d86
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/mullo_basecase.asm
@@ -0,0 +1,427 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Conroe/Wolfdale/Nehalem/Westmere.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_2		addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core	 4.0		4.18-4.25
+C Intel NHM	 3.75		4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C   * Implement proper cor2, replacing current cor0.
+C   * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
+C   * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
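+C For example, I(-8(rp),-8(rp,i,8)) normally expands to its first
+C argument, -8(rp); with `I' defined as $2 the indexed second form is
+C used instead, which does not depend on rp having been fully updated.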
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param',  `%rcx')
+
+define(`v0',       `%r10')
+define(`v1',       `%r11')
+define(`w0',       `%rbx')
+define(`w1',       `%rcx')
+define(`w2',       `%rbp')
+define(`w3',       `%r12')
+define(`n',        `%r9')
+define(`i',        `%r13')
+define(`vp',       `%r8')
+
+define(`X0',       `%r14')
+define(`X1',       `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
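+C N is a per-site mov/lea selection mask, presumably a tuning knob from
+C the generator: with N = 85 (binary 1010101), MOV(%rdx, X0, 8) expands
+C to "mov	%rdx, X0" (85 & 8 = 0) while MOV(%rdx, w3, 1) expands to the
+C equivalent "lea	(%rdx), w3" (85 & 1 = 1).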
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+
+	mov	(up), %rax
+	mov	vp_param, vp
+
+	cmp	$4, n_param
+	jb	L(small)
+
+	mov	(vp_param), v0
+	push	%rbx
+	lea	(rp,n_param,8), rp	C point rp at R[un]
+	push	%rbp
+	lea	(up,n_param,8), up	C point up right after U's end
+	push	%r12
+	mov	$0, R32(n)		C FIXME
+	sub	n_param, n
+	push	%r13
+	mul	v0
+	mov	8(vp), v1
+
+	test	$1, R8(n_param)
+	jnz	L(m2x1)
+
+L(m2x0):test	$2, R8(n_param)
+	jnz	L(m2b2)
+
+L(m2b0):lea	(n), i
+	mov	%rax, (rp,n,8)
+	mov	%rdx, w1
+	mov	(up,n,8), %rax
+	xor	R32(w2), R32(w2)
+	jmp	L(m2e0)
+
+L(m2b2):lea	-2(n), i
+	mov	%rax, w2
+	mov	(up,n,8), %rax
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	jmp	L(m2e2)
+
+L(m2x1):test	$2, R8(n_param)
+	jnz	L(m2b3)
+
+L(m2b1):lea	1(n), i
+	mov	%rax, (rp,n,8)
+	mov	(up,n,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	jmp	L(m2e1)
+
+L(m2b3):lea	-1(n), i
+	xor	R32(w3), R32(w3)
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	(up,n,8), %rax
+	jmp	L(m2e3)
+
+	ALIGNx
+L(m2tp):mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(m2e1):mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+L(m2e0):mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+L(m2e3):mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+L(m2e2):mul	v1
+	mov	$0, R32(w1)		C FIXME: dead in last iteration
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0		C FIXME: dead in last iteration
+	add	$4, i
+	js	L(m2tp)
+
+L(m2ed):imul	v0, %rax
+	add	w3, %rax
+	mov	%rax, I(-8(rp),-8(rp,i,8))
+
+	add	$2, n
+	lea	16(vp), vp
+	lea	-16(up), up
+	cmp	$-2, n
+	jge	L(cor1)
+
+	push	%r14
+	push	%r15
+
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+	mov	(up,n,8), %rax
+	mul	v0
+	test	$1, R8(n)
+	jnz	L(a1x1)
+
+L(a1x0):mov	%rax, X1
+	MOV(	%rdx, X0, 8)
+	mov	(up,n,8), %rax
+	mul	v1
+	test	$2, R8(n)
+	jnz	L(a110)
+
+L(a100):lea	(n), i
+	mov	(rp,n,8), w3
+	mov	%rax, w0
+	MOV(	%rdx, w1, 16)
+	jmp	L(lo0)
+
+L(a110):lea	2(n), i
+	mov	(rp,n,8), w1
+	mov	%rax, w2
+	mov	8(up,n,8), %rax
+	MOV(	%rdx, w3, 1)
+	jmp	L(lo2)
+
+L(a1x1):mov	%rax, X0
+	MOV(	%rdx, X1, 2)
+	mov	(up,n,8), %rax
+	mul	v1
+	test	$2, R8(n)
+	jz	L(a111)
+
+L(a101):lea	1(n), i
+	MOV(	%rdx, w0, 4)
+	mov	(rp,n,8), w2
+	mov	%rax, w3
+	jmp	L(lo1)
+
+L(a111):lea	-1(n), i
+	MOV(	%rdx, w2, 64)
+	mov	%rax, w1
+	mov	(rp,n,8), w0
+	mov	8(up,n,8), %rax
+	jmp	L(lo3)
+
+	ALIGNx
+L(top):	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	-8(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+L(lo2):	mul	v0
+	add	w1, X1
+	mov	X1, -16(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	-8(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	-8(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+L(lo1):	mov	(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, -8(rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	(up,i,8), %rax
+	mov	(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+L(lo0):	mov	8(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, (rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	8(rp,i,8), w3
+	adc	$0, X1
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	16(up,i,8), %rax
+	adc	$0, w2
+L(lo3):	mul	v0
+	add	w0, X0
+	mov	X0, 8(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	16(up,i,8), %rax
+	mov	16(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(top)
+
+L(end):	imul	v1, %rax
+	add	w0, w1
+	adc	%rax, w2
+	mov	I(-8(up),-8(up,i,8)), %rax
+	imul	v0, %rax
+	add	w1, X1
+	mov	X1, I(-16(rp),-16(rp,i,8))
+	adc	X0, %rax
+	mov	I(-8(rp),-8(rp,i,8)), w1
+	add	w1, w2
+	add	w2, %rax
+	mov	%rax, I(-8(rp),-8(rp,i,8))
+
+	add	$2, n
+	lea	16(vp), vp
+	lea	-16(up), up
+	cmp	$-2, n
+	jl	L(outer)
+
+	pop	%r15
+	pop	%r14
+
+	jnz	L(cor0)
+
+L(cor1):mov	(vp), v0
+	mov	8(vp), v1
+	mov	-16(up), %rax
+	mul	v0			C u0 x v2
+	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
+	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
+	mov	-8(up), %rbx
+	imul	v0, %rbx
+	mov	-16(up), %rcx
+	imul	v1, %rcx
+	mov	%rax, -16(rp)
+	add	%rbx, %rcx
+	add	%rdx, %rcx
+	mov	%rcx, -8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(cor0):mov	(vp), %r11
+	imul	-8(up), %r11
+	add	%rax, %r11
+	mov	%r11, -8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(small):
+	cmp	$2, n_param
+	jae	L(gt1)
+L(n1):	imul	(vp_param), %rax
+	mov	%rax, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
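+C n = 2: rp[0] = lo(u0*v0);  rp[1] = hi(u0*v0) + u1*v0 + u0*v1 (mod 2^64)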
+L(n2):	mov	(vp_param), %r9
+	mul	%r9
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp_param), %r9
+	mul	%r9		C u0 x v0
+	mov	%rax, (rp)
+	mov	%rdx, %r10
+	mov	8(up), %rax
+	mul	%r9		C u1 x v0
+	imul	16(up), %r9	C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r11
+	mov	(up), %rax
+	mul	%r11		C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r11	C u1 x v1
+	add	%r11, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10	C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/popcount.asm b/third_party/gmp/mpn/x86_64/core2/popcount.asm
new file mode 100644
index 0000000..39d8c5d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/popcount.asm
@@ -0,0 +1,185 @@
+dnl  AMD64 SSSE3 mpn_popcount -- population count.
+
+dnl  Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		    cycles/limb	  good for cpu?
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1	     1.79-1.91		n
+C AMD bd2	     1.73-1.85		n
+C AMD bd3		 ?
+C AMD bd4	     1.73-1.85		n
+C AMD zen		 1.47		n
+C AMD bobcat		 8.0		n
+C AMD jaguar		 4.78		n
+C Intel P4		n/a
+C Intel CNR		 3.75
+C Intel PNR		 2.61		y
+C Intel NHM		 2.03		n
+C Intel SBR		 1.87		n
+C Intel IBR	     1.52-1.58		n
+C Intel HWL	     1.52-1.58		n
+C Intel BWL	     1.52-1.58		n
+C Intel SKL		 1.51		n
+C Intel atom		12.3		n
+C Intel SLM		 9.1		n
+C VIA nano		 ?
+
+C TODO
+C  * This was hand-written without too much thought about optimal insn
+C    selection; check to see if it can be improved.
+C  * Consider doing some instruction scheduling.
+
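+C The counting kernel is the SSSE3 nibble-lookup method: each byte is
+C split into its low and high nibble, pshufb maps every nibble to its
+C bit count through a 16-byte table, paddb accumulates per-byte sums,
+C and psadbw/paddq periodically fold them into two 64-bit counters in
+C %xmm8.
+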
+define(`up',		`%rdi')
+define(`n',		`%rsi')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+	lea	L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
+	     `define(`OFF1',64) define(`OFF2',80)')
+	movdqa	OFF1`'(%r9), %xmm7
+	movdqa	OFF2`'(%r9), %xmm6
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5
+	pxor	%xmm8, %xmm8
+
+	mov	R32(n), R32(%rax)
+	and	$7, R32(%rax)
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
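+C Entry points L(1)..L(7) handle the n mod 8 leftover limbs: each one
+C biases `up' so that the fixed displacements in the unrolled body
+C address the first limbs, then jumps into the body at the matching
+C position.
+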
+L(1):	movq	(up), %xmm1
+	add	$8, up
+	jmp	L(e1)
+
+L(2):	add	$-48, up
+	jmp	L(e2)
+
+L(3):	movq	(up), %xmm1
+	add	$-40, up
+	jmp	L(e3)
+
+L(4):	add	$-32, up
+	jmp	L(e4)
+
+L(5):	movq	(up), %xmm1
+	add	$-24, up
+	jmp	L(e5)
+
+L(6):	add	$-16, up
+	jmp	L(e6)
+
+L(7):	movq	(up), %xmm1
+	add	$-8, up
+	jmp	L(e7)
+
+	ALIGN(32)
+L(top):	lddqu	(up), %xmm1
+L(e7):	movdqa	%xmm6, %xmm0		C copy mask register
+	movdqa	%xmm7, %xmm2		C copy count register
+	movdqa	%xmm7, %xmm3		C copy count register
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e6):	lddqu	16(up), %xmm1
+L(e5):	movdqa	%xmm6, %xmm0
+	movdqa	%xmm7, %xmm2
+	movdqa	%xmm7, %xmm3
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e4):	lddqu	32(up), %xmm1
+L(e3):	movdqa	%xmm6, %xmm0
+	movdqa	%xmm7, %xmm2
+	movdqa	%xmm7, %xmm3
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	paddb	%xmm2, %xmm3
+	paddb	%xmm3, %xmm4
+L(e2):	lddqu	48(up), %xmm1
+	add	$64, up
+L(e1):	movdqa	%xmm6, %xmm0
+	movdqa	%xmm7, %xmm2
+	movdqa	%xmm7, %xmm3
+	pand	%xmm1, %xmm0
+	psrlw	$4, %xmm1
+	pand	%xmm6, %xmm1
+	pshufb	%xmm0, %xmm2
+	pshufb	%xmm1, %xmm3
+	psadbw	%xmm5, %xmm4		C sum to 8 x 16-bit counts
+	paddb	%xmm2, %xmm3
+	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
+	movdqa	%xmm3, %xmm4
+	sub	$8, n
+	jg	L(top)
+
+	psadbw	%xmm5, %xmm4
+	paddq	%xmm4, %xmm8
+	pshufd	$14, %xmm8, %xmm0
+	paddq	%xmm8, %xmm0
+	movq	%xmm0, %rax
+	ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+	JMPENT(	L(top), L(cnsts))
+	JMPENT(	L(1), L(cnsts))
+	JMPENT(	L(2), L(cnsts))
+	JMPENT(	L(3), L(cnsts))
+	JMPENT(	L(4), L(cnsts))
+	JMPENT(	L(5), L(cnsts))
+	JMPENT(	L(6), L(cnsts))
+	JMPENT(	L(7), L(cnsts))
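+C Per-nibble bit counts for 0..15 (the pshufb table loaded into %xmm7),
+C followed by the 0x0f low-nibble mask (loaded into %xmm6):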
+	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
diff --git a/third_party/gmp/mpn/x86_64/core2/redc_1.asm b/third_party/gmp/mpn/x86_64/core2/redc_1.asm
new file mode 100644
index 0000000..8c296fd
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/redc_1.asm
@@ -0,0 +1,430 @@
+dnl  X86-64 mpn_redc_1 optimised for Intel Conroe and Wolfdale.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 ?
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 4.5  (fluctuating)
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Micro-optimise; none performed thus far.
+C  * Consider inlining mpn_add_n.
+C  * Single basecases out before the pushes.
+C  * Keep up[i] in registers for basecases (might require pushes).
+
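+C Algorithm: standard Montgomery REDC.  Each of the n outer iterations
+C computes q0 = up[0] * u0inv mod 2^64 (u0inv = -1/mp[0] mod 2^64), so
+C that adding q0 * mp zeroes the low limb of up; after all limbs have
+C been shifted out, a final mpn_add_n combines the two remaining n-limb
+C pieces.
+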
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r14')
+define(`j',           `%r15')
+define(`mp',          `%r12')
+define(`q0',          `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C  X  q0'  n   X  rp  up      u0i           mp   q0 i   j
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), q0
+	mov	n, j			C outer loop induction var
+	lea	(mp_param,n,8), mp
+	lea	-16(up,n,8), up
+	neg	n
+	imul	u0inv, q0		C first iteration q0
+
+	test	$1, R8(n)
+	jz	L(b0)
+
+L(b1):	cmp	$-1, R32(n)
+	jz	L(n1)
+	cmp	$-3, R32(n)
+	jz	L(n3)
+
+	push	rp
+
+L(otp1):lea	3(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	lea	(%rax), %rbp
+	mov	8(mp,n,8), %rax
+	lea	(%rdx), %r9
+	mul	q0
+	lea	(%rax), %r11
+	mov	16(mp,n,8), %rax
+	mov	16(up,n,8), %r10
+	lea	(%rdx), %rdi
+	mul	q0
+	add	%rbp, %r10
+	lea	(%rax), %rbp
+	mov	24(mp,n,8), %rax
+	adc	%r9, %r11
+	mov	24(up,n,8), %rbx
+	lea	(%rdx), %r9
+	adc	$0, %rdi
+	mul	q0
+	add	%r11, %rbx
+	lea	(%rax), %r11
+	mov	32(mp,n,8), %rax
+	adc	%rdi, %rbp
+	mov	%rbx, 24(up,n,8)
+	mov	32(up,n,8), %r10
+	lea	(%rdx), %rdi
+	adc	$0, %r9
+	imul	u0inv, %rbx		C next q limb
+	add	$2, i
+	jns	L(ed1)
+
+	ALIGNx
+L(tp1):	mul	q0
+	add	%rbp, %r10
+	lea	(%rax), %rbp
+	mov	(mp,i,8), %rax
+	adc	%r9, %r11
+	mov	%r10, -8(up,i,8)
+	mov	(up,i,8), %r10
+	lea	(%rdx), %r9
+	adc	$0, %rdi
+	mul	q0
+	add	%r11, %r10
+	lea	(%rax), %r11
+	mov	8(mp,i,8), %rax
+	adc	%rdi, %rbp
+	mov	%r10, (up,i,8)
+	mov	8(up,i,8), %r10
+	lea	(%rdx), %rdi
+	adc	$0, %r9
+	add	$2, i
+	js	L(tp1)
+
+L(ed1):	mul	q0
+	add	%rbp, %r10
+	adc	%r9, %r11
+	mov	%r10, I(-8(up),-8(up,i,8))
+	mov	I((up),(up,i,8)), %r10
+	adc	$0, %rdi
+	add	%r11, %r10
+	adc	%rdi, %rax
+	mov	%r10, I((up),(up,i,8))
+	mov	I(8(up),8(up,i,8)), %r10
+	adc	$0, %rdx
+	add	%rax, %r10
+	mov	%r10, I(8(up),8(up,i,8))
+	adc	$0, %rdx
+	mov	%rdx, 16(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp1)
+	jmp	L(cj)
+
+L(b0):	cmp	$-2, R32(n)
+	jz	L(n2)
+	cmp	$-4, R32(n)
+	jz	L(n4)
+
+	push	rp
+
+L(otp0):lea	4(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	lea	(%rax), %r11
+	mov	8(mp,n,8), %rax
+	lea	(%rdx), %rdi
+	mul	q0
+	lea	(%rax), %rbp
+	mov	16(mp,n,8), %rax
+	mov	16(up,n,8), %r10
+	lea	(%rdx), %r9
+	mul	q0
+	add	%r11, %r10
+	lea	(%rax), %r11
+	mov	24(mp,n,8), %rax
+	adc	%rdi, %rbp
+	mov	24(up,n,8), %rbx
+	lea	(%rdx), %rdi
+	adc	$0, %r9
+	mul	q0
+	add	%rbp, %rbx
+	lea	(%rax), %rbp
+	mov	32(mp,n,8), %rax
+	adc	%r9, %r11
+	mov	%rbx, 24(up,n,8)
+	mov	32(up,n,8), %r10
+	lea	(%rdx), %r9
+	adc	$0, %rdi
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e0)
+
+	ALIGNx
+L(tp0):	mul	q0
+	add	%rbp, %r10
+	lea	(%rax), %rbp
+	mov	(mp,i,8), %rax
+	adc	%r9, %r11
+	mov	%r10, -8(up,i,8)
+	mov	(up,i,8), %r10
+	lea	(%rdx), %r9
+	adc	$0, %rdi
+L(e0):	mul	q0
+	add	%r11, %r10
+	lea	(%rax), %r11
+	mov	8(mp,i,8), %rax
+	adc	%rdi, %rbp
+	mov	%r10, (up,i,8)
+	mov	8(up,i,8), %r10
+	lea	(%rdx), %rdi
+	adc	$0, %r9
+	add	$2, i
+	js	L(tp0)
+
+L(ed0):	mul	q0
+	add	%rbp, %r10
+	adc	%r9, %r11
+	mov	%r10, I(-8(up),-8(up,i,8))
+	mov	I((up),(up,i,8)), %r10
+	adc	$0, %rdi
+	add	%r11, %r10
+	adc	%rdi, %rax
+	mov	%r10, I((up),(up,i,8))
+	mov	I(8(up),8(up,i,8)), %r10
+	adc	$0, %rdx
+	add	%rax, %r10
+	mov	%r10, I(8(up),8(up,i,8))
+	adc	$0, %rdx
+	mov	%rdx, 16(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp0)
+
+L(cj):	lea	16(up), up		C FIXME
+	pop	rp
+L(add_n):
+IFSTD(`	lea	(up,n,8), up		C param 2: up
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
+	lea	(%rdx,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	rp, %rcx	')	C param 1: rp
+
+IFSTD(`	sub	$8, %rsp	')
+IFDOS(`	sub	$40, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(n1):	mov	(mp_param), %rax
+	mul	q0
+	add	8(up), %rax
+	adc	16(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n2):	mov	(mp_param), %rax
+	mov	(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	8(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, q0
+	imul	u0inv, q0		C next q0
+	mov	-16(mp), %rax
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	16(up), %r14
+	mul	q0
+	add	%rax, %r14
+	adc	$0, %rdx
+	add	%r9, %r14
+	adc	$0, %rdx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	24(up), %rdx
+	mov	%r14, (rp)
+	mov	%rdx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n3):	mov	-24(mp), %rax
+	mov	-8(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	-16(mp), %rax
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	add	%r11, %rbp
+	mov	8(up), %r10
+	adc	$0, %r9
+	mul	q0
+	mov	%rbp, q0
+	imul	u0inv, q0		C next q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	%rbp, (up)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, 8(up)
+	mov	%r11, -8(up)		C up[0]
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n3)
+
+	mov	-32(up), %rdx
+	mov	-24(up), %rbx
+	xor	R32(%rax), R32(%rax)
+	add	%rbp, %rdx
+	adc	%r10, %rbx
+	adc	8(up), %r11
+	mov	%rdx, (rp)
+	mov	%rbx, 8(rp)
+	mov	%r11, 16(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n4):	mov	-32(mp), %rax
+	mul	q0
+	lea	(%rax), %r11
+	mov	-24(mp), %rax
+	lea	(%rdx), %r14
+	mul	q0
+	lea	(%rax), %rbp
+	mov	-16(mp), %rax
+	mov	-16(up), %r10
+	lea	(%rdx), %r9
+	mul	q0
+	add	%r11, %r10
+	lea	(%rax), %r11
+	mov	-8(mp), %rax
+	adc	%r14, %rbp
+	mov	-8(up), %rbx
+	lea	(%rdx), %r14
+	adc	$0, %r9
+	mul	q0
+	add	%rbp, %rbx
+	adc	%r9, %r11
+	mov	%rbx, -8(up)
+	mov	(up), %r10
+	adc	$0, %r14
+	imul	u0inv, %rbx		C next q limb
+	add	%r11, %r10
+	adc	%r14, %rax
+	mov	%r10, (up)
+	mov	8(up), %r10
+	adc	$0, %rdx
+	add	%rax, %r10
+	mov	%r10, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, -16(up)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n4)
+	lea	16(up), up
+	jmp	L(add_n)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/core2/rsh1aors_n.asm b/third_party/gmp/mpn/x86_64/core2/rsh1aors_n.asm
new file mode 100644
index 0000000..27eed37
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/rsh1aors_n.asm
@@ -0,0 +1,169 @@
+dnl  X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Conroe/Penryn.
+
+dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 3.05
+C Intel NHM	 3.3
+C Intel SBR	 2.5
+C Intel atom	 ?
+C VIA nano	 ?
+
+C TODO
+C  * Loopmix to approach 2.5 c/l on NHM.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n',  `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func_n,	      mpn_rsh1add_n)
+	define(func_nc,	      mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func_n,	      mpn_rsh1sub_n)
+	define(func_nc,	      mpn_rsh1sub_nc)')
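+
+C A single source yields both functions: (U + V) >> 1 via add/adc or
+C (U - V) >> 1 via sub/sbb, selected by the OPERATION_* symbol at m4
+C time; the carry/borrow saved in %rbx supplies the top result bit.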
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+
+	neg	%r8			C set C flag from parameter
+	mov	(up), %r8
+	ADCSBB	(vp), %r8
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %r8
+	ADDSUB	(vp), %r8
+L(ent):	sbb	R32(%rbx), R32(%rbx)	C save cy
+	mov	%r8, %rax
+	and	$1, R32(%rax)		C return value
+
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	lea	(rp,n,8), rp
+	mov	R32(n), R32(%rbp)
+	neg	n
+	and	$3, R32(%rbp)
+	jz	L(b0)
+	cmp	$2, R32(%rbp)
+	jae	L(n1)
+
+L(b1):	mov	%r8, %rbp
+	inc	n
+	js	L(top)
+	jmp	L(end)
+
+L(n1):	jnz	L(b3)
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up,n,8), %r11
+	ADCSBB	8(vp,n,8), %r11
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+	mov	%r8, %r10
+	add	$-2, n
+	jmp	L(2)
+
+L(b3):	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up,n,8), %r10
+	mov	16(up,n,8), %r11
+	ADCSBB	8(vp,n,8), %r10
+	ADCSBB	16(vp,n,8), %r11
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+	mov	%r8, %r9
+	dec	n
+	jmp	L(3)
+
+L(b0):	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up,n,8), %r9
+	mov	16(up,n,8), %r10
+	mov	24(up,n,8), %r11
+	ADCSBB	8(vp,n,8), %r9
+	ADCSBB	16(vp,n,8), %r10
+	ADCSBB	24(vp,n,8), %r11
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+	jmp	L(4)
+
+	ALIGN(16)
+
+L(top):	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	(up,n,8), %r8
+	mov	8(up,n,8), %r9
+	mov	16(up,n,8), %r10
+	mov	24(up,n,8), %r11
+	ADCSBB	(vp,n,8), %r8
+	ADCSBB	8(vp,n,8), %r9
+	ADCSBB	16(vp,n,8), %r10
+	ADCSBB	24(vp,n,8), %r11
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+	shrd	$1, %r8, %rbp
+	mov	%rbp, -8(rp,n,8)
+L(4):	shrd	$1, %r9, %r8
+	mov	%r8, (rp,n,8)
+L(3):	shrd	$1, %r10, %r9
+	mov	%r9, 8(rp,n,8)
+L(2):	shrd	$1, %r11, %r10
+	mov	%r10, 16(rp,n,8)
+L(1):	add	$4, n
+	mov	%r11, %rbp
+	js	L(top)
+
+L(end):	shrd	$1, %rbx, %rbp
+	mov	%rbp, -8(rp)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/rshift.asm b/third_party/gmp/mpn/x86_64/core2/rshift.asm
new file mode 100644
index 0000000..7578a53
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/rshift.asm
@@ -0,0 +1,143 @@
+dnl  x86-64 mpn_rshift optimised for Conroe/Penryn and Nehalem.
+
+dnl  Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2	 1.32
+C Intel NHM	 1.30	(drops to 2.5 for n > 256)
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
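+C The shrd loop is 4-way unrolled; L(b00)..L(b11) dispatch on n mod 4
+C to align the leftover limbs, and %rax returns the bits shifted out at
+C the low end, per the mpn_rshift convention.
+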
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_rshift)
+	FUNC_ENTRY(4)
+
+	xor	R32(%rax), R32(%rax)
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	lea	8(up), up
+	lea	-24(rp), rp
+	mov	-8(up), %r10
+	mov	(up), %r11
+	shrd	R8(cnt), %r10, %rax
+	mov	8(up), %r8
+	shr	$2, n
+	jmp	L(00)
+
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
+
+L(b01):	lea	16(up), up
+	lea	-16(rp), rp
+	mov	-16(up), %r9
+	shrd	R8(cnt), %r9, %rax
+	shr	$2, n
+	jz	L(1)
+	mov	-8(up), %r10
+	mov	(up), %r11
+	jmp	L(01)
+
+L(b10):	lea	24(up), up
+	lea	-8(rp), rp
+	mov	-24(up), %r8
+	mov	-16(up), %r9
+	shrd	R8(cnt), %r8, %rax
+	shr	$2, n
+	jz	L(2)
+	mov	-8(up), %r10
+	jmp	L(10)
+
+L(b11):	lea	32(up), up
+	mov	-32(up), %r11
+	mov	-24(up), %r8
+	mov	-16(up), %r9
+	shrd	R8(cnt), %r11, %rax
+	shr	$2, n
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	shrd	R8(cnt), %r8, %r11
+	mov	-8(up), %r10
+	mov	%r11, (rp)
+L(10):	shrd	R8(cnt), %r9, %r8
+	mov	(up), %r11
+	mov	%r8, 8(rp)
+L(01):	shrd	R8(cnt), %r10, %r9
+	mov	8(up), %r8
+	mov	%r9, 16(rp)
+L(00):	shrd	R8(cnt), %r11, %r10
+	mov	16(up), %r9
+	add	$32, up
+	mov	%r10, 24(rp)
+	add	$32, rp
+	dec	n
+	jnz	L(top)
+
+L(end):	shrd	R8(cnt), %r8, %r11
+	mov	%r11, (rp)
+L(2):	shrd	R8(cnt), %r9, %r8
+	mov	%r8, 8(rp)
+L(1):	shr	R8(cnt), %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/core2/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sec_tabselect.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/core2/sqr_basecase.asm
new file mode 100644
index 0000000..a112c1b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/sqr_basecase.asm
@@ -0,0 +1,984 @@
+dnl  X86-64 mpn_sqr_basecase optimised for Intel Nehalem/Westmere.
+dnl  It also seems good for Conroe/Wolfdale.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core	 4.9		4.18-4.25		 3.87
+C Intel NHM	 3.8		4.06-4.2		 3.5
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
+C           |               |               |               |
+C           |               |               |               |
+C           |               |               |               |
+C          \|/             \|/             \|/             \|/
+C              ____________                   ____________
+C             /            \                 /            \
+C            \|/            \               \|/            \
+C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
+C            \            /|\                \            /|\
+C             \____________/                  \____________/
+C                       \                        /
+C                        \                      /
+C                         \                    /
+C                       tail(0m2)          tail(1m2)
+C                            \              /
+C                             \            /
+C                            sqr_diag_addlsh1
+
+C TODO
+C  * Tune.  None done so far.
+C  * Currently 2761 bytes; making it smaller would be nice.
+C  * Consider using a jumptab-based entry sequence.  One might even use a mask-
+C    less sequence, if the table is large enough to support tuneup's needs.
+C    The code would be, using non-PIC code,
+C        lea tab(%rip),%rax; jmp *(n,%rax)
+C    or,
+C        lea tab(%rip),%rax; lea (%rip),%rbx; add (n,%rax),%rbx; jmp *%rbx
+C    using PIC code.  The table entries would be Ln1,Ln2,Ln3,Lm0,Lm1,Lm2,Lm3,..
+C    with the last four entries repeated a safe number of times.
+C  * Consider expanding feed-in code in order to avoid zeroing registers.
+C  * Zero consistently with xor.
+C  * Check if using "lea (reg),reg" should be done in more places; we have some
+C    explicit "mov %rax,reg" now.
+C  * Try zeroing with xor in m2 loops.
+C  * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C    between loop header and wind-down code.
+C  * Consider adc reg,reg instead of adc $0,reg in m2 loops.  This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
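+C For example, "adc	Z(i,$0), %rdx" normally assembles as an adc of the
+C register i, exploiting that i has reached zero when the wind-down
+C code runs; defining `Z' as $2 uses an immediate $0 instead.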
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`n_param',  `%rdx')
+
+define(`n',        `%r8')
+
+define(`v0',       `%r10')
+define(`v1',       `%r11')
+define(`w0',       `%rbx')
+define(`w1',       `%rcx')
+define(`w2',       `%rbp')
+define(`w3',       `%r9')
+define(`i',        `%r13')
+
+define(`X0',       `%r12')
+define(`X1',       `%r14')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov	$1, $2',`lea	($1), $2')')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$4, n_param
+	jl	L(small)
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	(up), v0
+	mov	8(up), %rax
+	mov	%rax, v1
+
+	mov	$1, R32(n)
+	sub	n_param, n		C n = -n_param+1
+	push	n
+
+	lea	(up,n_param,8), up
+	lea	(rp,n_param,8), rp
+
+	mul	v0
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	mov	%rax, (rp,n,8)
+	jnz	L(b10)
+
+L(b00):	lea	(n), i			C n = 5, 9, ...
+	mov	%rdx, w1		C FIXME: Use lea?
+	xor	R32(w2), R32(w2)
+	jmp	L(m2e0)
+
+L(b10):	lea	2(n), i			C n = 7, 11, ...
+	mov	8(up,n,8), %rax
+	mov	%rdx, w3		C FIXME: Use lea?
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	jmp	L(m2e2)
+
+L(bx1):	test	$2, R8(n)
+	mov	%rax, (rp,n,8)
+	jz	L(b11)
+
+L(b01):	lea	1(n), i			C n = 6, 10, ...
+	mov	%rdx, w0		C FIXME: Use lea?
+	xor	R32(w1), R32(w1)
+	jmp	L(m2e1)
+
+L(b11):	lea	-1(n), i		C n = 4, 8, 12, ...
+	mov	%rdx, w2		C FIXME: Use lea?
+	xor	R32(w3), R32(w3)
+	jmp	L(m2e3)
+
+
+	ALIGNx
+L(m2top1):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+L(m2e1):mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top1)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	w0, %rax
+	adc	w1, %rdx
+	mov	%rax, I((rp),(rp,i,8))
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n			C decrease |n|
+	jmp	L(am2o3)
+
+	ALIGNx
+L(m2top3):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+L(m2e3):mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top3)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	w0, %rax
+	adc	w1, %rdx
+	mov	%rax, I((rp),(rp,i,8))
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n			C decrease |n|
+	cmp	$-1, n
+	jz	L(cor1)			C jumps iff entry n = 4
+
+L(am2o1):
+	mov	-8(up,n,8), v0
+	mov	(up,n,8), %rax
+	mov	%rax, v1
+	lea	1(n), i
+	mul	v0
+	mov	%rax, X1
+	MOV(	%rdx, X0, 128)
+	mov	(rp,n,8), w1
+	xor	R32(w2), R32(w2)
+	mov	8(up,n,8), %rax
+	xor	R32(w3), R32(w3)
+	jmp	L(lo1)
+
+	ALIGNx
+L(am2top1):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+L(lo1):	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top1)
+
+	mul	v1
+	add	w0, w1
+	adc	w2, %rax
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	X0, %rax
+	mov	%rax, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n
+
+L(am2o3):
+	mov	-8(up,n,8), v0
+	mov	(up,n,8), %rax
+	mov	%rax, v1
+	lea	-1(n), i
+	mul	v0
+	mov	%rax, X1
+	MOV(	%rdx, X0, 8)
+	mov	(rp,n,8), w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	8(up,n,8), %rax
+	jmp	L(lo3)
+
+	ALIGNx
+L(am2top3):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+L(lo3):	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top3)
+
+	mul	v1
+	add	w0, w1
+	adc	w2, %rax
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	X0, %rax
+	mov	%rax, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n
+	cmp	$-1, n
+	jnz	L(am2o1)
+
+L(cor1):pop	n
+	mov	%rdx, w3
+	mov	-16(up), v0
+	mov	-8(up), %rax
+	mul	v0
+	add	w3, %rax
+	adc	$0, %rdx
+	mov	%rax, -8(rp)
+	mov	%rdx, (rp)
+	jmp	L(sqr_diag_addlsh1)
+
+	ALIGNx
+L(m2top2):
+L(m2e2):mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top2)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	w0, %rax
+	adc	w1, %rdx
+	mov	%rax, I((rp),(rp,i,8))
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n			C decrease |n|
+	jmp	L(am2o0)
+
+	ALIGNx
+L(m2top0):
+	mul	v0
+	add	%rax, w3
+	mov	-8(up,i,8), %rax
+	mov	w3, -8(rp,i,8)
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w0
+	mov	w0, (rp,i,8)
+	adc	%rdx, w1
+	mov	(up,i,8), %rax
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+L(m2e0):mov	8(up,i,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	8(up,i,8), %rax
+	mul	v1
+	add	%rax, w2
+	mov	w1, 8(rp,i,8)
+	adc	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	16(up,i,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	mov	$0, R32(w1)
+	add	%rax, w3
+	mov	24(up,i,8), %rax
+	mov	w2, 16(rp,i,8)
+	adc	%rdx, w0
+	add	$4, i
+	js	L(m2top0)
+
+	mul	v0
+	add	%rax, w3
+	mov	I(-8(up),-8(up,i,8)), %rax
+	mov	w3, I(-8(rp),-8(rp,i,8))
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	w0, %rax
+	adc	w1, %rdx
+	mov	%rax, I((rp),(rp,i,8))
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n			C decrease |n|
+	cmp	$-2, n
+	jz	L(cor2)			C jumps iff entry n = 5
+
+L(am2o2):
+	mov	-8(up,n,8), v0
+	mov	(up,n,8), %rax
+	mov	%rax, v1
+	lea	-2(n), i
+	mul	v0
+	mov	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	(rp,n,8), w0
+	xor	R32(w1), R32(w1)
+	xor	R32(w2), R32(w2)
+	mov	8(up,n,8), %rax
+	jmp	L(lo2)
+
+	ALIGNx
+L(am2top2):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+L(lo2):	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top2)
+
+	mul	v1
+	add	w0, w1
+	adc	w2, %rax
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	X0, %rax
+	mov	%rax, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n
+
+L(am2o0):
+	mov	-8(up,n,8), v0
+	mov	(up,n,8), %rax
+	mov	%rax, v1
+	lea	0(n), i
+	mul	v0
+	mov	%rax, X0
+	MOV(	%rdx, X1, 2)
+	xor	R32(w0), R32(w0)
+	mov	(rp,n,8), w2
+	xor	R32(w3), R32(w3)
+	jmp	L(lo0)
+
+	ALIGNx
+L(am2top0):
+	mul	v1
+	add	w0, w1
+	adc	%rax, w2
+	mov	(up,i,8), %rax
+	MOV(	%rdx, w3, 1)
+	adc	$0, w3
+	mul	v0
+	add	w1, X1
+	mov	X1, -8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 2)
+	adc	$0, X1
+	mov	(up,i,8), %rax
+	mul	v1
+	MOV(	%rdx, w0, 4)
+	mov	(rp,i,8), w1
+	add	w1, w2
+	adc	%rax, w3
+	adc	$0, w0
+L(lo0):	mov	8(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	adc	%rax, X1
+	mov	X0, (rp,i,8)
+	MOV(	%rdx, X0, 8)
+	adc	$0, X0
+	mov	8(up,i,8), %rax
+	mov	8(rp,i,8), w2
+	mul	v1
+	add	w2, w3
+	adc	%rax, w0
+	MOV(	%rdx, w1, 16)
+	adc	$0, w1
+	mov	16(up,i,8), %rax
+	mul	v0
+	add	w3, X1
+	mov	X1, 8(rp,i,8)
+	adc	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	16(rp,i,8), w3
+	adc	$0, X1
+	mov	16(up,i,8), %rax
+	mul	v1
+	add	w3, w0
+	MOV(	%rdx, w2, 64)
+	adc	%rax, w1
+	mov	24(up,i,8), %rax
+	adc	$0, w2
+	mul	v0
+	add	w0, X0
+	mov	X0, 16(rp,i,8)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	24(up,i,8), %rax
+	mov	24(rp,i,8), w0
+	adc	$0, X0
+	add	$4, i
+	jnc	L(am2top0)
+
+	mul	v1
+	add	w0, w1
+	adc	w2, %rax
+	adc	Z(i,$0), %rdx
+	add	w1, X1
+	adc	Z(i,$0), X0
+	mov	X1, I(-8(rp),-8(rp,i,8))
+	add	X0, %rax
+	mov	%rax, I((rp),(rp,i,8))
+	adc	Z(i,$0), %rdx
+	mov	%rdx, I(8(rp),8(rp,i,8))
+
+	lea	16(rp), rp
+	add	$2, n
+	cmp	$-2, n
+	jnz	L(am2o2)
+
+L(cor2):pop	n
+	mov	-24(up), v0
+	mov	%rax, w2
+	mov	%rdx, w0
+	mov	-16(up), %rax
+	mov	%rax, v1
+	mul	v0
+	mov	%rax, X0
+	MOV(	%rdx, X1, 32)
+	mov	-8(up), %rax
+	mul	v0
+	add	w2, X0
+	mov	X0, -16(rp)
+	MOV(	%rdx, X0, 128)
+	adc	%rax, X1
+	mov	-8(up), %rax
+	adc	$0, X0
+	mul	v1
+	add	w0, X1
+	adc	$0, X0
+	mov	X1, -8(rp)
+	add	X0, %rax
+	mov	%rax, (rp)
+	adc	$0, %rdx
+	mov	%rdx, 8(rp)
+	lea	8(rp), rp
+
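+C A hedged reading of the final pass below: at this point rp[] holds the
+C sum of the off-diagonal products u[i]*u[j], i < j.  The loop doubles
+C that sum on the fly (the "addlsh1" part) while adding in the diagonal
+C squares u[i]^2, carrying the doubling overflow via %rbx.  Note that
+C `shl n' doubles the (negative) index, so the (up,n,4) addressing below
+C still steps one limb per iteration, since the loop advances n by 2.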
+L(sqr_diag_addlsh1):
+	mov	-8(up,n,8), %rax
+	shl	n
+	xor	R32(%rbx), R32(%rbx)
+	mul	%rax
+	mov	8(rp,n,8), %r11
+	lea	(%rdx), %r10
+	mov	16(rp,n,8), %r9
+	add	%r11, %r11
+	jmp	L(dm)
+
+	ALIGNx
+L(dtop):mul	%rax
+	add	%r11, %r10
+	mov	8(rp,n,8), %r11
+	mov	%r10, -8(rp,n,8)
+	adc	%r9, %rax
+	lea	(%rdx,%rbx), %r10
+	mov	16(rp,n,8), %r9
+	adc	%r11, %r11
+L(dm):	mov	%rax, (rp,n,8)
+	mov	(up,n,4), %rax
+	adc	%r9, %r9
+	setc	R8(%rbx)
+	add	$2, n
+	js	L(dtop)
+
+	mul	%rax
+	add	%r11, %r10
+	mov	%r10, -8(rp)
+	adc	%r9, %rax
+	lea	(%rdx,%rbx), %r10
+	mov	%rax, (rp)
+	adc	$0, %r10
+	mov	%r10, 8(rp)
+
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(small):
+	mov	(up), %rax
+	cmp	$2, n_param
+	jae	L(gt1)
+L(n1):
+	mul	%rax
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	jne	L(gt2)
+L(n2):	mov	%rax, %r8
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, %r9
+	mul	%rax
+	mov	%rax, %r10
+	mov	%r11, %rax
+	mov	%rdx, %r11
+	mul	%r8
+	xor	%r8, %r8
+	add	%rax, %r9
+	adc	%rdx, %r10
+	adc	%r8, %r11
+	add	%rax, %r9
+	mov	%r9, 8(rp)
+	adc	%rdx, %r10
+	mov	%r10, 16(rp)
+	adc	%r8, %r11
+	mov	%r11, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt2):
+L(n3):	mov	%rax, %r10
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax
+	mov	16(up), %rcx
+	mov	%rax, 16(rp)
+	mov	%rcx, %rax
+	mov	%rdx, 24(rp)
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+
+	mov	%r11, %rax
+	mul	%r10
+	mov	%rax, %r8
+	mov	%rcx, %rax
+	mov	%rdx, %r9
+	mul	%r10
+	xor	%r10, %r10
+	add	%rax, %r9
+	mov	%r11, %rax
+	mov	%r10, %r11
+	adc	%rdx, %r10
+
+	mul	%rcx
+	add	%rax, %r10
+	adc	%r11, %rdx
+	add	%r8, %r8
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%rdx, %rdx
+	adc	%r11, %r11
+	add	%r8, 8(rp)
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%rdx, 32(rp)
+	adc	%r11, 40(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/core2/sublsh1_n.asm b/third_party/gmp/mpn/x86_64/core2/sublsh1_n.asm
new file mode 100644
index 0000000..46488fc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/sublsh1_n.asm
@@ -0,0 +1,47 @@
+dnl  AMD64 mpn_sublsh1_n optimised for Core 2 and Core iN.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+define(ADDSUB,	sub)
+define(ADCSBB,	sbb)
+define(func,	mpn_sublsh1_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/sublsh2_n.asm b/third_party/gmp/mpn/x86_64/core2/sublsh2_n.asm
new file mode 100644
index 0000000..f3b1e28
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/sublsh2_n.asm
@@ -0,0 +1,47 @@
+dnl  AMD64 mpn_sublsh2_n optimised for Core 2 and Core iN.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+define(ADDSUB,	sub)
+define(ADCSBB,	sbb)
+define(func,	mpn_sublsh2_n)
+
+MULFUNC_PROLOGUE(mpn_sublsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/core2/sublshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/core2/sublshC_n.asm b/third_party/gmp/mpn/x86_64/core2/sublshC_n.asm
new file mode 100644
index 0000000..272700d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/core2/sublshC_n.asm
@@ -0,0 +1,158 @@
+dnl  AMD64 mpn_sublshC_n -- rp[] = up[] - (vp[] << C), optimised for Core 2 and
+dnl  Core iN.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C	     cycles/limb
+C AMD K8,K9	 4.25
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 3
+C Intel NHM	 3.1
+C Intel SBR	 2.47
+C Intel atom	 ?
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
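+
+C A minimal C-level sketch of the operation this file implements,
+C {rp,n} = {up,n} - ({vp,n} << LSH), returning the borrow plus the bits
+C shifted out of vp[n-1].  This is an illustrative assumption (64-bit
+C limbs), not upstream reference code; ref_sublshC_n is a hypothetical
+C name:
+C
+C	mp_limb_t
+C	ref_sublshC_n (mp_limb_t *rp, const mp_limb_t *up,
+C	               const mp_limb_t *vp, mp_size_t n)
+C	{
+C	  mp_limb_t out = 0, brw = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t s = (vp[i] << LSH) | out;	/* shifted vp limb */
+C	      out = vp[i] >> RSH;			/* RSH = 64 - LSH */
+C	      mp_limb_t d = up[i] - s;
+C	      mp_limb_t b = up[i] < s;			/* limb borrow */
+C	      rp[i] = d - brw;
+C	      brw = b + (d < brw);			/* propagate borrow */
+C	    }
+C	  return brw + out;
+C	}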
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%r12
+
+	mov	R32(%rcx), R32(%rax)
+	lea	24(up,n,8), up
+	lea	24(vp,n,8), vp
+	lea	24(rp,n,8), rp
+	neg	n
+
+	xor	R32(%r11), R32(%r11)
+
+	mov	-24(vp,n,8), %r8	C do first limb early
+	shrd	$RSH, %r8, %r11
+
+	and	$3, R32(%rax)
+	je	L(b0)
+	cmp	$2, R32(%rax)
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	mov	-16(vp,n,8), %r9
+	shrd	$RSH, %r9, %r8
+	mov	-8(vp,n,8), %r10
+	shrd	$RSH, %r10, %r9
+	mov	-24(up,n,8), %r12
+	ADDSUB	%r11, %r12
+	mov	%r12, -24(rp,n,8)
+	mov	-16(up,n,8), %r12
+	ADCSBB	%r8, %r12
+	mov	%r12, -16(rp,n,8)
+	mov	-8(up,n,8), %r12
+	ADCSBB	%r9, %r12
+	mov	%r12, -8(rp,n,8)
+	mov	%r10, %r11
+	sbb	R32(%rax), R32(%rax)	C save cy
+	add	$3, n
+	js	L(top)
+	jmp	L(end)
+
+L(b1):	mov	-24(up,n,8), %r12
+	ADDSUB	%r11, %r12
+	mov	%r12, -24(rp,n,8)
+	mov	%r8, %r11
+	sbb	R32(%rax), R32(%rax)	C save cy
+	inc	n
+	js	L(top)
+	jmp	L(end)
+
+L(b2):	mov	-16(vp,n,8), %r9
+	shrd	$RSH, %r9, %r8
+	mov	-24(up,n,8), %r12
+	ADDSUB	%r11, %r12
+	mov	%r12, -24(rp,n,8)
+	mov	-16(up,n,8), %r12
+	ADCSBB	%r8, %r12
+	mov	%r12, -16(rp,n,8)
+	mov	%r9, %r11
+	sbb	R32(%rax), R32(%rax)	C save cy
+	add	$2, n
+	js	L(top)
+	jmp	L(end)
+
+	ALIGN(16)
+L(top):	mov	-24(vp,n,8), %r8
+	shrd	$RSH, %r8, %r11
+L(b0):	mov	-16(vp,n,8), %r9
+	shrd	$RSH, %r9, %r8
+	mov	-8(vp,n,8), %r10
+	shrd	$RSH, %r10, %r9
+	mov	(vp,n,8), %rbx
+	shrd	$RSH, %rbx, %r10
+
+	add	R32(%rax), R32(%rax)	C restore cy
+
+	mov	-24(up,n,8), %r12
+	ADCSBB	%r11, %r12
+	mov	%r12, -24(rp,n,8)
+
+	mov	-16(up,n,8), %r12
+	ADCSBB	%r8, %r12
+	mov	%r12, -16(rp,n,8)
+
+	mov	-8(up,n,8), %r12
+	ADCSBB	%r9, %r12
+	mov	%r12, -8(rp,n,8)
+
+	mov	(up,n,8), %r12
+	ADCSBB	%r10, %r12
+	mov	%r12, (rp,n,8)
+
+	mov	%rbx, %r11
+	sbb	R32(%rax), R32(%rax)	C save cy
+
+	add	$4, n
+	js	L(top)
+
+L(end):	shr	$RSH, %r11
+	pop	%r12
+	pop	%rbx
+	sub	R32(%r11), R32(%rax)
+	neg	R32(%rax)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreibwl/addmul_1.asm b/third_party/gmp/mpn/x86_64/coreibwl/addmul_1.asm
new file mode 100644
index 0000000..ee7e4ee
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreibwl/addmul_1.asm
@@ -0,0 +1,208 @@
+dnl  AMD64 mpn_addmul_1 optimised for Intel Broadwell.
+
+dnl  Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bd1	n/a
+C AMD bd2	n/a
+C AMD bd3	n/a
+C AMD bd4	 ?
+C AMD zen	 ?
+C AMD bt1	n/a
+C AMD bt2	n/a
+C Intel P4	n/a
+C Intel PNR	n/a
+C Intel NHM	n/a
+C Intel SBR	n/a
+C Intel IBR	n/a
+C Intel HWL	n/a
+C Intel BWL	 1.67	 1.74
+C Intel SKL	 1.63	 1.71
+C Intel atom	n/a
+C Intel SLM	n/a
+C VIA nano	n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Put an initial mulx before switching, targeting some free registers.
+C  * Tune feed-in code.
+C  * Trim nop execution after L(f2).
+C  * For DOS64, fix nop execution.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0_param',`%rcx')   C r9
+
+define(`n',       `%rcx')
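+
+C A hedged C-level sketch of the operation, {rp,n} += {up,n} * v0 with the
+C carry limb returned.  This is an orientation aid assuming 64-bit limbs,
+C not upstream reference code; ref_addmul_1 is a hypothetical name:
+C
+C	mp_limb_t
+C	ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+C	              mp_limb_t v0)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
+C	      rp[i] = (mp_limb_t) t;		/* low limb */
+C	      cy = (mp_limb_t) (t >> 64);	/* high limb becomes carry */
+C	    }
+C	  return cy;
+C	}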
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(`	define(`up', ``%rsi'')	') dnl
+dnl IFDOS(`	define(`rp', ``%rcx'')	') dnl
+dnl IFDOS(`	define(`vl', ``%r9'')	') dnl
+dnl IFDOS(`	define(`r9', ``rdi'')	') dnl
+dnl IFDOS(`	define(`n',  ``%r8'')	') dnl
+dnl IFDOS(`	define(`r8', ``r11'')	') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_addmul_1)
+	FUNC_ENTRY(4)
+
+	mov	v0_param, %r10
+	mov	n_param, n
+	mov	R32(n_param), R32(%r8)
+	shr	$3, n
+	and	$7, R32(%r8)		C clear OF, CF as side-effect
+	mov	%r10, %rdx
+	lea	L(tab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%r8,4), %r8
+	lea	(%r8, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%r8,8)
+')
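+C The table below dispatches on n mod 8 so that the 8-way unrolled loop is
+C entered at the matching residue; the PIC variant loads a sign-extended
+C 32-bit table offset, the non-PIC variant an absolute pointer.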
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(f0), L(tab))
+	JMPENT(	L(f1), L(tab))
+	JMPENT(	L(f2), L(tab))
+	JMPENT(	L(f3), L(tab))
+	JMPENT(	L(f4), L(tab))
+	JMPENT(	L(f5), L(tab))
+	JMPENT(	L(f6), L(tab))
+	JMPENT(	L(f7), L(tab))
+	TEXT
+
+L(f0):	mulx(	(up), %r10, %r8)
+	lea	-8(up), up
+	lea	-8(rp), rp
+	lea	-1(n), n
+	jmp	L(b0)
+
+L(f3):	mulx(	(up), %r9, %rax)
+	lea	16(up), up
+	lea	-48(rp), rp
+	jmp	L(b3)
+
+L(f4):	mulx(	(up), %r10, %r8)
+	lea	24(up), up
+	lea	-40(rp), rp
+	jmp	L(b4)
+
+L(f5):	mulx(	(up), %r9, %rax)
+	lea	32(up), up
+	lea	-32(rp), rp
+	jmp	L(b5)
+
+L(f6):	mulx(	(up), %r10, %r8)
+	lea	40(up), up
+	lea	-24(rp), rp
+	jmp	L(b6)
+
+L(f1):	mulx(	(up), %r9, %rax)
+	jrcxz	L(1)
+	jmp	L(b1)
+L(1):	add	(rp), %r9
+	mov	%r9, (rp)
+	adc	%rcx, %rax		C relies on rcx = 0
+	FUNC_EXIT()
+	ret
+
+L(end):	adox(	(rp), %r9)
+	mov	%r9, (rp)
+	adox(	%rcx, %rax)		C relies on rcx = 0
+	adc	%rcx, %rax		C relies on rcx = 0
+	FUNC_EXIT()
+	ret
+
+ifdef(`PIC',
+`	nop;nop;nop;nop',
+`	nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop')
+
+L(f2):	mulx(	(up), %r10, %r8)
+	lea	8(up), up
+	lea	8(rp), rp
+	mulx(	(up), %r9, %rax)
+
+	ALIGN(32)
+L(top):	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)
+	jrcxz	L(end)
+L(b1):	mulx(	8,(up), %r10, %r8)
+	adox(	(rp), %r9)
+	lea	-1(n), n
+	mov	%r9, (rp)
+	adcx(	%rax, %r10)
+L(b0):	mulx(	16,(up), %r9, %rax)
+	adcx(	%r8, %r9)
+	adox(	8,(rp), %r10)
+	mov	%r10, 8(rp)
+L(b7):	mulx(	24,(up), %r10, %r8)
+	lea	64(up), up
+	adcx(	%rax, %r10)
+	adox(	16,(rp), %r9)
+	mov	%r9, 16(rp)
+L(b6):	mulx(	-32,(up), %r9, %rax)
+	adox(	24,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, 24(rp)
+L(b5):	mulx(	-24,(up), %r10, %r8)
+	adcx(	%rax, %r10)
+	adox(	32,(rp), %r9)
+	mov	%r9, 32(rp)
+L(b4):	mulx(	-16,(up), %r9, %rax)
+	adox(	40,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, 40(rp)
+L(b3):	adox(	48,(rp), %r9)
+	mulx(	-8,(up), %r10, %r8)
+	mov	%r9, 48(rp)
+	lea	64(rp), rp
+	adcx(	%rax, %r10)
+	mulx(	(up), %r9, %rax)
+	jmp	L(top)
+
+L(f7):	mulx(	(up), %r9, %rax)
+	lea	-16(up), up
+	lea	-16(rp), rp
+	jmp	L(b7)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/coreibwl/gmp-mparam.h b/third_party/gmp/mpn/x86_64/coreibwl/gmp-mparam.h
new file mode 100644
index 0000000..91c91b5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreibwl/gmp-mparam.h
@@ -0,0 +1,246 @@
+/* Broadwell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3400-3800 MHz Intel Xeon E3-1285Lv4 Broadwell */
+/* FFT tuning limit = 467,964,472 */
+/* Generated by tuneup.c, 2019-10-17, gcc 8.3 */
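+
+/* An illustrative sketch (an assumption about usage, not part of the
+   generated tuning data): GMP's multiply entry points pick the cheapest
+   algorithm whose threshold the operand size has reached, roughly
+
+	if (n < MUL_TOOM22_THRESHOLD)
+	  mpn_mul_basecase (...);	(schoolbook)
+	else if (n < MUL_TOOM33_THRESHOLD)
+	  mpn_toom22_mul (...);		(Karatsuba)
+	else
+	  ...				(Toom-3 and up, then FFT)
+*/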
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              24
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define DIV_1_VS_MUL_1_PERCENT             455
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               202
+#define MUL_TOOM6H_THRESHOLD               303
+#define MUL_TOOM8H_THRESHOLD               406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     141
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     152
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     151
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     198
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                426
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             46
+
+#define MULMOD_BNM1_THRESHOLD               16
+#define SQRMOD_BNM1_THRESHOLD               18
+
+#define MUL_FFT_MODF_THRESHOLD             460  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    460, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     25, 7}, {     13, 6}, \
+    {     28, 7}, {     15, 6}, {     31, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     39, 8}, \
+    {     79,10}, {     23, 9}, {     55,11}, {     15,10}, \
+    {     31, 9}, {     71,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     99,10}, {     55,11}, {     31,10}, \
+    {     87,11}, {     47,10}, {    103,12}, {     31,11}, \
+    {     63,10}, {    135,11}, {     79,10}, {    167,11}, \
+    {     95,10}, {    199,11}, {    111,12}, {     63, 8}, \
+    {   1087,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,12}, {     95,11}, {    191,10}, {    383,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,11}, {    367,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,14}, {    127,13}, \
+    {    255,12}, {    607,13}, {    319,12}, {    735,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1023,13}, {    575,12}, \
+    {   1151,13}, {    639,12}, {   1279,13}, {    703,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    959,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1151,14}, \
+    {    639,13}, {   1279,12}, {   2559,13}, {   1343,12}, \
+    {   2687,13}, {   1407,14}, {    767,13}, {   1535,12}, \
+    {   3071,13}, {   1599,12}, {   3199,13}, {   1663,14}, \
+    {    895,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,15}, {    767,14}, {   1535,13}, \
+    {   3199,14}, {   1663,13}, {   3455,12}, {   6911,16}, \
+    {    511,15}, {   1023,14}, {   2175,13}, {   4479,14}, \
+    {   2303,13}, {   4607,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2815,13}, {   5631,14}, {   2943,13}, \
+    {   5887,15}, {   1535,14}, {   3455,13}, {   6911,15}, \
+    {   1791,14}, {   3839,13}, {   7679,16}, {   1023,15}, \
+    {   2047,14}, {   4479,15}, {   2303,14}, {   4863,15}, \
+    {   2559,14}, {   5247,15}, {   2815,14}, {   5887,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,17}, {   1023,16}, {   2047,15}, {   4351,14}, \
+    {   8703,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,14}, {  15359,17}, {   2047,16}, {   4095,15}, \
+    {   8703,16}, {   4607,15}, {   9983,14}, {  19967,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             400  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    400, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     28, 7}, {     15, 6}, {     31, 7}, {     25, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    127,11}, {     79,10}, \
+    {    159,11}, {     95,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,11}, {    143,10}, {    287, 9}, \
+    {    575,10}, {    303,11}, {    159,10}, {    319,12}, \
+    {     95, 8}, {   1599, 9}, {    831,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,11}, {    367,10}, {    735,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    607,12}, \
+    {    319,11}, {    671,12}, {    351,11}, {    735,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    799,13}, \
+    {    447,12}, {    959,13}, {    511,12}, {   1023,13}, \
+    {    575,12}, {   1151,13}, {    639,12}, {   1279,13}, \
+    {    703,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1151,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,12}, {   2815,13}, \
+    {   1471,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1663,14}, {    895,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2047,13}, {   4095,14}, {   2175,13}, \
+    {   4351,14}, {   2303,13}, {   4607,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4351,15}, {   2303,14}, {   4863,15}, {   2559,14}, \
+    {   5247,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,15}, {   7935,17}, \
+    {   2047,16}, {   4095,15}, {   8447,16}, {   4607,15}, \
+    {   9471,14}, {  18943,15}, {   9983,14}, {  19967,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 215
+#define SQR_FFT_THRESHOLD                 3712
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  80
+#define MULLO_MUL_N_THRESHOLD            11025
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                 109
+#define SQRLO_SQR_THRESHOLD               7293
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             183
+#define DC_BDIV_QR_THRESHOLD                86
+#define DC_BDIV_Q_THRESHOLD                160
+
+#define INV_MULMOD_BNM1_THRESHOLD           58
+#define INV_NEWTON_THRESHOLD               171
+#define INV_APPR_THRESHOLD                 171
+
+#define BINV_NEWTON_THRESHOLD              292
+#define REDC_1_TO_REDC_2_THRESHOLD          33
+#define REDC_2_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD               67
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1866
+
+#define POWM_SEC_TABLE  2,10,191,494,712,1378
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               644
+#define SET_STR_PRECOMPUTE_THRESHOLD      1658
+
+#define FAC_DSC_THRESHOLD                  562
+#define FAC_ODD_THRESHOLD                   48
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    5  /* 0.38% faster than 3 */
+#define HGCD_THRESHOLD                      73
+#define HGCD_APPR_THRESHOLD                 67
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   630
+#define GCDEXT_DC_THRESHOLD                365
+#define JACOBI_BASE_METHOD                   1  /* 29.65% faster than 4 */
+
+/* Tuneup completed successfully, took 239050 seconds */
diff --git a/third_party/gmp/mpn/x86_64/coreibwl/mul_1.asm b/third_party/gmp/mpn/x86_64/coreibwl/mul_1.asm
new file mode 100644
index 0000000..b7fae2f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreibwl/mul_1.asm
@@ -0,0 +1,195 @@
+dnl  AMD64 mpn_mul_1 optimised for Intel Broadwell.
+
+dnl  Copyright 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      -
+C AMD K10        -
+C AMD bull       -
+C AMD pile       -
+C AMD steam      -
+C AMD excavator  -
+C AMD bobcat     -
+C AMD jaguar     -
+C Intel P4       -
+C Intel core2    -
+C Intel NHM      -
+C Intel SBR      -
+C Intel IBR      -
+C Intel HWL      1.70
+C Intel BWL      1.51
+C Intel SKL      1.52
+C Intel atom     -
+C Intel SLM      -
+C VIA nano       -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Put an initial mulx before switching, targeting some free registers.
+C  * Tune feed-in code.
+C  * Trim nop execution after L(f2).
+C  * Port to DOS64, not forgetting nop execution.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0_param',`%rcx')   C r9
+
+define(`n',       `%rcx')
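+
+C A hedged C-level sketch of the operation, {rp,n} = {up,n} * v0 with the
+C high carry limb returned.  This is an orientation aid assuming 64-bit
+C limbs, not upstream reference code; ref_mul_1 is a hypothetical name:
+C
+C	mp_limb_t
+C	ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+C	           mp_limb_t v0)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0 + cy;
+C	      rp[i] = (mp_limb_t) t;		/* low limb */
+C	      cy = (mp_limb_t) (t >> 64);	/* high limb becomes carry */
+C	    }
+C	  return cy;
+C	}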
+
+dnl ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl IFDOS(`	define(`up', ``%rsi'')	') dnl
+dnl IFDOS(`	define(`rp', ``%rcx'')	') dnl
+dnl IFDOS(`	define(`vl', ``%r9'')	') dnl
+dnl IFDOS(`	define(`r9', ``rdi'')	') dnl
+dnl IFDOS(`	define(`n',  ``%r8'')	') dnl
+dnl IFDOS(`	define(`r8', ``r11'')	') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_1)
+
+	mov	v0_param, %r10
+	mov	n_param, n
+	mov	R32(n_param), R32(%r8)
+	shr	$3, n
+	and	$7, R32(%r8)		C clear OF, CF as side-effect
+	mov	%r10, %rdx
+	lea	L(tab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%r8,4), %r8
+	lea	(%r8, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%r8,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(f0), L(tab))
+	JMPENT(	L(f1), L(tab))
+	JMPENT(	L(f2), L(tab))
+	JMPENT(	L(f3), L(tab))
+	JMPENT(	L(f4), L(tab))
+	JMPENT(	L(f5), L(tab))
+	JMPENT(	L(f6), L(tab))
+	JMPENT(	L(f7), L(tab))
+	TEXT
+
+L(f0):	mulx(	(up), %r10, %r8)
+	lea	56(up), up
+	lea	-8(rp), rp
+	jmp	L(b0)
+
+L(f3):	mulx(	(up), %r9, %rax)
+	lea	16(up), up
+	lea	16(rp), rp
+	inc	n
+	jmp	L(b3)
+
+L(f4):	mulx(	(up), %r10, %r8)
+	lea	24(up), up
+	lea	24(rp), rp
+	inc	n
+	jmp	L(b4)
+
+L(f5):	mulx(	(up), %r9, %rax)
+	lea	32(up), up
+	lea	32(rp), rp
+	inc	n
+	jmp	L(b5)
+
+L(f6):	mulx(	(up), %r10, %r8)
+	lea	40(up), up
+	lea	40(rp), rp
+	inc	n
+	jmp	L(b6)
+
+L(f7):	mulx(	(up), %r9, %rax)
+	lea	48(up), up
+	lea	48(rp), rp
+	inc	n
+	jmp	L(b7)
+
+L(f1):	mulx(	(up), %r9, %rax)
+	test	n, n
+	jnz	L(b1)
+L(1):	mov	%r9, (rp)
+	ret
+
+L(f2):	mulx(	(up), %r10, %r8)
+	lea	8(up), up
+	lea	8(rp), rp
+	mulx(	(up), %r9, %rax)
+	test	n, n
+	jz	L(end)
+
+	ALIGN(32)
+L(top):	mov	%r10, -8(rp)
+	adc	%r8, %r9
+L(b1):	mulx(	8,(up), %r10, %r8)
+	adc	%rax, %r10
+	lea	64(up), up
+	mov	%r9, (rp)
+L(b0):	mov	%r10, 8(rp)
+	mulx(	-48,(up), %r9, %rax)
+	lea	64(rp), rp
+	adc	%r8, %r9
+L(b7):	mulx(	-40,(up), %r10, %r8)
+	mov	%r9, -48(rp)
+	adc	%rax, %r10
+L(b6):	mov	%r10, -40(rp)
+	mulx(	-32,(up), %r9, %rax)
+	adc	%r8, %r9
+L(b5):	mulx(	-24,(up), %r10, %r8)
+	mov	%r9, -32(rp)
+	adc	%rax, %r10
+L(b4):	mulx(	-16,(up), %r9, %rax)
+	mov	%r10, -24(rp)
+	adc	%r8, %r9
+L(b3):	mulx(	-8,(up), %r10, %r8)
+	adc	%rax, %r10
+	mov	%r9, -16(rp)
+	dec	n
+	mulx(	(up), %r9, %rax)
+	jnz	L(top)
+
+L(end):	mov	%r10, -8(rp)
+	adc	%r8, %r9
+	mov	%r9, (rp)
+	adc	%rcx, %rax
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/coreibwl/mul_basecase.asm b/third_party/gmp/mpn/x86_64/coreibwl/mul_basecase.asm
new file mode 100644
index 0000000..42ca976
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreibwl/mul_basecase.asm
@@ -0,0 +1,369 @@
+dnl  AMD64 mpn_mul_basecase optimised for Intel Broadwell.
+
+dnl  Copyright 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bd1	n/a		n/a
+C AMD bd2	n/a		n/a
+C AMD bd3	n/a		n/a
+C AMD bd4	 ?		 ?
+C AMD zen	 ?		 ?
+C AMD bt1	n/a		n/a
+C AMD bt2	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel PNR	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	 1.68		n/a
+C Intel BWL	 1.51	      1.67-1.74
+C Intel SKL	 1.52	      1.63-1.71
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Do overlapped software pipelining.
+C  * When changing this, make sure the code which falls into the inner loops
+C    does not execute too many no-ops (for both PIC and non-PIC).
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn',      `%r8')
+
+define(`n',       `%rcx')
+define(`n_save',  `%rbp')
+define(`vp',      `%r14')
+define(`unneg',   `%rbx')
+define(`v0',      `%rdx')
+define(`jaddr',   `%rax')
+
+define(`w0',	`%r12')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
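+
+C A hedged structural sketch of schoolbook multiplication as organised
+C below: one mul_1 pass for vp[0], then vn-1 addmul_1 passes.  This is an
+C assumption for orientation, not upstream reference code; ref_mul_1,
+C ref_addmul_1 and ref_mul_basecase are hypothetical names:
+C
+C	void
+C	ref_mul_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t un,
+C	                  const mp_limb_t *vp, mp_size_t vn)
+C	{
+C	  rp[un] = ref_mul_1 (rp, up, un, vp[0]);
+C	  for (mp_size_t j = 1; j < vn; j++)
+C	    rp[un + j] = ref_addmul_1 (rp + j, up, un, vp[j]);
+C	}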
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	cmp	$2, un_param
+	ja	L(gen)
+	mov	(vp_param), %rdx
+	mulx(	(up), %rax, %r9)	C 0 1
+	je	L(s2x)
+
+L(s11):	mov	%rax, (rp)
+	mov	%r9, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(s2x):	cmp	$2, vn
+	mulx(	8,(up), %r8, %r10)	C 1 2
+	je	L(s22)
+
+L(s21):	add	%r8, %r9
+	adc	$0, %r10
+	mov	%rax, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	FUNC_EXIT()
+	ret
+
+L(s22):	add	%r8, %r9		C 1
+	adc	$0, %r10		C 2
+	mov	8(vp_param), %rdx
+	mov	%rax, (rp)
+	mulx(	(up), %r8, %r11)	C 1 2
+	mulx(	8,(up), %rax, %rdx)	C 2 3
+	add	%r11, %rax		C 2
+	adc	$0, %rdx		C 3
+	add	%r8, %r9		C 1
+	adc	%rax, %r10		C 2
+	adc	$0, %rdx		C 3
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(gen):
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r14
+
+	mov	vp_param, vp
+	lea	1(un_param), unneg
+	mov	un_param, n_save
+	mov	R32(un_param), R32(%rax)
+	and	$-8, unneg
+	shr	$3, n_save		C loop count
+	neg	unneg
+	and	$7, R32(%rax)		C clear CF for adc as side-effect
+					C note that rax lives very long
+	mov	n_save, n
+	mov	(vp), v0
+	lea	8(vp), vp
+
+	lea	L(mtab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %r11
+	lea	(%r11, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%rax,8)
+')
+
+L(mf0):	mulx(	(up), w2, w3)
+	lea	56(up), up
+	lea	-8(rp), rp
+	jmp	L(mb0)
+
+L(mf3):	mulx(	(up), w0, w1)
+	lea	16(up), up
+	lea	16(rp), rp
+	inc	n
+	jmp	L(mb3)
+
+L(mf4):	mulx(	(up), w2, w3)
+	lea	24(up), up
+	lea	24(rp), rp
+	inc	n
+	jmp	L(mb4)
+
+L(mf5):	mulx(	(up), w0, w1)
+	lea	32(up), up
+	lea	32(rp), rp
+	inc	n
+	jmp	L(mb5)
+
+L(mf6):	mulx(	(up), w2, w3)
+	lea	40(up), up
+	lea	40(rp), rp
+	inc	n
+	jmp	L(mb6)
+
+L(mf7):	mulx(	(up), w0, w1)
+	lea	48(up), up
+	lea	48(rp), rp
+	inc	n
+	jmp	L(mb7)
+
+L(mf1):	mulx(	(up), w0, w1)
+	jmp	L(mb1)
+
+L(mf2):	mulx(	(up), w2, w3)
+	lea	8(up), up
+	lea	8(rp), rp
+	mulx(	(up), w0, w1)
+
+	ALIGN(16)
+L(m1top):
+	mov	w2, -8(rp)
+	adc	w3, w0
+L(mb1):	mulx(	8,(up), w2, w3)
+	adc	w1, w2
+	lea	64(up), up
+	mov	w0, (rp)
+L(mb0):	mov	w2, 8(rp)
+	mulx(	-48,(up), w0, w1)
+	lea	64(rp), rp
+	adc	w3, w0
+L(mb7):	mulx(	-40,(up), w2, w3)
+	mov	w0, -48(rp)
+	adc	w1, w2
+L(mb6):	mov	w2, -40(rp)
+	mulx(	-32,(up), w0, w1)
+	adc	w3, w0
+L(mb5):	mulx(	-24,(up), w2, w3)
+	mov	w0, -32(rp)
+	adc	w1, w2
+L(mb4):	mulx(	-16,(up), w0, w1)
+	mov	w2, -24(rp)
+	adc	w3, w0
+L(mb3):	mulx(	-8,(up), w2, w3)
+	adc	w1, w2
+	mov	w0, -16(rp)
+	dec	n
+	mulx(	(up), w0, w1)
+	jnz	L(m1top)
+
+L(m1end):
+	mov	w2, -8(rp)
+	adc	w3, w0
+	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+
+	dec	vn
+	jz	L(done)
+
+	lea	L(atab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax
+	lea	(%rax, %r10), jaddr
+',`
+	mov	(%r10,%rax,8), jaddr
+')
+
+L(outer):
+	lea	(up,unneg,8), up
+	mov	n_save, n
+	mov	(vp), v0
+	lea	8(vp), vp
+	jmp	*jaddr
+
+L(f0):	mulx(	8,(up), w2, w3)
+	lea	8(rp,unneg,8), rp
+	lea	-1(n), n
+	jmp	L(b0)
+
+L(f3):	mulx(	-16,(up), w0, w1)
+	lea	-56(rp,unneg,8), rp
+	jmp	L(b3)
+
+L(f4):	mulx(	-24,(up), w2, w3)
+	lea	-56(rp,unneg,8), rp
+	jmp	L(b4)
+
+L(f5):	mulx(	-32,(up), w0, w1)
+	lea	-56(rp,unneg,8), rp
+	jmp	L(b5)
+
+L(f6):	mulx(	-40,(up), w2, w3)
+	lea	-56(rp,unneg,8), rp
+	jmp	L(b6)
+
+L(f7):	mulx(	16,(up), w0, w1)
+	lea	8(rp,unneg,8), rp
+	jmp	L(b7)
+
+L(f1):	mulx(	(up), w0, w1)
+	lea	8(rp,unneg,8), rp
+	jmp	L(b1)
+
+L(am1end):
+	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+
+	dec	vn			C clear CF and OF as side-effect
+	jnz	L(outer)
+L(done):
+	pop	%r14
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(f2):
+	mulx(	-8,(up), w2, w3)
+	lea	8(rp,unneg,8), rp
+	mulx(	(up), w0, w1)
+
+	ALIGN(16)
+L(am1top):
+	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(am1end)
+L(b1):	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	-1(n), n
+	mov	w0, (rp)
+	adcx(	w1, w2)
+L(b0):	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+L(b7):	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+L(b6):	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+L(b5):	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+L(b4):	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+L(b3):	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(am1top)
+
+	JUMPTABSECT
+	ALIGN(8)
+L(mtab):JMPENT(	L(mf0), L(mtab))
+	JMPENT(	L(mf1), L(mtab))
+	JMPENT(	L(mf2), L(mtab))
+	JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
+	JMPENT(	L(mf7), L(mtab))
+L(atab):JMPENT(	L(f0), L(atab))
+	JMPENT(	L(f1), L(atab))
+	JMPENT(	L(f2), L(atab))
+	JMPENT(	L(f3), L(atab))
+	JMPENT(	L(f4), L(atab))
+	JMPENT(	L(f5), L(atab))
+	JMPENT(	L(f6), L(atab))
+	JMPENT(	L(f7), L(atab))
+	TEXT
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm
new file mode 100644
index 0000000..5cdb209
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -0,0 +1,395 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Intel Broadwell.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',	   `%rdi')
+define(`up',	   `%rsi')
+define(`vp_param', `%rdx')
+define(`n',	   `%rcx')
+
+define(`vp',	`%r11')
+define(`jmpreg',`%rbx')
+define(`nn',    `%rbp')
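+
+C A hedged C-level sketch of the operation: only the low n limbs of the
+C full 2n-limb product are formed.  This is an assumption for orientation,
+C not upstream reference code; ref_mul_1, ref_addmul_1 and
+C ref_mullo_basecase are hypothetical names:
+C
+C	void
+C	ref_mullo_basecase (mp_limb_t *rp, const mp_limb_t *up,
+C	                    const mp_limb_t *vp, mp_size_t n)
+C	{
+C	  ref_mul_1 (rp, up, n, vp[0]);			/* carry out discarded */
+C	  for (mp_size_t j = 1; j < n; j++)
+C	    ref_addmul_1 (rp + j, up, n - j, vp[j]);	/* truncated columns */
+C	}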
+
+C TODO
+C  * Suppress more rp[] rewrites in corner.
+C  * Rearrange feed-in jumps for short branch forms.
+C  * Perhaps roll out the heavy artillery and 8-way unroll the outer loop.
+C    Since the feed-in code implodes, the blow-up will be no more than
+C    perhaps 4x.
+C  * Micro-optimise critical lead-in code block around L(ent).
+C  * Write n < 4 code specifically for Broadwell (current code is for Haswell).
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+	cmp	$4, R32(n)
+	jae	L(big)
+
+	mov	vp_param, vp
+	mov	(up), %rdx
+
+	cmp	$2, R32(n)
+	jae	L(gt1)
+L(n1):	imul	(vp), %rdx
+	mov	%rdx, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
+L(n2):	mov	(vp), %r9
+	mulx(	%r9, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp), %r9
+	mulx(	%r9, %rax, %r10)	C u0 x v0
+	mov	%rax, (rp)
+	mov	8(up), %rdx
+	mulx(	%r9, %rax, %rdx)	C u1 x v0
+	imul	16(up), %r9		C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r8
+	mov	(up), %rdx
+	mulx(	%r8, %rax, %rdx)	C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r8		C u1 x v1
+	add	%r8, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10		C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(big):	push	%r14
+	push	%r12
+	push	%rbx
+	push	%rbp
+	mov	-8(vp_param,n,8), %r14	C FIXME Put at absolute end
+	imul	(up), %r14		C FIXME Put at absolute end
+	lea	-3(n), R32(nn)
+	lea	8(vp_param), vp
+	mov	(vp_param), %rdx
+
+	mov	R32(n), R32(%rax)
+	shr	$3, R32(n)
+	and	$7, R32(%rax)		C clear OF, CF as side-effect
+	lea	L(mtab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax
+	lea	(%rax, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%rax,8)
+')
+
+L(mf0):	mulx(	(up), %r10, %r8)
+	lea	56(up), up
+	lea	-8(rp), rp
+	lea	L(f7)(%rip), jmpreg
+	jmp	L(mb0)
+
+L(mf3):	mulx(	(up), %r9, %rax)
+	lea	16(up), up
+	lea	16(rp), rp
+	jrcxz	L(mc)
+	inc	R32(n)
+	lea	L(f2)(%rip), jmpreg
+	jmp	L(mb3)
+
+L(mc):	mulx(	-8,(up), %r10, %r8)
+	add	%rax, %r10
+	mov	%r9, -16(rp)
+	mulx(	(up), %r9, %rax)
+	mov	%r10, -8(rp)
+	adc	%r8, %r9
+	mov	%r9, (rp)
+	jmp	L(c2)
+
+L(mf4):	mulx(	(up), %r10, %r8)
+	lea	24(up), up
+	lea	24(rp), rp
+	inc	R32(n)
+	lea	L(f3)(%rip), jmpreg
+	jmp	L(mb4)
+
+L(mf5):	mulx(	(up), %r9, %rax)
+	lea	32(up), up
+	lea	32(rp), rp
+	inc	R32(n)
+	lea	L(f4)(%rip), jmpreg
+	jmp	L(mb5)
+
+L(mf6):	mulx(	(up), %r10, %r8)
+	lea	40(up), up
+	lea	40(rp), rp
+	inc	R32(n)
+	lea	L(f5)(%rip), jmpreg
+	jmp	L(mb6)
+
+L(mf7):	mulx(	(up), %r9, %rax)
+	lea	48(up), up
+	lea	48(rp), rp
+	lea	L(f6)(%rip), jmpreg
+	jmp	L(mb7)
+
+L(mf1):	mulx(	(up), %r9, %rax)
+	lea	L(f0)(%rip), jmpreg
+	jmp	L(mb1)
+
+L(mf2):	mulx(	(up), %r10, %r8)
+	lea	8(up), up
+	lea	8(rp), rp
+	lea	L(f1)(%rip), jmpreg
+	mulx(	(up), %r9, %rax)
+
+C FIXME ugly fallthrough FIXME
+	ALIGN(32)
+L(mtop):mov	%r10, -8(rp)
+	adc	%r8, %r9
+L(mb1):	mulx(	8,(up), %r10, %r8)
+	adc	%rax, %r10
+	lea	64(up), up
+	mov	%r9, (rp)
+L(mb0):	mov	%r10, 8(rp)
+	mulx(	-48,(up), %r9, %rax)
+	lea	64(rp), rp
+	adc	%r8, %r9
+L(mb7):	mulx(	-40,(up), %r10, %r8)
+	mov	%r9, -48(rp)
+	adc	%rax, %r10
+L(mb6):	mov	%r10, -40(rp)
+	mulx(	-32,(up), %r9, %rax)
+	adc	%r8, %r9
+L(mb5):	mulx(	-24,(up), %r10, %r8)
+	mov	%r9, -32(rp)
+	adc	%rax, %r10
+L(mb4):	mulx(	-16,(up), %r9, %rax)
+	mov	%r10, -24(rp)
+	adc	%r8, %r9
+L(mb3):	mulx(	-8,(up), %r10, %r8)
+	adc	%rax, %r10
+	mov	%r9, -16(rp)
+	dec	R32(n)
+	mulx(	(up), %r9, %rax)
+	jnz	L(mtop)
+
+L(mend):mov	%r10, -8(rp)
+	adc	%r8, %r9
+	mov	%r9, (rp)
+	adc	%rcx, %rax
+
+	lea	8(,nn,8), %r12
+	neg	%r12
+	shr	$3, R32(nn)
+	jmp	L(ent)
+
+L(f0):	mulx(	(up), %r10, %r8)
+	lea	-8(up), up
+	lea	-8(rp), rp
+	lea	L(f7)(%rip), jmpreg
+	jmp	L(b0)
+
+L(f1):	mulx(	(up), %r9, %rax)
+	lea	-1(nn), R32(nn)
+	lea	L(f0)(%rip), jmpreg
+	jmp	L(b1)
+
+L(end):	adox(	(rp), %r9)
+	mov	%r9, (rp)
+	adox(	%rcx, %rax)		C relies on rcx = 0
+	adc	%rcx, %rax		C FIXME suppress, use adc below; reqs ent path edits
+	lea	8(%r12), %r12
+L(ent):	mulx(	8,(up), %r10, %r8)	C r8 unused (use imul?)
+	add	%rax, %r14
+	add	%r10, %r14		C h
+	lea	(up,%r12), up		C reset up
+	lea	8(rp,%r12), rp		C reset rp
+	mov	(vp), %rdx
+	lea	8(vp), vp
+	or	R32(nn), R32(n)		C copy count, clear CF,OF (n = 0 prior)
+	jmp	*jmpreg
+
+L(f7):	mulx(	(up), %r9, %rax)
+	lea	-16(up), up
+	lea	-16(rp), rp
+	lea	L(f6)(%rip), jmpreg
+	jmp	L(b7)
+
+L(f2):	mulx(	(up), %r10, %r8)
+	lea	8(up), up
+	lea	8(rp), rp
+	mulx(	(up), %r9, %rax)
+	lea	L(f1)(%rip), jmpreg
+
+C FIXME ugly fallthrough FIXME
+	ALIGN(32)
+L(top):	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)
+	jrcxz	L(end)
+L(b1):	mulx(	8,(up), %r10, %r8)
+	adox(	(rp), %r9)
+	lea	-1(n), R32(n)
+	mov	%r9, (rp)
+	adcx(	%rax, %r10)
+L(b0):	mulx(	16,(up), %r9, %rax)
+	adcx(	%r8, %r9)
+	adox(	8,(rp), %r10)
+	mov	%r10, 8(rp)
+L(b7):	mulx(	24,(up), %r10, %r8)
+	lea	64(up), up
+	adcx(	%rax, %r10)
+	adox(	16,(rp), %r9)
+	mov	%r9, 16(rp)
+L(b6):	mulx(	-32,(up), %r9, %rax)
+	adox(	24,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, 24(rp)
+L(b5):	mulx(	-24,(up), %r10, %r8)
+	adcx(	%rax, %r10)
+	adox(	32,(rp), %r9)
+	mov	%r9, 32(rp)
+L(b4):	mulx(	-16,(up), %r9, %rax)
+	adox(	40,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, 40(rp)
+L(b3):	adox(	48,(rp), %r9)
+	mulx(	-8,(up), %r10, %r8)
+	mov	%r9, 48(rp)
+	lea	64(rp), rp
+	adcx(	%rax, %r10)
+	mulx(	(up), %r9, %rax)
+	jmp	L(top)
+
+L(f6):	mulx(	(up), %r10, %r8)
+	lea	40(up), up
+	lea	-24(rp), rp
+	lea	L(f5)(%rip), jmpreg
+	jmp	L(b6)
+
+L(f5):	mulx(	(up), %r9, %rax)
+	lea	32(up), up
+	lea	-32(rp), rp
+	lea	L(f4)(%rip), jmpreg
+	jmp	L(b5)
+
+L(f4):	mulx(	(up), %r10, %r8)
+	lea	24(up), up
+	lea	-40(rp), rp
+	lea	L(f3)(%rip), jmpreg
+	jmp	L(b4)
+
+L(f3):	mulx(	(up), %r9, %rax)
+	lea	16(up), up
+	lea	-48(rp), rp
+	jrcxz	L(cor)
+	lea	L(f2)(%rip), jmpreg
+	jmp	L(b3)
+
+L(cor):	adox(	48,(rp), %r9)
+	mulx(	-8,(up), %r10, %r8)
+	mov	%r9, 48(rp)
+	lea	64(rp), rp
+	adcx(	%rax, %r10)
+	mulx(	(up), %r9, %rax)
+	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)		C FIXME suppress
+	adox(	(rp), %r9)
+	mov	%r9, (rp)		C FIXME suppress
+	adox(	%rcx, %rax)
+L(c2):
+	mulx(	8,(up), %r10, %r8)
+	adc	%rax, %r14
+	add	%r10, %r14
+	mov	(vp), %rdx
+	test	R32(%rcx), R32(%rcx)
+	mulx(	-16,(up), %r10, %r8)
+	mulx(	-8,(up), %r9, %rax)
+	adox(	-8,(rp), %r10)
+	adcx(	%r8, %r9)
+	mov	%r10, -8(rp)
+	adox(	(rp), %r9)
+	adox(	%rcx, %rax)
+	adc	%rcx, %rax
+	mulx(	(up), %r10, %r8)
+	add	%rax, %r14
+	add	%r10, %r14
+	mov	8(vp), %rdx
+	mulx(	-16,(up), %rcx, %rax)
+	add	%r9, %rcx
+	mov	%rcx, (rp)
+	adc	$0, %rax
+	mulx(	-8,(up), %r10, %r8)
+	add	%rax, %r14
+	add	%r10, %r14
+	mov	%r14, 8(rp)
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r14
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+	JUMPTABSECT
+	ALIGN(8)
+L(mtab):JMPENT(	L(mf7), L(mtab))
+	JMPENT(	L(mf0), L(mtab))
+	JMPENT(	L(mf1), L(mtab))
+	JMPENT(	L(mf2), L(mtab))
+	JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
diff --git a/third_party/gmp/mpn/x86_64/coreibwl/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/coreibwl/sqr_basecase.asm
new file mode 100644
index 0000000..e81b01b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreibwl/sqr_basecase.asm
@@ -0,0 +1,839 @@
+dnl  AMD64 mpn_sqr_basecase optimised for Intel Broadwell.
+
+dnl  Copyright 2015, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bd1	n/a		n/a
+C AMD bd2	n/a		n/a
+C AMD bd3	n/a		n/a
+C AMD bd4	 ?		 ?
+C AMD zen	 ?		 ?
+C AMD bt1	n/a		n/a
+C AMD bt2	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel PNR	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	 1.68		n/a
+C Intel BWL	 1.51	      1.67-1.74
+C Intel SKL	 1.52	      1.63-1.71
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * We have 8 addmul_1 loops which fall into each other.  The idea is to save
+C    on switching code, since a circularly updated computed goto target will
+C    hardly allow correct branch prediction.  On second thought, this might
+C    make each of the 8 loop branches poorly predicted, since each executes
+C    fewer times in a row.  With just one addmul_1 loop, the loop count would
+C    change only once every 8th time.
+C  * Do overlapped software pipelining.
+C  * Perhaps load in shrx/sarx, eliminating separate load insn.
+C  * Schedule add+store in small n code.
+C  * Try swapping adox and adcx insn, making mulx have more time to run.
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+
+define(`n',       `%rcx')
+define(`un_save', `%rbx')
+define(`u0',      `%rdx')
+
+define(`w0',	`%r8')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
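+
+C A hedged sketch of the classical decomposition this file follows (an
+C assumption for orientation, not upstream reference code): square as the
+C doubled sum of off-diagonal products plus the diagonal squares,
+C
+C	u^2 = 2 * sum_{i<j} u[i]*u[j]*B^(i+j)  +  sum_i u[i]^2 * B^(2i)
+C
+C The mul_1/addmul_1 style loops below accumulate the off-diagonal part;
+C the shrx/sarx feed-in folds the doubling into the multiplier.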
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$2, un_param
+	jae	L(gt1)
+
+	mov	(up), %rdx
+	mulx(	%rdx, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	jne	L(gt2)
+
+	mov	(up), %rdx
+	mov	8(up), %rcx
+	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
+	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
+	mov	%rcx, %rdx
+	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
+	add	%r9, %r9		C		W 1
+	adc	%r10, %r10		C		W 2
+	adc	$0, %rdx		C		W 3
+	add	%r9, %r8		C W 1
+	adc	%r11, %r10		C W 2
+	adc	$0, %rdx		C W 3
+	mov	%rax, (rp)
+	mov	%r8, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt2):	cmp	$4, un_param
+	jae	L(gt3)
+
+	push	%rbx
+	mov	(up), %rdx
+	mulx(	8,(up), w2, w3)
+	mulx(	16,(up), w0, w1)
+	add	w3, w0
+	mov	8(up), %rdx
+	mulx(	16,(up), %rax, w3)
+	adc	%rax, w1
+	adc	$0, w3
+	test	R32(%rbx), R32(%rbx)
+	mov	(up), %rdx
+	mulx(	%rdx, %rbx, %rcx)
+	mov	%rbx, (rp)
+	mov	8(up), %rdx
+	mulx(	%rdx, %rax, %rbx)
+	mov	16(up), %rdx
+	mulx(	%rdx, %rsi, %rdx)
+	adcx(	w2, w2)
+	adcx(	w0, w0)
+	adcx(	w1, w1)
+	adcx(	w3, w3)
+	adox(	w2, %rcx)
+	adox(	w0, %rax)
+	adox(	w1, %rbx)
+	adox(	w3, %rsi)
+	mov	$0, R32(%r8)
+	adox(	%r8, %rdx)
+	adcx(	%r8, %rdx)
+	mov	%rcx, 8(rp)
+	mov	%rax, 16(rp)
+	mov	%rbx, 24(rp)
+	mov	%rsi, 32(rp)
+	mov	%rdx, 40(rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(gt3):	push	%rbx
+
+	lea	-3(un_param), R32(un_save)
+	lea	5(un_param), R32(n)
+	mov	R32(un_param), R32(%rax)
+	and	$-8, R32(un_save)
+	shr	$3, R32(n)		C count for mul_1 loop
+	neg	un_save			C 8*count and offset for addmul_1 loops
+	and	$7, R32(%rax)		C clear CF for adc as side-effect
+
+	mov	(up), u0
+
+	lea	L(mtab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %r8
+	lea	(%r8, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%rax,8)
+')
+
+L(mf0):	mulx(	u0, w0, w1)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w2, w3)
+	lea	64(up), up
+	add	w1, w2
+	jmp	L(mb0)
+
+L(mf3):	mulx(	u0, w2, w3)		C up[0]^2
+	add	u0, u0
+	mov	w2, (rp)
+	mulx(	8,(up), w0, w1)
+	lea	24(up), up
+	lea	24(rp), rp
+	add	w3, w0
+	jmp	L(mb3)
+
+L(mf4):	mulx(	u0, w0, w1)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w2, w3)
+	mov	w0, (rp)
+	lea	32(up), up
+	lea	32(rp), rp
+	add	w1, w2
+	jmp	L(mb4)
+
+L(mf5):	mulx(	u0, w2, w3)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w0, w1)
+	mov	w2, (rp)
+	lea	40(up), up
+	lea	40(rp), rp
+	add	w3, w0
+	jmp	L(mb5)
+
+L(mf6):	mulx(	u0, w0, w1)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w2, w3)
+	mov	w0, (rp)
+	lea	48(up), up
+	lea	48(rp), rp
+	add	w1, w2
+	jmp	L(mb6)
+
+L(mf7):	mulx(	u0, w2, w3)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w0, w1)
+	mov	w2, (rp)
+	lea	56(up), up
+	lea	56(rp), rp
+	add	w3, w0
+	jmp	L(mb7)
+
+L(mf1):	mulx(	u0, w2, w3)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w0, w1)
+	mov	w2, (rp)
+	lea	8(up), up
+	lea	8(rp), rp
+	add	w3, w0
+	jmp	L(mb1)
+
+L(mf2):	mulx(	u0, w0, w1)		C up[0]^2
+	add	u0, u0
+	mulx(	8,(up), w2, w3)
+	mov	w0, (rp)
+	lea	16(up), up
+	lea	16(rp), rp
+	dec	R32(n)
+	add	w1, w2
+	mulx(	(up), w0, w1)
+
+	ALIGN(16)
+L(top):	mov	w2, -8(rp)
+	adc	w3, w0
+L(mb1):	mulx(	8,(up), w2, w3)
+	adc	w1, w2
+	lea	64(up), up
+L(mb0):	mov	w0, (rp)
+	mov	w2, 8(rp)
+	mulx(	-48,(up), w0, w1)
+	lea	64(rp), rp
+	adc	w3, w0
+L(mb7):	mulx(	-40,(up), w2, w3)
+	mov	w0, -48(rp)
+	adc	w1, w2
+L(mb6):	mov	w2, -40(rp)
+	mulx(	-32,(up), w0, w1)
+	adc	w3, w0
+L(mb5):	mulx(	-24,(up), w2, w3)
+	mov	w0, -32(rp)
+	adc	w1, w2
+L(mb4):	mulx(	-16,(up), w0, w1)
+	mov	w2, -24(rp)
+	adc	w3, w0
+L(mb3):	mulx(	-8,(up), w2, w3)
+	adc	w1, w2
+	mov	w0, -16(rp)
+	dec	R32(n)
+	mulx(	(up), w0, w1)
+	jnz	L(top)
+
+L(end):	mov	w2, -8(rp)
+	adc	w3, w0
+C	mov	w0, (rp)
+C	adc	%rcx, w1
+C	mov	w1, 8(rp)
+
+	lea	L(atab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %r11
+	lea	(%r11, %r10), %r11
+',`
+	mov	(%r10,%rax,8), %r11
+')
+	mov	$63, R32(%rax)
+	jmp	*%r11
+
+L(ed0):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f7):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	-64(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	-56(rp,un_save,8), rp
+	mov	(up), w1		C up[-1]
+	mov	8(up), u0		C up[0]
+	shrx(	%rax, w1, w0)
+	sarx(	%rax, w1, w1)
+	and	u0, w1			C "ci" in C code
+	mulx(	u0, w2, w3)		C up[0]^2
+	lea	(w0,u0,2), u0		C "u0" arg in C code
+	jmp	L(b7)
+
+	ALIGN(16)
+L(tp0):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed0)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+L(b0):	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp0)
+
+L(ed1):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f0):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	-64(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	-56(rp,un_save,8), rp
+	mov	-8(up), w3		C up[-1]
+	mov	(up), u0		C up[0]
+	shrx(	%rax, w3, w2)
+	sarx(	%rax, w3, w3)
+	and	u0, w3			C "ci" in C code
+	mulx(	u0, w0, w1)		C up[0]^2
+	lea	(w2,u0,2), u0		C "u0" arg in C code
+	adcx(	w3, w0)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	jmp	L(b0)
+
+	ALIGN(16)
+L(tp1):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed1)
+L(b1):	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp1)
+
+L(ed2):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f1):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	8(un_save), un_save
+	lea	-56(rp,un_save,8), rp
+	mov	-16(up), w1		C up[-1]
+	mov	-8(up), u0		C up[0]
+	shrx(	%rax, w1, w0)
+	sarx(	%rax, w1, w1)
+	and	u0, w1			C "ci" in C code
+	mulx(	u0, w2, w3)		C up[0]^2
+	lea	(w0,u0,2), u0		C "u0" arg in C code
+	adcx(	w1, w2)			C FIXME: crossjump?
+	mulx(	(up), w0, w1)
+	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jmp	L(b1)
+
+	ALIGN(16)
+L(tp2):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed2)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+L(b2):	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp2)
+
+L(ed3):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f2):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	(up,un_save,8), up
+	or	R32(un_save), R32(n)
+	jz	L(cor3)
+	lea	-56(rp,un_save,8), rp
+	mov	-24(up), w3		C up[-1]
+	mov	-16(up), u0		C up[0]
+	shrx(	%rax, w3, w2)
+	sarx(	%rax, w3, w3)
+	and	u0, w3			C "ci" in C code
+	mulx(	u0, w0, w1)		C up[0]^2
+	lea	(w2,u0,2), u0		C "u0" arg in C code
+	adcx(	w3, w0)
+	jmp	L(b2)
+
+	ALIGN(16)
+L(tp3):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed3)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+L(b3):	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp3)
+
+L(ed4):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f3):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	-56(rp,un_save,8), rp
+	mov	-32(up), w1		C up[-1]
+	mov	-24(up), u0		C up[0]
+	shrx(	%rax, w1, w0)
+	sarx(	%rax, w1, w1)
+	and	u0, w1			C "ci" in C code
+	mulx(	u0, w2, w3)		C up[0]^2
+	lea	(w0,u0,2), u0		C "u0" arg in C code
+	adcx(	w1, w2)
+	jmp	L(b3)
+
+	ALIGN(16)
+L(tp4):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed4)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+L(b4):	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp4)
+
+L(ed5):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f4):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	-56(rp,un_save,8), rp
+	mov	-40(up), w3		C up[-1]
+	mov	-32(up), u0		C up[0]
+	shrx(	%rax, w3, w2)
+	sarx(	%rax, w3, w3)
+	and	u0, w3			C "ci" in C code
+	mulx(	u0, w0, w1)		C up[0]^2
+	lea	(w2,u0,2), u0		C "u0" arg in C code
+	adcx(	w3, w0)
+	jmp	L(b4)
+
+	ALIGN(16)
+L(tp5):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed5)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+L(b5):	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp5)
+
+L(ed6):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f5):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	-56(rp,un_save,8), rp
+	mov	-48(up), w1		C up[-1]
+	mov	-40(up), u0		C up[0]
+	shrx(	%rax, w1, w0)
+	sarx(	%rax, w1, w1)
+	and	u0, w1			C "ci" in C code
+	mulx(	u0, w2, w3)		C up[0]^2
+	lea	(w0,u0,2), u0		C "u0" arg in C code
+	adcx(	w1, w2)
+	jmp	L(b5)
+
+	ALIGN(16)
+L(tp6):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed6)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+L(b6):	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp6)
+
+L(ed7):	adox(	(rp), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+L(f6):	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+	lea	(up,un_save,8), up
+	mov	R32(un_save), R32(n)
+	lea	-56(rp,un_save,8), rp
+	mov	-56(up), w3		C up[-1]
+	mov	-48(up), u0		C up[0]
+	shrx(	%rax, w3, w2)
+	sarx(	%rax, w3, w3)
+	and	u0, w3			C "ci" in C code
+	mulx(	u0, w0, w1)		C up[0]^2
+	lea	(w2,u0,2), u0		C "u0" arg in C code
+	adcx(	w3, w0)
+	mulx(	-40,(up), w2, w3)
+	jmp	L(b6)
+
+	ALIGN(16)
+L(tp7):	adox(	-8,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(rp)
+	jrcxz	L(ed7)
+	mulx(	8,(up), w2, w3)
+	adox(	(rp), w0)
+	lea	8(n), R32(n)
+	mov	w0, (rp)
+L(b7):	adcx(	w1, w2)
+	mulx(	16,(up), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(rp), w2)
+	mov	w2, 8(rp)
+	mulx(	24,(up), w2, w3)
+	lea	64(up), up
+	adcx(	w1, w2)
+	adox(	16,(rp), w0)
+	mov	w0, 16(rp)
+	mulx(	-32,(up), w0, w1)
+	adox(	24,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(rp)
+	mulx(	-24,(up), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(rp), w0)
+	mov	w0, 32(rp)
+	mulx(	-16,(up), w0, w1)
+	adox(	40,(rp), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(rp)
+	adox(	48,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 48(rp)
+	lea	64(rp), rp
+	adcx(	w1, w2)
+	mulx(	(up), w0, w1)
+	jmp	L(tp7)
+
+L(cor3):lea	-64(rp), rp
+	mov	-24(up), w3		C up[-1]
+	mov	-16(up), u0		C up[0]
+	shrx(	%rax, w3, w2)
+	sarx(	%rax, w3, w3)
+	and	u0, w3			C "ci" in C code
+	mulx(	u0, w0, w1)		C up[0]^2
+	lea	(w2,u0,2), u0		C "u0" arg in C code
+	adcx(	w3, w0)
+	adox(	56,(rp), w0)
+	mulx(	-8,(up), w2, w3)
+	mov	w0, 56(rp)
+	adcx(	w1, w2)
+	mulx(	(up), %rbx, w1)
+	adox(	64,(rp), w2)
+	adcx(	w3, %rbx)
+	mov	w2, 64(rp)
+	adox(	72,(rp), %rbx)
+	adox(	%rcx, w1)		C relies on rcx = 0
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 80(rp)	C FIXME
+C wd2
+	mov	-16(up), w1		C up[-1]
+	mov	-8(up), u0		C up[0]
+	shrx(	%rax, w1, w0)
+	sarx(	%rax, w1, w1)
+	and	u0, w1			C "ci" in C code
+	mulx(	u0, w2, w3)		C up[0]^2
+	lea	(w0,u0,2), u0		C "u0" arg in C code
+	adcx(	w1, w2)
+	mulx(	(up), w0, %rax)
+	adox(	%rbx, w2)
+	adcx(	w3, w0)
+	mov	w2, 72(rp)
+	adox(	80,(rp), w0)
+	adox(	%rcx, %rax)		C relies on rcx = 0
+	mov	w0, 80(rp)
+	adc	%rcx, %rax		C relies on rcx = 0
+C wd1
+	mov	-8(up), w3		C up[-1]
+	mov	(up), u0		C up[0]
+	sar	$63, w3
+	and	u0, w3			C "ci" in C code
+	mulx(	u0, w0, w1)		C up[0]^2
+	adcx(	w3, w0)
+	adox(	%rax, w0)
+	mov	w0, 88(rp)
+	adcx(	%rcx, w1)
+	adox(	%rcx, w1)
+	mov	w1, 96(rp)
+
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+	JUMPTABSECT
+	ALIGN(8)
+L(mtab):JMPENT(	L(mf7), L(mtab))
+	JMPENT(	L(mf0), L(mtab))
+	JMPENT(	L(mf1), L(mtab))
+	JMPENT(	L(mf2), L(mtab))
+	JMPENT(	L(mf3), L(mtab))
+	JMPENT(	L(mf4), L(mtab))
+	JMPENT(	L(mf5), L(mtab))
+	JMPENT(	L(mf6), L(mtab))
+L(atab):JMPENT(	L(f6), L(atab))
+	JMPENT(	L(f7), L(atab))
+	JMPENT(	L(f0), L(atab))
+	JMPENT(	L(f1), L(atab))
+	JMPENT(	L(f2), L(atab))
+	JMPENT(	L(f3), L(atab))
+	JMPENT(	L(f4), L(atab))
+	JMPENT(	L(f5), L(atab))
+	TEXT
+EPILOGUE()
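+
+C The code above implements the classical squaring identity, with the
+C doubling folded into the multiplier (note the `add u0, u0' before each
+C mulx).  A hedged C-level sketch of that identity follows, not of this
+C exact scheduling; ref_sqr_basecase is a made-up name, and it assumes
+C n >= 2, 64-bit limbs, and a compiler providing unsigned __int128:
+C
+C	void
+C	ref_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C	{
+C	  /* off-diagonal products u[i]*u[j], j > i, at positions i+j */
+C	  rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
+C	  for (mp_size_t i = 1; i < n - 1; i++)
+C	    rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1, n - i - 1, up[i]);
+C	  /* double them, then add the diagonal squares u[i]^2 */
+C	  rp[2*n - 1] = mpn_lshift (rp + 1, rp + 1, 2*n - 2, 1);
+C	  rp[0] = 0;
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
+C	      mp_limb_t lo = (mp_limb_t) sq, hi = (mp_limb_t) (sq >> 64);
+C	      mp_limb_t t0 = rp[2*i] + cy;
+C	      mp_limb_t c = t0 < cy;
+C	      t0 += lo;
+C	      c += t0 < lo;
+C	      rp[2*i] = t0;
+C	      mp_limb_t t1 = rp[2*i + 1] + hi;
+C	      cy = t1 < hi;
+C	      t1 += c;
+C	      cy += t1 < c;
+C	      rp[2*i + 1] = t1;
+C	    }
+C	  /* cy ends up 0: a square fits exactly in 2n limbs */
+C	}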
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/addmul_2.asm b/third_party/gmp/mpn/x86_64/coreihwl/addmul_2.asm
new file mode 100644
index 0000000..9d1c405
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/addmul_2.asm
@@ -0,0 +1,241 @@
+dnl  AMD64 mpn_addmul_2 optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bull	n/a
+C AMD pile	n/a
+C AMD steam	n/a
+C AMD excavator	 ?
+C AMD bobcat	n/a
+C AMD jaguar	n/a
+C Intel P4	n/a
+C Intel core	n/a
+C Intel NHM	n/a
+C Intel SBR	n/a
+C Intel IBR	n/a
+C Intel HWL	 2.15
+C Intel BWL	 2.33
+C Intel SKL	 2.22
+C Intel atom	n/a
+C Intel SLM	n/a
+C VIA nano	n/a
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+define(`rp',     `%rdi')
+define(`up',     `%rsi')
+define(`n_param',`%rdx')
+define(`vp',     `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+define(`X0', `%r12')
+define(`X1', `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	n_param, n
+	shr	$2, n
+
+	test	$1, R8(n_param)
+	jnz	L(bx1)
+
+L(bx0):	mov	(rp), X0
+	mov	8(rp), X1
+	test	$2, R8(n_param)
+	jnz	L(b10)
+
+L(b00):	mov	(up), %rdx
+	lea	16(up), up
+	mulx(	v0, %rax, w1)
+	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	mov	X0, (rp)
+	add	%rax, X1
+	adc	$0, w2
+	mov	-8(up), %rdx
+	lea	16(rp), rp
+	jmp	L(lo0)
+
+L(b10):	mov	(up), %rdx
+	inc	n
+	mulx(	v0, %rax, w1)
+	add	%rax, X0
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	mov	X0, (rp)
+	mov	16(rp), X0
+	add	%rax, X1
+	adc	$0, w2
+	xor	w0, w0
+	jmp	L(lo2)
+
+L(bx1):	mov	(rp), X1
+	mov	8(rp), X0
+	test	$2, R8(n_param)
+	jnz	L(b11)
+
+L(b01):	mov	(up), %rdx
+	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	mov	8(up), %rdx
+	mov	X1, (rp)
+	mov	16(rp), X1
+	mulx(	v0, %rax, w1)
+	lea	24(rp), rp
+	lea	24(up), up
+	jmp	L(lo1)
+
+L(b11):	mov	(up), %rdx
+	inc	n
+	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	mov	X1, (rp)
+	mov	8(up), %rdx
+	mulx(	v0, %rax, w1)
+	lea	8(rp), rp
+	lea	8(up), up
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	lea	32(rp), rp
+	add	w1, X1
+	mov	-16(up), %rdx
+	mov	X1, -24(rp)
+	adc	$0, w3
+	add	w2, X0
+	mov	-8(rp), X1
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo1):	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	add	w3, X0
+	mov	X0, -16(rp)
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	add	w0, X1
+	mov	-8(up), %rdx
+	adc	$0, w2
+L(lo0):	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mov	(rp), X0
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	mov	X1, -8(rp)
+	adc	$0, w3
+	mov	(up), %rdx
+	add	w2, X0
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo3):	add	%rax, X0
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	w3, X0
+	mov	8(rp), X1
+	mov	X0, (rp)
+	mov	16(rp), X0
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+L(lo2):	mov	8(up), %rdx
+	lea	32(up), up
+	dec	n
+	jnz	L(top)
+
+L(end):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rdx, %rax)
+	add	w1, X1
+	mov	X1, 8(rp)
+	adc	$0, w3
+	add	w2, %rdx
+	adc	$0, %rax
+	add	w3, %rdx
+	mov	%rdx, 16(rp)
+	adc	$0, %rax
+
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
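+
+C A hedged C-level sketch of mpn_addmul_2's effect in terms of public mpn
+C calls (ref_addmul_2 is a made-up name): {up,n} * {vp,2} is added to
+C {rp,n}, the low n+1 result limbs are written to rp, and the limb
+C belonging at rp[n+1] is returned:
+C
+C	mp_limb_t
+C	ref_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);	C v0 pass, carry to rp[n]
+C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);	C v1 pass, one limb up
+C	}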
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm
new file mode 100644
index 0000000..ff0d27b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/aorrlsh_n.asm
@@ -0,0 +1,38 @@
+dnl  X86-64 mpn_addlsh_n and mpn_rsblsh_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/zen/aorrlsh_n.asm')
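+
+C A hedged C-level sketch of what the included code computes, via public
+C mpn calls (ref_addlsh_n is a made-up name; mpn_rsblsh_n is analogous
+C but forms (vp << cnt) - up instead):
+C
+C	mp_limb_t
+C	ref_addlsh_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+C		      mp_size_t n, unsigned cnt)
+C	{
+C	  mp_limb_t tp[n];			/* C99 VLA, illustration only */
+C	  mp_limb_t hi = mpn_lshift (tp, vp, n, cnt);  /* assumes 0 < cnt < 64 */
+C	  return hi + mpn_add_n (rp, up, tp, n);
+C	}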
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/aors_n.asm b/third_party/gmp/mpn/x86_64/coreihwl/aors_n.asm
new file mode 100644
index 0000000..fc99627
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/aors_n.asm
@@ -0,0 +1,261 @@
+dnl  AMD64 mpn_add_n, mpn_sub_n
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1	 1.5  with fluctuations
+C AMD bd2	 1.5  with fluctuations
+C AMD bd3
+C AMD bd4	 1.6
+C AMD zen
+C AMD bt1
+C AMD bt2
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL	 1.21
+C Intel BWL	 1.04
+C Intel SKL
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjörn Granlund.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+	define(ADCSBB,	      adc)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADCSBB,	      sbb)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+
+	mov	R32(n), R32(%rax)
+	shr	$3, n
+	and	$7, R32(%rax)
+
+	lea	L(tab)(%rip), %r9
+	neg	%r8			C set carry
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	lea	(%r9,%rax), %rax	C lea not add to preserve carry
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+
+	mov	R32(n), R32(%rax)
+	shr	$3, n
+	and	$7, R32(%rax)		C clear cy as side-effect
+
+	lea	L(tab)(%rip), %r9
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	lea	(%r9,%rax), %rax	C lea not add to preserve carry
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
+L(0):	mov	(up), %r8
+	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	jmp	L(e0)
+
+L(4):	mov	(up), %r8
+	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	lea	-32(up), up
+	lea	-32(vp), vp
+	lea	-32(rp), rp
+	inc	n
+	jmp	L(e4)
+
+L(5):	mov	(up), %r11
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	(vp), %r11
+	lea	-24(up), up
+	lea	-24(vp), vp
+	lea	-24(rp), rp
+	inc	n
+	jmp	L(e5)
+
+L(6):	mov	(up), %r10
+	ADCSBB	(vp), %r10
+	mov	8(up), %r11
+	lea	-16(up), up
+	lea	-16(vp), vp
+	lea	-16(rp), rp
+	inc	n
+	jmp	L(e6)
+
+L(7):	mov	(up), %r9
+	mov	8(up), %r10
+	ADCSBB	(vp), %r9
+	ADCSBB	8(vp), %r10
+	lea	-8(up), up
+	lea	-8(vp), vp
+	lea	-8(rp), rp
+	inc	n
+	jmp	L(e7)
+
+	ALIGN(16)
+L(top):
+L(e3):	mov	%r9, 40(rp)
+L(e2):	mov	%r10, 48(rp)
+L(e1):	mov	(up), %r8
+	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	mov	%r11, 56(rp)
+	lea	64(rp), rp
+L(e0):	mov	16(up), %r10
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	mov	%r8, (rp)
+L(e7):	mov	24(up), %r11
+	mov	%r9, 8(rp)
+L(e6):	mov	32(up), %r8
+	mov	40(up), %r9
+	ADCSBB	24(vp), %r11
+	mov	%r10, 16(rp)
+L(e5):	ADCSBB	32(vp), %r8
+	mov	%r11, 24(rp)
+L(e4):	mov	48(up), %r10
+	mov	56(up), %r11
+	mov	%r8, 32(rp)
+	lea	64(up), up
+	ADCSBB	40(vp), %r9
+	ADCSBB	48(vp), %r10
+	ADCSBB	56(vp), %r11
+	lea	64(vp), vp
+	dec	n
+	jnz	L(top)
+
+L(end):	mov	%r9, 40(rp)
+	mov	%r10, 48(rp)
+	mov	%r11, 56(rp)
+	mov	R32(n), R32(%rax)
+	adc	R32(n), R32(%rax)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(3):	mov	(up), %r9
+	mov	8(up), %r10
+	mov	16(up), %r11
+	ADCSBB	(vp), %r9
+	ADCSBB	8(vp), %r10
+	ADCSBB	16(vp), %r11
+	jrcxz	L(x3)
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	-40(rp), rp
+	jmp	L(e3)
+L(x3):	mov	%r9, (rp)
+	mov	%r10, 8(rp)
+	mov	%r11, 16(rp)
+	mov	R32(n), R32(%rax)
+	adc	R32(n), R32(%rax)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(1):	mov	(up), %r11
+	ADCSBB	(vp), %r11
+	jrcxz	L(x1)
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	-56(rp), rp
+	jmp	L(e1)
+L(x1):	mov	%r11, (rp)
+	mov	R32(n), R32(%rax)
+	adc	R32(n), R32(%rax)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(2):	mov	(up), %r10
+	mov	8(up), %r11
+	ADCSBB	(vp), %r10
+	ADCSBB	8(vp), %r11
+	jrcxz	L(x2)
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	-48(rp), rp
+	jmp	L(e2)
+L(x2):	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	mov	R32(n), R32(%rax)
+	adc	R32(n), R32(%rax)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(4), L(tab))
+	JMPENT(	L(5), L(tab))
+	JMPENT(	L(6), L(tab))
+	JMPENT(	L(7), L(tab))
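+
+C The jump table above enters the 8-way unrolled loop at an offset chosen
+C by n mod 8.  A hedged C-level sketch of the per-limb operation, without
+C the unrolling (ref_add_n is a made-up name; mpn_sub_n is the same with
+C borrows in place of carries):
+C
+C	mp_limb_t
+C	ref_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t t = vp[i] + cy;		/* cy is 0 or 1 */
+C	      mp_limb_t c1 = t < cy;		/* wraps only if vp[i]=~0, cy=1 */
+C	      mp_limb_t s = up[i] + t;
+C	      rp[i] = s;
+C	      cy = c1 | (s < t);
+C	    }
+C	  return cy;
+C	}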
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm
new file mode 100644
index 0000000..3f43afa
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/aorsmul_1.asm
@@ -0,0 +1,201 @@
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bull	 -
+C AMD pile	 -
+C AMD steam	 -
+C AMD excavator	 -
+C AMD bobcat	 -
+C AMD jaguar	 -
+C Intel P4	 -
+C Intel core2	 -
+C Intel NHM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 2.32
+C Intel BWL	 2.04
+C Intel SKL	 1.95
+C Intel atom	 -
+C Intel SLM	 -
+C VIA nano	 -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Handle small n separately, for lower overhead.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0_param',`%rcx')   C r9
+
+define(`n',       `%rbp')
+define(`v0',      `%rdx')
+
+ifdef(`OPERATION_addmul_1',`
+  define(`ADDSUB',        `add')
+  define(`ADCSBB',        `adc')
+  define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+  define(`ADDSUB',        `sub')
+  define(`ADCSBB',        `sbb')
+  define(`func',  `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	mov	n_param, n
+	mov	v0_param, v0
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	shr	$2, n
+	jc	L(b10)
+
+L(b00):	mulx(	(up), %r13, %r12)
+	mulx(	8,(up), %rbx, %rax)
+	add	%r12, %rbx
+	adc	$0, %rax
+	mov	(rp), %r12
+	mov	8(rp), %rcx
+	mulx(	16,(up), %r9, %r8)
+	lea	-16(rp), rp
+	lea	16(up), up
+	ADDSUB	%r13, %r12
+	jmp	L(lo0)
+
+L(bx1):	shr	$2, n
+	jc	L(b11)
+
+L(b01):	mulx(	(up), %r11, %r10)
+	jnz	L(gt1)
+L(n1):	ADDSUB	%r11, (rp)
+	mov	$0, R32(%rax)
+	adc	%r10, %rax
+	jmp	L(ret)
+
+L(gt1):	mulx(	8,(up), %r13, %r12)
+	mulx(	16,(up), %rbx, %rax)
+	lea	24(up), up
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	(rp), %r10
+	mov	8(rp), %r12
+	mov	16(rp), %rcx
+	lea	-8(rp), rp
+	ADDSUB	%r11, %r10
+	jmp	L(lo1)
+
+L(b11):	mulx(	(up), %rbx, %rax)
+	mov	(rp), %rcx
+	mulx(	8,(up), %r9, %r8)
+	lea	8(up), up
+	lea	-24(rp), rp
+	inc	n			C adjust n
+	ADDSUB	%rbx, %rcx
+	jmp	L(lo3)
+
+L(b10):	mulx(	(up), %r9, %r8)
+	mulx(	8,(up), %r11, %r10)
+	lea	-32(rp), rp
+	mov	$0, R32(%rax)
+	clc				C clear cf
+	jz	L(end)			C depends on old shift
+
+	ALIGN(16)
+L(top):	adc	%rax, %r9
+	lea	32(rp), rp
+	adc	%r8, %r11
+	mulx(	16,(up), %r13, %r12)
+	mov	(rp), %r8
+	mulx(	24,(up), %rbx, %rax)
+	lea	32(up), up
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	8(rp), %r10
+	mov	16(rp), %r12
+	ADDSUB	%r9, %r8
+	mov	24(rp), %rcx
+	mov	%r8, (rp)
+	ADCSBB	%r11, %r10
+L(lo1):	mulx(	(up), %r9, %r8)
+	mov	%r10, 8(rp)
+	ADCSBB	%r13, %r12
+L(lo0):	mov	%r12, 16(rp)
+	ADCSBB	%rbx, %rcx
+L(lo3):	mulx(	8,(up), %r11, %r10)
+	mov	%rcx, 24(rp)
+	dec	n
+	jnz	L(top)
+
+L(end):	adc	%rax, %r9
+	adc	%r8, %r11
+	mov	32(rp), %r8
+	mov	%r10, %rax
+	adc	$0, %rax
+	mov	40(rp), %r10
+	ADDSUB	%r9, %r8
+	mov	%r8, 32(rp)
+	ADCSBB	%r11, %r10
+	mov	%r10, 40(rp)
+	adc	$0, %rax
+
+L(ret):	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
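+
+C A hedged C-level sketch of mpn_addmul_1 with 64-bit limbs (ref_addmul_1
+C is a made-up name; assumes unsigned __int128).  mpn_submul_1 is the
+C same with the addition of rp[i] replaced by a subtraction with borrow:
+C
+C	mp_limb_t
+C	ref_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      /* up[i]*v + rp[i] + cy <= B^2 - 1, so this cannot overflow */
+C	      unsigned __int128 p = (unsigned __int128) up[i] * v + rp[i] + cy;
+C	      rp[i] = (mp_limb_t) p;
+C	      cy = (mp_limb_t) (p >> 64);
+C	    }
+C	  return cy;
+C	}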
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/gcd_22.asm b/third_party/gmp/mpn/x86_64/coreihwl/gcd_22.asm
new file mode 100644
index 0000000..b5863b6
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/gcd_22.asm
@@ -0,0 +1,138 @@
+dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, useful tzcnt, shlx.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 6.7
+C AMD bt1	 -
+C AMD bt2	 -
+C AMD zn1	 5.4
+C AMD zn2	 5.5
+C Intel P4	 -
+C Intel CNR	 -
+C Intel PNR	 -
+C Intel NHM	 -
+C Intel WSM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 7.1
+C Intel BWL	 5.5
+C Intel SKL	 5.6
+C Intel atom	 -
+C Intel SLM	 -
+C Intel GLM	 -
+C Intel GLM+	 -
+C VIA nano	 -
+
+
+define(`u1',    `%rdi')
+define(`u0',    `%rsi')
+define(`v1',    `%rdx')
+define(`v0',    `%rcx')
+
+define(`s0',    `%r8')
+define(`s1',    `%r9')
+define(`t0',    `%r10')
+define(`t1',    `%r11')
+define(`cnt',   `%rax')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+	FUNC_ENTRY(4)
+
+	ALIGN(16)
+L(top):	mov	v0, t0
+	sub	u0, t0
+	jz	L(lowz)		C	jump when low limb result = 0
+	mov	v1, t1
+	sbb	u1, t1
+
+	rep;bsf	t0, cnt		C tzcnt!
+
+	mov	u0, s0
+	sub	v0, u0
+	mov	u1, s1
+	sbb	v1, u1
+
+L(bck):	cmovc	t0, u0		C u = |u - v|
+	cmovc	t1, u1		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	cmovc	s1, v1		C v = min(u,v)
+
+	xor	R32(t0), R32(t0)
+	sub	cnt, t0
+	shlx(	t0, u1, s1)
+	shrx(	cnt, u0, u0)
+	shrx(	cnt, u1, u1)
+	or	s1, u0
+
+	test	v1, v1
+	jnz	L(top)
+	test	u1, u1
+	jnz	L(top)
+
+L(gcd_11):
+	mov	v0, %rdi
+C	mov	u0, %rsi
+	TCALL(	mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	mov	v1, t0
+	sub	u1, t0
+	je	L(end)
+
+	xor	t1, t1
+	mov	u0, s0
+	mov	u1, s1
+	rep;bsf	t0, cnt		C tzcnt!
+	mov	u1, u0
+	xor	u1, u1
+	sub	v1, u0
+	jmp	L(bck)
+
+L(end):	mov	v0, %rax
+	C mov	v1, %rdx
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
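+
+C A hedged C-level sketch of the same binary GCD on two-limb operands
+C (ref_gcd_22 and ctz128 are made-up names; assumes u and v odd and
+C nonzero, plus GCC/Clang's unsigned __int128 and __builtin_ctzll).  The
+C asm additionally tail-calls mpn_gcd_11 once both high limbs reach zero:
+C
+C	typedef unsigned __int128 u128;
+C
+C	static int
+C	ctz128 (u128 x)
+C	{
+C	  unsigned long long lo = (unsigned long long) x;
+C	  return lo ? __builtin_ctzll (lo)
+C		    : 64 + __builtin_ctzll ((unsigned long long) (x >> 64));
+C	}
+C
+C	static u128
+C	ref_gcd_22 (u128 u, u128 v)
+C	{
+C	  while (u != v)
+C	    {
+C	      if (v > u)
+C		{ u128 t = u; u = v; v = t; }	/* v = min(u,v) */
+C	      u -= v;				/* u = |u - v|, even, nonzero */
+C	      u >>= ctz128 (u);			/* strip factors of 2 */
+C	    }
+C	  return u;
+C	}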
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/gmp-mparam.h b/third_party/gmp/mpn/x86_64/coreihwl/gmp-mparam.h
new file mode 100644
index 0000000..c11aeec
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/gmp-mparam.h
@@ -0,0 +1,253 @@
+/* Haswell gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3600-4000 MHz Intel Xeon E3-1271v3 Haswell */
+/* FFT tuning limit = 467,964,359 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        26
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD               9
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           25
+
+#define DIV_1_VS_MUL_1_PERCENT             427
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                74
+#define MUL_TOOM44_THRESHOLD               195
+#define MUL_TOOM6H_THRESHOLD               276
+#define MUL_TOOM8H_THRESHOLD               381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     120
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     139
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     128
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     170
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                315
+#define SQR_TOOM6_THRESHOLD                414
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             42
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             376  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    376, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     21, 7}, {     11, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     83,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     79,11}, {     47,10}, {     95,12}, {     31,11}, \
+    {     63,10}, {    127, 9}, {    255,10}, {    135,11}, \
+    {     79,10}, {    167,11}, {     95,10}, {    191, 9}, \
+    {    383,11}, {    111,12}, {     63, 8}, {   1023,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,11}, {    367,10}, {    735,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,11}, {    447,10}, \
+    {    895,11}, {    479,13}, {    127,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    671,12}, {    351,11}, {    735,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    447,11}, {    895,12}, {    479,11}, \
+    {    959,14}, {    127,12}, {    543,11}, {   1087,12}, \
+    {    607,11}, {   1215,10}, {   2431,12}, {    671,11}, \
+    {   1343,12}, {    703,11}, {   1407,12}, {    735,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,13}, \
+    {    511,12}, {   1087,11}, {   2175,13}, {    575,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1343,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1727,13}, {    959,12}, \
+    {   1919,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,12}, {   2815,13}, \
+    {   1471,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1727,14}, {    895,13}, {   1791,12}, {   3583,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2815,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,12}, \
+    {   6911,14}, {   1791,13}, {   3583,14}, {   1919,16}, \
+    {    511,15}, {   1023,14}, {   2175,13}, {   4351,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,12}, {  11775,15}, {   1535,14}, {   3455,13}, \
+    {   6911,15}, {   1791,14}, {   3839,13}, {   7679,16}, \
+    {   1023,15}, {   2047,14}, {   4351,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,13}, {  11775,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,17}, {   1023,16}, {   2047,15}, {   4863,16}, \
+    {   2559,15}, {   5887,14}, {  11775,16}, {   3071,15}, \
+    {   6911,16}, {   3583,15}, {   7679,14}, {  15359,15}, \
+    {   7935,17}, {   2047,16}, {   4095,15}, {   8447,16}, \
+    {   4607,15}, {   9983,14}, {  19967,16}, {   5631,15}, \
+    {  11775,17}, {   3071,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 238
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             368  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    368, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,11}, {     79,10}, {    159, 9}, \
+    {    319,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303, 9}, \
+    {    607,11}, {    159,10}, {    319, 6}, {   5631, 7}, \
+    {   2943, 6}, {   5887, 8}, {   1535,11}, {    207,10}, \
+    {    415,11}, {    223,10}, {    447,11}, {    239,10}, \
+    {    479,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,11}, {    319,10}, {    639,11}, \
+    {    335,10}, {    671,11}, {    351,10}, {    703,11}, \
+    {    367,10}, {    735,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,11}, {    447,10}, {    895,11}, \
+    {    479,13}, {    127,11}, {    511,10}, {   1023,11}, \
+    {    543,10}, {   1087,12}, {    287,11}, {    607,10}, \
+    {   1215,11}, {    671,12}, {    351,11}, {    735,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,12}, {    479,11}, {    959,14}, \
+    {    127,12}, {    511,11}, {   1023,12}, {    543,11}, \
+    {   1087,12}, {    607,11}, {   1215,12}, {    735,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,13}, \
+    {    511,12}, {   1087,13}, {    575,12}, {   1151,13}, \
+    {    639,12}, {   1279,13}, {    703,12}, {   1407,11}, \
+    {   2815,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1727,11}, {   3455,13}, {    959,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1215,14}, \
+    {    639,13}, {   1279,12}, {   2559,13}, {   1343,12}, \
+    {   2687,13}, {   1407,12}, {   2815,13}, {   1471,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1727,14}, \
+    {    895,13}, {   1791,12}, {   3583,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2303,12}, {   4607,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1791,13}, {   3583,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4351,15}, {   2303,14}, {   4863,15}, {   2815,14}, \
+    {   5887,16}, {   1535,15}, {   3071,14}, {   6143,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,15}, {   7935,17}, \
+    {   2047,16}, {   4095,15}, {   8191,16}, {   4607,15}, \
+    {   9983,14}, {  19967,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 237
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  68
+#define MULLO_MUL_N_THRESHOLD             8967
+#define SQRLO_BASECASE_THRESHOLD            11
+#define SQRLO_DC_THRESHOLD                  80
+#define SQRLO_SQR_THRESHOLD               6481
+
+#define DC_DIV_QR_THRESHOLD                 58
+#define DC_DIVAPPR_Q_THRESHOLD             182
+#define DC_BDIV_QR_THRESHOLD                60
+#define DC_BDIV_Q_THRESHOLD                123
+
+#define INV_MULMOD_BNM1_THRESHOLD           38
+#define INV_NEWTON_THRESHOLD               179
+#define INV_APPR_THRESHOLD                 182
+
+#define BINV_NEWTON_THRESHOLD              230
+#define REDC_1_TO_REDC_2_THRESHOLD          48
+#define REDC_2_TO_REDC_N_THRESHOLD          63
+
+#define MU_DIV_QR_THRESHOLD               1470
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD               82
+#define MU_BDIV_QR_THRESHOLD              1334
+#define MU_BDIV_Q_THRESHOLD               1506
+
+#define POWM_SEC_TABLE  1,22,194,473,1297,2698
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD              1391
+#define SET_STR_PRECOMPUTE_THRESHOLD      2654
+
+#define FAC_DSC_THRESHOLD                  562
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD2_DIV1_METHOD                    5  /* 3.49% faster than 3 */
+#define HGCD_THRESHOLD                      96
+#define HGCD_APPR_THRESHOLD                 92
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   501
+#define GCDEXT_DC_THRESHOLD                365
+#define JACOBI_BASE_METHOD                   1  /* 23.87% faster than 4 */
+
+/* Tuneup completed successfully, took 238360 seconds */
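+
+/* As a hedged illustration of how these tuned values are consumed (the
+   select_mul name is hypothetical, and the real dispatch lives in GMP's
+   multiply code), each threshold marks the operand size at which the
+   next algorithm starts to win, roughly:
+
+     static void
+     select_mul (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n,
+                 mp_ptr scratch)   // scratch sized per the chosen routine
+     {
+       if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+         mpn_mul_basecase (rp, up, n, vp, n);
+       else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+         mpn_toom22_mul (rp, up, n, vp, n, scratch);
+       else
+         mpn_toom33_mul (rp, up, n, vp, n, scratch);
+     }
+*/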
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/mul_1.asm b/third_party/gmp/mpn/x86_64/coreihwl/mul_1.asm
new file mode 100644
index 0000000..5e649e8
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/mul_1.asm
@@ -0,0 +1,159 @@
+dnl  AMD64 mpn_mul_1 using mulx optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      -
+C AMD K10        -
+C AMD bull       -
+C AMD pile       -
+C AMD steam      -
+C AMD excavator  -
+C AMD bobcat     -
+C AMD jaguar     -
+C Intel P4       -
+C Intel core2    -
+C Intel NHM      -
+C Intel SBR      -
+C Intel IBR      -
+C Intel HWL      1.59
+C Intel BWL      1.76
+C Intel SKL      1.54
+C Intel atom     -
+C Intel SLM      -
+C VIA nano       -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0_param',`%rcx')   C r9
+
+define(`n',       `%rbp')
+define(`v0',      `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_1)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	push	%r12
+
+	mov	n_param, n
+	shr	$2, n
+
+	test	$1, R8(n_param)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n_param)
+	mov	v0_param, v0
+	jnz	L(b10)
+
+L(b00):	mulx(	(up), %r9, %r8)
+	mulx(	8,(up), %r11, %r10)
+	mulx(	16,(up), %rcx, %r12)
+	lea	-32(rp), rp
+	jmp	L(lo0)
+
+L(b10):	mulx(	(up), %rcx, %r12)
+	mulx(	8,(up), %rbx, %rax)
+	lea	-16(rp), rp
+	test	n, n
+	jz	L(cj2)
+	mulx(	16,(up), %r9, %r8)
+	lea	16(up), up
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n_param)
+	mov	v0_param, v0
+	jnz	L(b11)
+
+L(b01):	mulx(	(up), %rbx, %rax)
+	lea	-24(rp), rp
+	test	n, n
+	jz	L(cj1)
+	mulx(	8,(up), %r9, %r8)
+	lea	8(up), up
+	jmp	L(lo1)
+
+L(b11):	mulx(	(up), %r11, %r10)
+	mulx(	8,(up), %rcx, %r12)
+	mulx(	16,(up), %rbx, %rax)
+	lea	-8(rp), rp
+	test	n, n
+	jz	L(cj3)
+	lea	24(up), up
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):	lea	32(rp), rp
+	mov	%r9, (rp)
+	adc	%r8, %r11
+L(lo3):	mulx(	(up), %r9, %r8)
+	mov	%r11, 8(rp)
+	adc	%r10, %rcx
+L(lo2):	mov	%rcx, 16(rp)
+	adc	%r12, %rbx
+L(lo1):	mulx(	8,(up), %r11, %r10)
+	adc	%rax, %r9
+	mulx(	16,(up), %rcx, %r12)
+	mov	%rbx, 24(rp)
+L(lo0):	mulx(	24,(up), %rbx, %rax)
+	lea	32(up), up
+	dec	n
+	jnz	L(top)
+
+L(end):	lea	32(rp), rp
+	mov	%r9, (rp)
+	adc	%r8, %r11
+L(cj3):	mov	%r11, 8(rp)
+	adc	%r10, %rcx
+L(cj2):	mov	%rcx, 16(rp)
+	adc	%r12, %rbx
+L(cj1):	mov	%rbx, 24(rp)
+	adc	$0, %rax
+
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
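+
+C A hedged C-level sketch of mpn_mul_1 with 64-bit limbs (ref_mul_1 is a
+C made-up name; assumes unsigned __int128).  The loop above computes the
+C same recurrence four limbs per iteration using mulx:
+C
+C	mp_limb_t
+C	ref_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 p = (unsigned __int128) up[i] * v + cy;
+C	      rp[i] = (mp_limb_t) p;
+C	      cy = (mp_limb_t) (p >> 64);
+C	    }
+C	  return cy;
+C	}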
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/mul_2.asm b/third_party/gmp/mpn/x86_64/coreihwl/mul_2.asm
new file mode 100644
index 0000000..f1f044f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/mul_2.asm
@@ -0,0 +1,176 @@
+dnl  AMD64 mpn_mul_2 optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bull	 -
+C AMD pile	 -
+C AMD steam	 -
+C AMD excavator	 -
+C AMD bobcat	 -
+C AMD jaguar	 -
+C Intel P4	 -
+C Intel core	 -
+C Intel NHM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL      3.74
+C Intel BWL      4.21
+C Intel SKL      4.20
+C Intel atom	 -
+C Intel SLM	 -
+C VIA nano	 -
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Move test and jcc together, for insn fusion.
+
+define(`rp',     `%rdi')
+define(`up',     `%rsi')
+define(`n_param',`%rdx')
+define(`vp',     `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	3(n_param), n
+	shr	$2, n
+
+	test	$1, R8(n_param)
+	jnz	L(bx1)
+
+L(bx0):	xor	w0, w0
+	test	$2, R8(n_param)
+	mov	(up), %rdx
+	mulx(	v0, w2, w1)
+	jz	L(lo0)
+
+L(b10):	lea	-16(rp), rp
+	lea	-16(up), up
+	jmp	L(lo2)
+
+L(bx1):	xor	w2, w2
+	test	$2, R8(n_param)
+	mov	(up), %rdx
+	mulx(	v0, w0, w3)
+	jnz	L(b11)
+
+L(b01):	lea	-24(rp), rp
+	lea	8(up), up
+	jmp	L(lo1)
+
+L(b11):	lea	-8(rp), rp
+	lea	-8(up), up
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):	mulx(	v1, %rax, w0)
+	add	%rax, w2		C 0
+	mov	(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0			C 1
+	add	%rax, w2		C 0
+	adc	$0, w1			C 1
+	add	w3, w2			C 0
+L(lo0):	mov	w2, (rp)		C 0
+	adc	$0, w1			C 1
+	mulx(	v1, %rax, w2)
+	add	%rax, w0		C 1
+	mov	8(up), %rdx
+	adc	$0, w2			C 2
+	mulx(	v0, %rax, w3)
+	add	%rax, w0		C 1
+	adc	$0, w3			C 2
+	add	w1, w0			C 1
+L(lo3):	mov	w0, 8(rp)		C 1
+	adc	$0, w3			C 2
+	mulx(	v1, %rax, w0)
+	add	%rax, w2		C 2
+	mov	16(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0			C 3
+	add	%rax, w2		C 2
+	adc	$0, w1			C 3
+	add	w3, w2			C 2
+L(lo2):	mov	w2, 16(rp)		C 2
+	adc	$0, w1			C 3
+	mulx(	v1, %rax, w2)
+	add	%rax, w0		C 3
+	mov	24(up), %rdx
+	adc	$0, w2			C 4
+	mulx(	v0, %rax, w3)
+	add	%rax, w0		C 3
+	adc	$0, w3			C 4
+	add	w1, w0			C 3
+	lea	32(up), up
+L(lo1):	mov	w0, 24(rp)		C 3
+	adc	$0, w3			C 4
+	dec	n
+	lea	32(rp), rp
+	jnz	L(top)
+
+L(end):	mulx(	v1, %rdx, %rax)
+	add	%rdx, w2
+	adc	$0, %rax
+	add	w3, w2
+	mov	w2, (rp)
+	adc	$0, %rax
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
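+
+C A hedged C-level sketch of mpn_mul_2's effect in terms of public mpn
+C calls (ref_mul_2 is a made-up name): {rp,n+1} receives the low part of
+C {up,n} * {vp,2}, and the limb belonging at rp[n+1] is returned:
+C
+C	mp_limb_t
+C	ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);		C up * v0
+C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]);	C += (up * v1) << 64
+C	}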
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/mul_basecase.asm b/third_party/gmp/mpn/x86_64/coreihwl/mul_basecase.asm
new file mode 100644
index 0000000..b2656c8
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/mul_basecase.asm
@@ -0,0 +1,441 @@
+dnl  AMD64 mpn_mul_basecase optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		mul_2		mul_3		addmul_2
+C AMD K8,K9	n/a		n/a		 -		n/a
+C AMD K10	n/a		n/a		 -		n/a
+C AMD bull	n/a		n/a		 -		n/a
+C AMD pile	n/a		n/a		 -		n/a
+C AMD steam	 ?		 ?		 -		 ?
+C AMD bobcat	n/a		n/a		 -		n/a
+C AMD jaguar	 ?		 ?		 -		 ?
+C Intel P4	n/a		n/a		 -		n/a
+C Intel core	n/a		n/a		 -		n/a
+C Intel NHM	n/a		n/a		 -		n/a
+C Intel SBR	n/a		n/a		 -		n/a
+C Intel IBR	n/a		n/a		 -		n/a
+C Intel HWL	 1.77		 1.86		 -		 2.15
+C Intel BWL	 ?		 ?		 -		 ?
+C Intel atom	n/a		n/a		 -		n/a
+C VIA nano	n/a		n/a		 -		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Adjoin a mul_3.
+C  * Further micro-optimise.
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp',      `%rcx')
+define(`vn',      `%r8')
+
+define(`un',      `%rbx')
+
+define(`w0',	`%r10')
+define(`w1',	`%r11')
+define(`w2',	`%r12')
+define(`w3',	`%r13')
+define(`n',	`%rbp')
+define(`v0',	`%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	un_param, un		C free up rdx
+	neg	un
+
+	mov	un_param, n		C FIXME: share
+	sar	$2, n			C FIXME: share
+
+	test	$1, R8(vn)
+	jz	L(do_mul_2)
+
+define(`w4',	`%r9')
+define(`w5',	`%r14')
+
+	mov	(vp), %rdx
+
+L(do_mul_1):
+	test	$1, R8(un)
+	jnz	L(m1x1)
+
+L(m1x0):test	$2, R8(un)
+	jnz	L(m110)
+
+L(m100):
+	mulx(	(up), w5, w2)
+	mulx(	8,(up), w1, w3)
+	lea	-24(rp), rp
+	jmp	L(m1l0)
+
+L(m110):
+	mulx(	(up), w3, w4)
+	mulx(	8,(up), w1, w5)
+	lea	-8(rp), rp
+	test	n, n
+	jz	L(cj2)
+	mulx(	16,(up), w0, w2)
+	lea	16(up), up
+	jmp	L(m1l2)
+
+L(m1x1):test	$2, R8(un)
+	jz	L(m111)
+
+L(m101):
+	mulx(	(up), w4, w5)
+	lea	-16(rp), rp
+	test	n, n
+	jz	L(cj1)
+	mulx(	8,(up), w0, w2)
+	lea	8(up), up
+	jmp	L(m1l1)
+
+L(m111):
+	mulx(	(up), w2, w3)
+	mulx(	8,(up), w0, w4)
+	mulx(	16,(up), w1, w5)
+	lea	24(up), up
+	test	n, n
+	jnz	L(gt3)
+	add	w0, w3
+	jmp	L(cj3)
+L(gt3):	add	w0, w3
+	jmp	L(m1l3)
+
+	ALIGN(32)
+L(m1tp):lea	32(rp), rp
+L(m1l3):mov	w2, (rp)
+	mulx(	(up), w0, w2)
+L(m1l2):mov	w3, 8(rp)
+	adc	w1, w4
+L(m1l1):adc	w0, w5
+	mov	w4, 16(rp)
+	mulx(	8,(up), w1, w3)
+L(m1l0):mov	w5, 24(rp)
+	mulx(	16,(up), w0, w4)
+	adc	w1, w2
+	mulx(	24,(up), w1, w5)
+	adc	w0, w3
+	lea	32(up), up
+	dec	n
+	jnz	L(m1tp)
+
+L(m1ed):lea	32(rp), rp
+L(cj3):	mov	w2, (rp)
+L(cj2):	mov	w3, 8(rp)
+	adc	w1, w4
+L(cj1):	mov	w4, 16(rp)
+	adc	$0, w5
+	mov	w5, 24(rp)
+
+	dec	R32(vn)
+	jz	L(ret5)
+
+	lea	8(vp), vp
+	lea	32(rp), rp
+C	push	%r12
+C	push	%r13
+C	push	%r14
+	jmp	L(do_addmul)
+
+L(do_mul_2):
+define(`v1',	`%r14')
+C	push	%r12
+C	push	%r13
+C	push	%r14
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	(un), n
+	sar	$2, n
+
+	test	$1, R8(un)
+	jnz	L(m2x1)
+
+L(m2x0):xor	w0, w0
+	test	$2, R8(un)
+	mov	(up), %rdx
+	mulx(	v0, w2, w1)
+	jz	L(m2l0)
+
+L(m210):lea	-16(rp), rp
+	lea	-16(up), up
+	jmp	L(m2l2)
+
+L(m2x1):xor	w2, w2
+	test	$2, R8(un)
+	mov	(up), %rdx
+	mulx(	v0, w0, w3)
+	jz	L(m211)
+
+L(m201):lea	-24(rp), rp
+	lea	8(up), up
+	jmp	L(m2l1)
+
+L(m211):lea	-8(rp), rp
+	lea	-8(up), up
+	jmp	L(m2l3)
+
+	ALIGN(16)
+L(m2tp):mulx(	v1, %rax, w0)
+	add	%rax, w2
+	mov	(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+	add	%rax, w2
+	adc	$0, w1
+	add	w3, w2
+L(m2l0):mov	w2, (rp)
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	%rax, w0
+	mov	8(up), %rdx
+	adc	$0, w2
+	mulx(	v0, %rax, w3)
+	add	%rax, w0
+	adc	$0, w3
+	add	w1, w0
+L(m2l3):mov	w0, 8(rp)
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, w2
+	mov	16(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+	add	%rax, w2
+	adc	$0, w1
+	add	w3, w2
+L(m2l2):mov	w2, 16(rp)
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	%rax, w0
+	mov	24(up), %rdx
+	adc	$0, w2
+	mulx(	v0, %rax, w3)
+	add	%rax, w0
+	adc	$0, w3
+	add	w1, w0
+	lea	32(up), up
+L(m2l1):mov	w0, 24(rp)
+	adc	$0, w3
+	inc	n
+	lea	32(rp), rp
+	jnz	L(m2tp)
+
+L(m2ed):mulx(	v1, %rdx, %rax)
+	add	%rdx, w2
+	adc	$0, %rax
+	add	w3, w2
+	mov	w2, (rp)
+	adc	$0, %rax
+	mov	%rax, 8(rp)
+
+	add	$-2, R32(vn)
+	jz	L(ret5)
+	lea	16(vp), vp
+	lea	16(rp), rp
+
+
+L(do_addmul):
+	push	%r15
+	push	vn			C save vn in new stack slot
+define(`vn',	`(%rsp)')
+define(`X0',	`%r14')
+define(`X1',	`%r15')
+define(`v1',	`%r8')
+
+	lea	(rp,un,8), rp
+	lea	(up,un,8), up
+
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	2(un), n
+	sar	$2, n
+
+	mov	(up), %rdx
+	test	$1, R8(un)
+	jnz	L(bx1)
+
+L(bx0):	mov	(rp), X0
+	mov	8(rp), X1
+	mulx(	v0, %rax, w1)
+	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	mov	X0, (rp)
+	add	%rax, X1
+	adc	$0, w2
+	mov	8(up), %rdx
+	test	$2, R8(un)
+	jnz	L(b10)
+
+L(b00):	lea	16(up), up
+	lea	16(rp), rp
+	jmp	L(lo0)
+
+L(b10):	mov	16(rp), X0
+	lea	32(up), up
+	mulx(	v0, %rax, w3)
+	jmp	L(lo2)
+
+L(bx1):	mov	(rp), X1
+	mov	8(rp), X0
+	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	mov	8(up), %rdx
+	mov	X1, (rp)
+	mulx(	v0, %rax, w1)
+	test	$2, R8(un)
+	jz	L(b11)
+
+L(b01):	mov	16(rp), X1
+	lea	24(rp), rp
+	lea	24(up), up
+	jmp	L(lo1)
+
+L(b11):	lea	8(rp), rp
+	lea	8(up), up
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+L(lo2):	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	lea	32(rp), rp
+	add	w1, X1
+	mov	-16(up), %rdx
+	mov	X1, -24(rp)
+	adc	$0, w3
+	add	w2, X0
+	mov	-8(rp), X1
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo1):	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	add	w3, X0
+	mov	X0, -16(rp)
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	add	w0, X1
+	mov	-8(up), %rdx
+	adc	$0, w2
+L(lo0):	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mov	(rp), X0
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	mov	X1, -8(rp)
+	adc	$0, w3
+	mov	(up), %rdx
+	add	w2, X0
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo3):	add	%rax, X0
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	w3, X0
+	mov	8(rp), X1
+	mov	X0, (rp)
+	mov	16(rp), X0
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	mov	8(up), %rdx
+	lea	32(up), up
+	inc	n
+	jnz	L(top)
+
+L(end):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rdx, %rax)
+	add	w1, X1
+	mov	X1, 8(rp)
+	adc	$0, w3
+	add	w2, %rdx
+	adc	$0, %rax
+	add	w3, %rdx
+	mov	%rdx, 16(rp)
+	adc	$0, %rax
+	mov	%rax, 24(rp)
+
+	addl	$-2, vn
+	lea	16(vp), vp
+	lea	-16(up,un,8), up
+	lea	32(rp,un,8), rp
+	jnz	L(outer)
+
+	pop	%rax		C deallocate vn slot
+	pop	%r15
+L(ret5):pop	%r14
+L(ret4):pop	%r13
+L(ret3):pop	%r12
+L(ret2):pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm
new file mode 100644
index 0000000..e65559b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/mullo_basecase.asm
@@ -0,0 +1,422 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_2		addmul_2
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bull	n/a		n/a
+C AMD pile	n/a		n/a
+C AMD steam	 ?		 ?
+C AMD bobcat	n/a		n/a
+C AMD jaguar	 ?		 ?
+C Intel P4	n/a		n/a
+C Intel core	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	 1.86		 2.15
+C Intel BWL	 ?		 ?
+C Intel atom	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C   * Implement proper cor2, replacing current cor0.
+C   * Micro-optimise.
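+
+C mpn_mullo_basecase(rp, up, vp, n) stores just the n low limbs of the
+C 2n-limb product {up,n} * {vp,n} at rp; the high half is never computed.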
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp_param', `%rdx')
+define(`n',        `%rcx')
+
+define(`vp',       `%r8')
+define(`X0',       `%r14')
+define(`X1',       `%r15')
+
+define(`w0',       `%r10')
+define(`w1',       `%r11')
+define(`w2',       `%r12')
+define(`w3',       `%r13')
+define(`i',        `%rbp')
+define(`v0',       `%r9')
+define(`v1',       `%rbx')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+
+	mov	vp_param, vp
+	mov	(up), %rdx
+
+	cmp	$4, n
+	jb	L(small)
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	2(n), i
+	shr	$2, i
+	neg	n
+	add	$2, n
+
+	push	up			C put entry `up' on stack
+
+	test	$1, R8(n)
+	jnz	L(m2x1)
+
+L(m2x0):mulx(	v0, w0, w3)
+	xor	R32(w2), R32(w2)
+	test	$2, R8(n)
+	jz	L(m2b2)
+
+L(m2b0):lea	-8(rp), rp
+	lea	-8(up), up
+	jmp	L(m2e0)
+
+L(m2b2):lea	-24(rp), rp
+	lea	8(up), up
+	jmp	L(m2e2)
+
+L(m2x1):mulx(	v0, w2, w1)
+	xor	R32(w0), R32(w0)
+	test	$2, R8(n)
+	jnz	L(m2b3)
+
+L(m2b1):jmp	L(m2e1)
+
+L(m2b3):lea	-16(rp), rp
+	lea	-16(up), up
+	jmp	L(m2e3)
+
+	ALIGN(16)
+L(m2tp):mulx(	v1, %rax, w0)
+	add	%rax, w2
+	mov	(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+	add	%rax, w2
+	adc	$0, w1
+	add	w3, w2
+L(m2e1):mov	w2, (rp)
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	%rax, w0
+	mov	8(up), %rdx
+	adc	$0, w2
+	mulx(	v0, %rax, w3)
+	add	%rax, w0
+	adc	$0, w3
+	add	w1, w0
+L(m2e0):mov	w0, 8(rp)
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, w2
+	mov	16(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+	add	%rax, w2
+	adc	$0, w1
+	add	w3, w2
+L(m2e3):mov	w2, 16(rp)
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	%rax, w0
+	mov	24(up), %rdx
+	adc	$0, w2
+	mulx(	v0, %rax, w3)
+	add	%rax, w0
+	adc	$0, w3
+	add	w1, w0
+	lea	32(up), up
+L(m2e2):mov	w0, 24(rp)
+	adc	$0, w3
+	dec	i
+	lea	32(rp), rp
+	jnz	L(m2tp)
+
+L(m2ed):mulx(	v1, %rax, w0)
+	add	%rax, w2
+	mov	(up), %rdx
+	mulx(	v0, %rax, w1)
+	add	w2, %rax
+	add	w3, %rax
+	mov	%rax, (rp)
+
+	mov	(%rsp), up		C restore `up' to beginning
+	lea	16(vp), vp
+	lea	8(rp,n,8), rp		C put back rp to old rp + 2
+	add	$2, n
+	jge	L(cor1)
+
+	push	%r14
+	push	%r15
+
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	(n), i
+	sar	$2, i
+
+	mov	(up), %rdx
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	mov	(rp), X1
+	mov	8(rp), X0
+	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	mov	8(up), %rdx
+	mov	X1, (rp)
+	mulx(	v0, %rax, w1)
+	test	$2, R8(n)
+	jz	L(b2)
+
+L(b0):	lea	8(rp), rp
+	lea	8(up), up
+	jmp	L(lo0)
+
+L(b2):	mov	16(rp), X1
+	lea	24(rp), rp
+	lea	24(up), up
+	jmp	L(lo2)
+
+L(bx1):	mov	(rp), X0
+	mov	8(rp), X1
+	mulx(	v0, %rax, w1)
+	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	mov	X0, (rp)
+	add	%rax, X1
+	adc	$0, w2
+	mov	8(up), %rdx
+	test	$2, R8(n)
+	jnz	L(b3)
+
+L(b1):	lea	16(up), up
+	lea	16(rp), rp
+	jmp	L(lo1)
+
+L(b3):	mov	16(rp), X0
+	lea	32(up), up
+	mulx(	v0, %rax, w3)
+	inc	i
+	jz	L(cj3)
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+L(lo3):	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	lea	32(rp), rp
+	add	w1, X1
+	mov	-16(up), %rdx
+	mov	X1, -24(rp)
+	adc	$0, w3
+	add	w2, X0
+	mov	-8(rp), X1
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo2):	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	add	w3, X0
+	mov	X0, -16(rp)
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	add	w0, X1
+	mov	-8(up), %rdx
+	adc	$0, w2
+L(lo1):	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mov	(rp), X0
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	mov	X1, -8(rp)
+	adc	$0, w3
+	mov	(up), %rdx
+	add	w2, X0
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo0):	add	%rax, X0
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	w3, X0
+	mov	8(rp), X1
+	mov	X0, (rp)
+	mov	16(rp), X0
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	mov	8(up), %rdx
+	lea	32(up), up
+	inc	i
+	jnz	L(top)
+
+L(end):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+L(cj3):	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	add	w1, X1
+	mov	-16(up), %rdx
+	mov	X1, 8(rp)
+	adc	$0, w3
+	add	w2, X0
+	mulx(	v0, %rax, w1)
+	add	X0, %rax
+	add	w3, %rax
+	mov	%rax, 16(rp)
+
+	mov	16(%rsp), up		C restore `up' to beginning
+	lea	16(vp), vp
+	lea	24(rp,n,8), rp		C put back rp to old rp + 2
+	add	$2, n
+	jl	L(outer)
+
+	pop	%r15
+	pop	%r14
+
+	jnz	L(cor0)
+
+L(cor1):mov	(vp), v0
+	mov	8(vp), v1
+	mov	(up), %rdx
+	mulx(	v0, %r12, %rbp)		C u0 x v2
+	add	(rp), %r12		C FIXME: rp[0] still available in reg?
+	adc	%rax, %rbp
+	mov	8(up), %r10
+	imul	v0, %r10
+	imul	v1, %rdx
+	mov	%r12, (rp)
+	add	%r10, %rdx
+	add	%rbp, %rdx
+	mov	%rdx, 8(rp)
+	pop	%rax			C deallocate `up' copy
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(cor0):mov	(vp), %r11
+	imul	(up), %r11
+	add	%rax, %r11
+	mov	%r11, (rp)
+	pop	%rax			C deallocate `up' copy
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(small):
+	cmp	$2, n
+	jae	L(gt1)
+L(n1):	imul	(vp), %rdx
+	mov	%rdx, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
+L(n2):	mov	(vp), %r9
+	mulx(	%r9, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp), %r9
+	mulx(	%r9, %rax, %r10)	C u0 x v0
+	mov	%rax, (rp)
+	mov	8(up), %rdx
+	mulx(	%r9, %rax, %rdx)	C u1 x v0
+	imul	16(up), %r9		C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r11
+	mov	(up), %rdx
+	mulx(	%r11, %rax, %rdx)	C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r11		C u1 x v1
+	add	%r11, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10		C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/redc_1.asm b/third_party/gmp/mpn/x86_64/coreihwl/redc_1.asm
new file mode 100644
index 0000000..b1d6c0a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/redc_1.asm
@@ -0,0 +1,437 @@
+dnl  AMD64 mpn_redc_1 optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bull	n/a
+C AMD pile	n/a
+C AMD steam	 ?
+C AMD bobcat	n/a
+C AMD jaguar	 ?
+C Intel P4	n/a
+C Intel core	n/a
+C Intel NHM	n/a
+C Intel SBR	n/a
+C Intel IBR	n/a
+C Intel HWL	 2.32
+C Intel BWL	 ?
+C Intel atom	n/a
+C VIA nano	n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Micro-optimise.
+C  * Consider inlining mpn_add_n.  Tests indicate that this saves just 1-2
+C    cycles, though.
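+
+C mpn_redc_1(rp, up, mp, n, u0inv) performs Montgomery reduction (REDC) of
+C the 2n-limb value {up,2n} modulo {mp,n}, where the caller supplies
+C u0inv = -1/mp[0] mod 2^64; the n-limb result is written to rp.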
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv_param', `%r8')    C stack
+
+define(`i',           `%r14')
+define(`j',           `%r15')
+define(`mp',          `%rdi')
+define(`u0inv',       `(%rsp)')  C stack
+
+ABI_SUPPORT(DOS64)    C FIXME: needs verification
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	push	rp
+	mov	mp_param, mp		C note that rp and mp share a register
+	mov	(up), %rdx
+
+	neg	n
+	push	%r8			C put u0inv on stack
+	imul	u0inv_param, %rdx	C first iteration q0
+	mov	n, j			C outer loop induction var
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n)
+	jz	L(o0b)
+
+	cmp	$-2, R32(n)
+	jnz	L(o2)
+
+C Special code for n = 2 since general code cannot handle it
+	mov	8(%rsp), %rbx		C rp
+	lea	16(%rsp), %rsp		C deallocate two slots
+	mulx(	(mp), %r9, %r12)
+	mulx(	8,(mp), %r11, %r10)
+	add	%r12, %r11
+	adc	$0, %r10
+	add	(up), %r9		C = 0
+	adc	8(up), %r11		C r11 = up[1]
+	adc	$0, %r10		C -> up[0]
+	mov	%r11, %rdx
+	imul	u0inv_param, %rdx
+	mulx(	(mp), %r13, %r12)
+	mulx(	8,(mp), %r14, %r15)
+	xor	R32(%rax), R32(%rax)
+	add	%r12, %r14
+	adc	$0, %r15
+	add	%r11, %r13		C = 0
+	adc	16(up), %r14		C rp[2]
+	adc	$0, %r15		C -> up[1]
+	add	%r14, %r10
+	adc	24(up), %r15
+	mov	%r10, (%rbx)
+	mov	%r15, 8(%rbx)
+	setc	R8(%rax)
+	jmp	L(ret)
+
+L(o2):	lea	2(n), i			C inner loop induction var
+	mulx(	(mp), %r9, %r8)
+	mulx(	8,(mp), %r11, %r10)
+	sar	$2, i
+	add	%r8, %r11
+	jmp	L(lo2)
+
+	ALIGN(16)
+L(tp2):	adc	%rax, %r9
+	lea	32(up), up
+	adc	%r8, %r11
+L(lo2):	mulx(	16,(mp), %r13, %r12)
+	mov	(up), %r8
+	mulx(	24,(mp), %rbx, %rax)
+	lea	32(mp), mp
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	8(up), %r10
+	mov	16(up), %r12
+	add	%r9, %r8
+	mov	24(up), %rbp
+	mov	%r8, (up)
+	adc	%r11, %r10
+	mulx(	(mp), %r9, %r8)
+	mov	%r10, 8(up)
+	adc	%r13, %r12
+	mov	%r12, 16(up)
+	adc	%rbx, %rbp
+	mulx(	8,(mp), %r11, %r10)
+	mov	%rbp, 24(up)
+	inc	i
+	jnz	L(tp2)
+
+L(ed2):	mov	56(up,n,8), %rdx	C next iteration up[0]
+	lea	16(mp,n,8), mp		C mp = (last starting mp)
+	adc	%rax, %r9
+	adc	%r8, %r11
+	mov	32(up), %r8
+	adc	$0, %r10
+	imul	u0inv, %rdx		C next iteration q0
+	mov	40(up), %rax
+	add	%r9, %r8
+	mov	%r8, 32(up)
+	adc	%r11, %rax
+	mov	%rax, 40(up)
+	lea	56(up,n,8), up		C up = (last starting up) + 1
+	adc	$0, %r10
+	mov	%r10, -8(up)
+	inc	j
+	jnz	L(o2)
+
+	jmp	L(cj)
+
+
+L(bx1):	test	$2, R8(n)
+	jz	L(o3a)
+
+L(o1a):	cmp	$-1, R32(n)
+	jnz	L(o1b)
+
+C Special code for n = 1 since general code cannot handle it
+	mov	8(%rsp), %rbx		C rp
+	lea	16(%rsp), %rsp		C deallocate two slots
+	mulx(	(mp), %r11, %r10)
+	add	(up), %r11
+	adc	8(up), %r10
+	mov	%r10, (%rbx)
+	mov	$0, R32(%rax)
+	setc	R8(%rax)
+	jmp	L(ret)
+
+L(o1b):	lea	24(mp), mp
+L(o1):	lea	1(n), i			C inner loop induction var
+	mulx(	-24,(mp), %r11, %r10)
+	mulx(	-16,(mp), %r13, %r12)
+	mulx(	-8,(mp), %rbx, %rax)
+	sar	$2, i
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	(up), %r10
+	mov	8(up), %r12
+	mov	16(up), %rbp
+	add	%r11, %r10
+	jmp	L(lo1)
+
+	ALIGN(16)
+L(tp1):	adc	%rax, %r9
+	lea	32(up), up
+	adc	%r8, %r11
+	mulx(	16,(mp), %r13, %r12)
+	mov	-8(up), %r8
+	mulx(	24,(mp), %rbx, %rax)
+	lea	32(mp), mp
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	(up), %r10
+	mov	8(up), %r12
+	add	%r9, %r8
+	mov	16(up), %rbp
+	mov	%r8, -8(up)
+	adc	%r11, %r10
+L(lo1):	mulx(	(mp), %r9, %r8)
+	mov	%r10, (up)
+	adc	%r13, %r12
+	mov	%r12, 8(up)
+	adc	%rbx, %rbp
+	mulx(	8,(mp), %r11, %r10)
+	mov	%rbp, 16(up)
+	inc	i
+	jnz	L(tp1)
+
+L(ed1):	mov	48(up,n,8), %rdx	C next iteration up[0]
+	lea	40(mp,n,8), mp		C mp = (last starting mp)
+	adc	%rax, %r9
+	adc	%r8, %r11
+	mov	24(up), %r8
+	adc	$0, %r10
+	imul	u0inv, %rdx		C next iteration q0
+	mov	32(up), %rax
+	add	%r9, %r8
+	mov	%r8, 24(up)
+	adc	%r11, %rax
+	mov	%rax, 32(up)
+	lea	48(up,n,8), up		C up = (last starting up) + 1
+	adc	$0, %r10
+	mov	%r10, -8(up)
+	inc	j
+	jnz	L(o1)
+
+	jmp	L(cj)
+
+L(o3a):	cmp	$-3, R32(n)
+	jnz	L(o3b)
+
+C Special code for n = 3 since general code cannot handle it
+L(n3):	mulx(	(mp), %rbx, %rax)
+	mulx(	8,(mp), %r9, %r14)
+	add	(up), %rbx
+	mulx(	16,(mp), %r11, %r10)
+	adc	%rax, %r9		C W 1
+	adc	%r14, %r11		C W 2
+	mov	8(up), %r14
+	mov	u0inv_param, %rdx
+	adc	$0, %r10		C W 3
+	mov	16(up), %rax
+	add	%r9, %r14		C W 1
+	mov	%r14, 8(up)
+	mulx(	%r14, %rdx, %r13)	C next iteration q0
+	adc	%r11, %rax		C W 2
+	mov	%rax, 16(up)
+	adc	$0, %r10		C W 3
+	mov	%r10, (up)
+	lea	8(up), up		C up = (last starting up) + 1
+	inc	j
+	jnz	L(n3)
+
+	jmp	L(cj)
+
+L(o3b):	lea	8(mp), mp
+L(o3):	lea	4(n), i			C inner loop induction var
+	mulx(	-8,(mp), %rbx, %rax)
+	mulx(	(mp), %r9, %r8)
+	mov	(up), %rbp
+	mulx(	8,(mp), %r11, %r10)
+	sar	$2, i
+	add	%rbx, %rbp
+	nop
+	adc	%rax, %r9
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(tp3):	adc	%rax, %r9
+	lea	32(up), up
+L(lo3):	adc	%r8, %r11
+	mulx(	16,(mp), %r13, %r12)
+	mov	8(up), %r8
+	mulx(	24,(mp), %rbx, %rax)
+	lea	32(mp), mp
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	16(up), %r10
+	mov	24(up), %r12
+	add	%r9, %r8
+	mov	32(up), %rbp
+	mov	%r8, 8(up)
+	adc	%r11, %r10
+	mulx(	(mp), %r9, %r8)
+	mov	%r10, 16(up)
+	adc	%r13, %r12
+	mov	%r12, 24(up)
+	adc	%rbx, %rbp
+	mulx(	8,(mp), %r11, %r10)
+	mov	%rbp, 32(up)
+	inc	i
+	jnz	L(tp3)
+
+L(ed3):	mov	64(up,n,8), %rdx	C next iteration up[0]
+	lea	24(mp,n,8), mp		C mp = (last starting mp)
+	adc	%rax, %r9
+	adc	%r8, %r11
+	mov	40(up), %r8
+	adc	$0, %r10
+	imul	u0inv, %rdx		C next iteration q0
+	mov	48(up), %rax
+	add	%r9, %r8
+	mov	%r8, 40(up)
+	adc	%r11, %rax
+	mov	%rax, 48(up)
+	lea	64(up,n,8), up		C up = (last starting up) + 1
+	adc	$0, %r10
+	mov	%r10, -8(up)
+	inc	j
+	jnz	L(o3)
+
+	jmp	L(cj)
+
+L(o0b):	lea	16(mp), mp
+L(o0):	mov	n, i			C inner loop induction var
+	mulx(	-16,(mp), %r13, %r12)
+	mulx(	-8,(mp), %rbx, %rax)
+	sar	$2, i
+	add	%r12, %rbx
+	adc	$0, %rax
+	mov	(up), %r12
+	mov	8(up), %rbp
+	mulx(	(mp), %r9, %r8)
+	add	%r13, %r12
+	jmp	L(lo0)
+
+	ALIGN(16)
+L(tp0):	adc	%rax, %r9
+	lea	32(up), up
+	adc	%r8, %r11
+	mulx(	16,(mp), %r13, %r12)
+	mov	-16(up), %r8
+	mulx(	24,(mp), %rbx, %rax)
+	lea	32(mp), mp
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	-8(up), %r10
+	mov	(up), %r12
+	add	%r9, %r8
+	mov	8(up), %rbp
+	mov	%r8, -16(up)
+	adc	%r11, %r10
+	mulx(	(mp), %r9, %r8)
+	mov	%r10, -8(up)
+	adc	%r13, %r12
+	mov	%r12, (up)
+L(lo0):	adc	%rbx, %rbp
+	mulx(	8,(mp), %r11, %r10)
+	mov	%rbp, 8(up)
+	inc	i
+	jnz	L(tp0)
+
+L(ed0):	mov	40(up,n,8), %rdx	C next iteration up[0]
+	lea	32(mp,n,8), mp		C mp = (last starting mp)
+	adc	%rax, %r9
+	adc	%r8, %r11
+	mov	16(up), %r8
+	adc	$0, %r10
+	imul	u0inv, %rdx		C next iteration q0
+	mov	24(up), %rax
+	add	%r9, %r8
+	mov	%r8, 16(up)
+	adc	%r11, %rax
+	mov	%rax, 24(up)
+	lea	40(up,n,8), up		C up = (last starting up) + 1
+	adc	$0, %r10
+	mov	%r10, -8(up)
+	inc	j
+	jnz	L(o0)
+
+L(cj):
+IFSTD(`	mov	8(%rsp), %rdi		C param 1: rp
+	lea	16-8(%rsp), %rsp	C deallocate 2, add back for alignment
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	mov	up, %rdx		C param 2: up
+	lea	(up,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	8(%rsp), %rcx		C param 1: rp
+	lea	16-32-8(%rsp), %rsp')	C deallocate 2, allocate shadow, align
+
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+
+IFSTD(`	lea	8(%rsp), %rsp	')
+IFDOS(`	lea	32+8(%rsp), %rsp')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm
new file mode 100644
index 0000000..641cdf3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreihwl/sqr_basecase.asm
@@ -0,0 +1,506 @@
+dnl  AMD64 mpn_sqr_basecase optimised for Intel Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
+C AMD K8,K9	n/a		n/a			n/a
+C AMD K10	n/a		n/a			n/a
+C AMD bull	n/a		n/a			n/a
+C AMD pile	n/a		n/a			n/a
+C AMD steam	 ?		 ?			 ?
+C AMD bobcat	n/a		n/a			n/a
+C AMD jaguar	 ?		 ?			 ?
+C Intel P4	n/a		n/a			n/a
+C Intel core	n/a		n/a			n/a
+C Intel NHM	n/a		n/a			n/a
+C Intel SBR	n/a		n/a			n/a
+C Intel IBR	n/a		n/a			n/a
+C Intel HWL	 1.86		 2.15			~2.5
+C Intel BWL	 ?		 ?			 ?
+C Intel atom	n/a		n/a			n/a
+C VIA nano	n/a		n/a			n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
+C that the sqr_diag_addlsh1 loop was manually written.
+
+C TODO
+C  * Replace current unoptimised sqr_diag_addlsh1 loop; 1.75 c/l might be
+C    possible.
+C  * Consider splitting outer loop into 2, one for n = 1 (mod 2) and one for
+C    n = 0 (mod 2).  These loops could fall into specific "corner" code.
+C  * Consider splitting outer loop into 4.
+C  * Streamline pointer updates.
+C  * Perhaps suppress a few more xor insns in feed-in code.
+C  * Make sure we write no dead registers in feed-in code.
+C  * We might use 32-bit size ops, since n >= 2^32 is non-terminating.  Watch
+C    out for negative sizes being zero-extended, though.
+C  * Provide straight-line code for n = 4; then look for simplifications in
+C    main code.
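+
+C Structure: a mul_2 pass forms the first off-diagonal products, addmul_2
+C passes accumulate the rest of the triangle, and the sqr_diag_addlsh1 loop
+C finally doubles that triangle while adding in the squared diagonal limbs.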
+
+define(`rp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`un_param',`%rdx')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$2, un_param
+	jae	L(gt1)
+
+	mov	(up), %rdx
+	mulx(	%rdx, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	jne	L(gt2)
+
+	mov	(up), %rdx
+	mov	8(up), %rcx
+	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
+	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
+	mov	%rcx, %rdx
+	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
+	add	%r9, %r9		C		W 1
+	adc	%r10, %r10		C		W 2
+	adc	$0, %rdx		C		W 3
+	add	%r9, %r8		C W 1
+	adc	%r11, %r10		C W 2
+	adc	$0, %rdx		C W 3
+	mov	%rax, (rp)
+	mov	%r8, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt2):	cmp	$4, un_param
+	jae	L(gt3)
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w2', `%r11')
+
+	mov	(up), v0
+	mov	8(up), %rdx
+	mov	%rdx, v1
+	mulx(	v0, w2, %rax)
+	mov	16(up), %rdx
+	mulx(	v0, w0, %rcx)
+	mov	w2, %r8
+	add	%rax, w0
+	adc	$0, %rcx
+	mulx(	v1, %rdx, %rax)
+	add	%rcx, %rdx
+	mov	%rdx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+	xor	R32(%rcx), R32(%rcx)
+	mov	(up), %rdx
+	mulx(	%rdx, %rax, w2)
+	mov	%rax, (rp)
+	add	%r8, %r8
+	adc	w0, w0
+	setc	R8(%rcx)
+	mov	8(up), %rdx
+	mulx(	%rdx, %rax, %rdx)
+	add	w2, %r8
+	adc	%rax, w0
+	mov	%r8, 8(rp)
+	mov	w0, 16(rp)
+	mov	24(rp), %r8
+	mov	32(rp), w0
+	lea	(%rdx,%rcx), w2
+	adc	%r8, %r8
+	adc	w0, w0
+	setc	R8(%rcx)
+	mov	16(up), %rdx
+	mulx(	%rdx, %rax, %rdx)
+	add	w2, %r8
+	adc	%rax, w0
+	mov	%r8, 24(rp)
+	mov	w0, 32(rp)
+	adc	%rcx, %rdx
+	mov	%rdx, 40(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt3):
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%rbx')
+define(`w3', `%rbp')
+define(`un', `%r12')
+define(`n',  `%rcx')
+
+define(`X0', `%r13')
+define(`X1', `%r14')
+
+L(do_mul_2):
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	mov	$0, R32(un)
+	sub	un_param, un		C free up rdx
+	push	un
+	mov	(up), v0
+	mov	8(up), %rdx
+	lea	2(un), n
+	sar	$2, n			C FIXME: suppress, change loop?
+	inc	un			C decrement |un|
+	mov	%rdx, v1
+
+	test	$1, R8(un)
+	jnz	L(mx1)
+
+L(mx0):	mulx(	v0, w2, w1)
+	mov	16(up), %rdx
+	mov	w2, 8(rp)
+	xor	w2, w2
+	mulx(	v0, w0, w3)
+	test	$2, R8(un)
+	jz	L(m00)
+
+L(m10):	lea	-8(rp), rp
+	lea	-8(up), up
+	jmp	L(mlo2)
+
+L(m00):	lea	8(up), up
+	lea	8(rp), rp
+	jmp	L(mlo0)
+
+L(mx1):	mulx(	v0, w0, w3)
+	mov	16(up), %rdx
+	mov	w0, 8(rp)
+	xor	w0, w0
+	mulx(	v0, w2, w1)
+	test	$2, R8(un)
+	jz	L(mlo3)
+
+L(m01):	lea	16(rp), rp
+	lea	16(up), up
+	jmp	L(mlo1)
+
+	ALIGN(32)
+L(mtop):mulx(	v1, %rax, w0)
+	add	%rax, w2		C 0
+	mov	(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0			C 1
+	add	%rax, w2		C 0
+L(mlo1):adc	$0, w1			C 1
+	add	w3, w2			C 0
+	mov	w2, (rp)		C 0
+	adc	$0, w1			C 1
+	mulx(	v1, %rax, w2)
+	add	%rax, w0		C 1
+	mov	8(up), %rdx
+	adc	$0, w2			C 2
+	mulx(	v0, %rax, w3)
+	add	%rax, w0		C 1
+	adc	$0, w3			C 2
+L(mlo0):add	w1, w0			C 1
+	mov	w0, 8(rp)		C 1
+	adc	$0, w3			C 2
+	mulx(	v1, %rax, w0)
+	add	%rax, w2		C 2
+	mov	16(up), %rdx
+	mulx(	v0, %rax, w1)
+	adc	$0, w0			C 3
+	add	%rax, w2		C 2
+	adc	$0, w1			C 3
+L(mlo3):add	w3, w2			C 2
+	mov	w2, 16(rp)		C 2
+	adc	$0, w1			C 3
+	mulx(	v1, %rax, w2)
+	add	%rax, w0		C 3
+	mov	24(up), %rdx
+	adc	$0, w2			C 4
+	mulx(	v0, %rax, w3)
+	add	%rax, w0		C 3
+	adc	$0, w3			C 4
+L(mlo2):add	w1, w0			C 3
+	lea	32(up), up
+	mov	w0, 24(rp)		C 3
+	adc	$0, w3			C 4
+	inc	n
+	lea	32(rp), rp
+	jnz	L(mtop)
+
+L(mend):mulx(	v1, %rdx, %rax)
+	add	%rdx, w2
+	adc	$0, %rax
+	add	w3, w2
+	mov	w2, (rp)
+	adc	$0, %rax
+	mov	%rax, 8(rp)
+
+	lea	16(up), up
+	lea	-16(rp), rp
+
+L(do_addmul_2):
+L(outer):
+	lea	(up,un,8), up		C put back up to 2 positions above last time
+	lea	48(rp,un,8), rp		C put back rp to 4 positions above last time
+
+	mov	-8(up), v0		C shared between addmul_2 and corner
+
+	add	$2, un			C decrease |un|
+	cmp	$-2, un
+	jge	L(corner)
+
+	mov	(up), v1
+
+	lea	1(un), n
+	sar	$2, n			C FIXME: suppress, change loop?
+
+	mov	v1, %rdx
+	test	$1, R8(un)
+	jnz	L(bx1)
+
+L(bx0):	mov	(rp), X0
+	mov	8(rp), X1
+	mulx(	v0, %rax, w1)
+	add	%rax, X0
+	adc	$0, w1
+	mov	X0, (rp)
+	xor	w2, w2
+	test	$2, R8(un)
+	jnz	L(b10)
+
+L(b00):	mov	8(up), %rdx
+	lea	16(rp), rp
+	lea	16(up), up
+	jmp	L(lo0)
+
+L(b10):	mov	8(up), %rdx
+	mov	16(rp), X0
+	lea	32(up), up
+	inc	n
+	mulx(	v0, %rax, w3)
+	jz	L(ex)
+	jmp	L(lo2)
+
+L(bx1):	mov	(rp), X1
+	mov	8(rp), X0
+	mulx(	v0, %rax, w3)
+	mov	8(up), %rdx
+	add	%rax, X1
+	adc	$0, w3
+	xor	w0, w0
+	mov	X1, (rp)
+	mulx(	v0, %rax, w1)
+	test	$2, R8(un)
+	jz	L(b11)
+
+L(b01):	mov	16(rp), X1
+	lea	24(rp), rp
+	lea	24(up), up
+	jmp	L(lo1)
+
+L(b11):	lea	8(rp), rp
+	lea	8(up), up
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+L(lo2):	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	lea	32(rp), rp
+	add	w1, X1
+	mov	-16(up), %rdx
+	mov	X1, -24(rp)
+	adc	$0, w3
+	add	w2, X0
+	mov	-8(rp), X1
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo1):	add	%rax, X0
+	mulx(	v1, %rax, w2)
+	adc	$0, w1
+	add	w3, X0
+	mov	X0, -16(rp)
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	add	w0, X1
+	mov	-8(up), %rdx
+	adc	$0, w2
+L(lo0):	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mov	(rp), X0
+	mulx(	v1, %rax, w0)
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	mov	X1, -8(rp)
+	adc	$0, w3
+	mov	(up), %rdx
+	add	w2, X0
+	mulx(	v0, %rax, w1)
+	adc	$0, w0
+L(lo3):	add	%rax, X0
+	adc	$0, w1
+	mulx(	v1, %rax, w2)
+	add	w3, X0
+	mov	8(rp), X1
+	mov	X0, (rp)
+	mov	16(rp), X0
+	adc	$0, w1
+	add	%rax, X1
+	adc	$0, w2
+	mov	8(up), %rdx
+	lea	32(up), up
+	inc	n
+	jnz	L(top)
+
+L(end):	mulx(	v0, %rax, w3)
+	add	w0, X1
+	adc	$0, w2
+L(ex):	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rdx, %rax)
+	add	w1, X1
+	mov	X1, 8(rp)
+	adc	$0, w3
+	add	w2, %rdx
+	adc	$0, %rax
+	add	%rdx, w3
+	mov	w3, 16(rp)
+	adc	$0, %rax
+	mov	%rax, 24(rp)
+
+	jmp	L(outer)		C loop until a small corner remains
+
+L(corner):
+	pop	un
+	mov	(up), %rdx
+	jg	L(small_corner)
+
+	mov	%rdx, v1
+	mov	(rp), X0
+	mov	%rax, X1		C Tricky rax reuse of last iteration
+	mulx(	v0, %rax, w1)
+	add	%rax, X0
+	adc	$0, w1
+	mov	X0, (rp)
+	mov	8(up), %rdx
+	mulx(	v0, %rax, w3)
+	add	%rax, X1
+	adc	$0, w3
+	mulx(	v1, %rdx, %rax)
+	add	w1, X1
+	mov	X1, 8(rp)
+	adc	$0, w3
+	add	w3, %rdx
+	mov	%rdx, 16(rp)
+	adc	$0, %rax
+	mov	%rax, 24(rp)
+	lea	32(rp), rp
+	lea	16(up), up
+	jmp	L(com)
+
+L(small_corner):
+	mulx(	v0, X1, w3)
+	add	%rax, X1		C Tricky rax reuse of last iteration
+	adc	$0, w3
+	mov	X1, (rp)
+	mov	w3, 8(rp)
+	lea	16(rp), rp
+	lea	8(up), up
+
+L(com):
+
+L(sqr_diag_addlsh1):
+	lea	8(up,un,8), up		C put back up at its very beginning
+	lea	(rp,un,8), rp
+	lea	(rp,un,8), rp		C put back rp at its very beginning
+	inc	un
+
+	mov	-8(up), %rdx
+	xor	R32(%rbx), R32(%rbx)	C clear CF as side effect
+	mulx(	%rdx, %rax, %r10)
+	mov	%rax, 8(rp)
+	mov	16(rp), %r8
+	mov	24(rp), %r9
+	jmp	L(dm)
+
+	ALIGN(16)
+L(dtop):mov	32(rp), %r8
+	mov	40(rp), %r9
+	lea	16(rp), rp
+	lea	(%rdx,%rbx), %r10
+L(dm):	adc	%r8, %r8
+	adc	%r9, %r9
+	setc	R8(%rbx)
+	mov	(up), %rdx
+	lea	8(up), up
+	mulx(	%rdx, %rax, %rdx)
+	add	%r10, %r8
+	adc	%rax, %r9
+	mov	%r8, 16(rp)
+	mov	%r9, 24(rp)
+	inc	un
+	jnz	L(dtop)
+
+L(dend):adc	%rbx, %rdx
+	mov	%rdx, 32(rp)
+
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm
new file mode 100644
index 0000000..eed64e7
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/aorrlsh_n.asm
@@ -0,0 +1,200 @@
+dnl  AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
+dnl  AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
+dnl  Optimised for Nehalem.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 4.75
+C Intel P4	 ?
+C Intel core2	 2.8-3
+C Intel NHM	 2.8
+C Intel SBR	 3.55
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner loop probably runs close to optimally on Nehalem (using 4-way
+C unrolling).  The rest of the code is quite crude and could perhaps be made
+C both smaller and faster.
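+
+C The shifted operand is formed limb by limb with shrd and the negated shift
+C count (shrd by 64-cnt yields vp[] << cnt), while the carry/borrow flag is
+C preserved across iterations in %rbx via the usual sbb/add trick.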
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnt',	`%r8')
+define(`cy',	`%r9')			C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(IFRSB,	)
+	define(func_n,	mpn_addlsh_n)
+	define(func_nc,	mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(IFRSB,	`$1')
+	define(func_n,	mpn_rsblsh_n)
+	define(func_nc,	mpn_rsblsh_nc)')
+
+C mpn_rsblsh_nc removed below; its idea of carry-in is inconsistent with
+C refmpn_rsblsh_nc
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
+	push	%rbx
+	xor	R32(%rbx), R32(%rbx)	C clear CF save register
+L(ent):	push	%rbp
+	mov	R32(n), R32(%rbp)
+	mov	n, %rax
+
+	mov	R32(cnt), R32(%rcx)
+	neg	R32(%rcx)
+
+	lea	-8(up,%rax,8), up
+	lea	-8(vp,%rax,8), vp
+	lea	-40(rp,%rax,8), rp
+	neg	%rax
+
+	and	$3, R32(%rbp)
+	jz	L(b0)
+	cmp	$2, R32(%rbp)
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	xor	R32(%r9), R32(%r9)
+	mov	8(vp,%rax,8), %r10
+	mov	16(vp,%rax,8), %r11
+	shrd	%cl, %r10, %r9
+	shrd	%cl, %r11, %r10
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	8(up,%rax,8), %r9
+	mov	24(vp,%rax,8), %r8
+	ADCSBB	16(up,%rax,8), %r10
+	sbb	R32(%rbx), R32(%rbx)
+	add	$3, %rax
+	jmp	L(lo3)
+
+L(b0):	mov	8(vp,%rax,8), %r9
+	xor	R32(%r8), R32(%r8)
+	shrd	%cl, %r9, %r8
+	mov	16(vp,%rax,8), %r10
+	mov	24(vp,%rax,8), %r11
+	shrd	%cl, %r10, %r9
+	shrd	%cl, %r11, %r10
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	8(up,%rax,8), %r8
+	mov	%r8, 40(rp,%rax,8)	C offset 40
+	ADCSBB	16(up,%rax,8), %r9
+	mov	32(vp,%rax,8), %r8
+	ADCSBB	24(up,%rax,8), %r10
+	sbb	R32(%rbx), R32(%rbx)
+	add	$4, %rax
+	jmp	L(lo0)
+
+L(b1):	mov	8(vp,%rax,8), %r8
+	add	$1, %rax
+	jz	L(1)
+	mov	8(vp,%rax,8), %r9
+	xor	R32(%rbp), R32(%rbp)
+	jmp	L(lo1)
+L(1):	xor	R32(%r11), R32(%r11)
+	jmp	L(wd1)
+
+L(b2):	xor	%r10, %r10
+	mov	8(vp,%rax,8), %r11
+	shrd	%cl, %r11, %r10
+	add	R32(%rbx), R32(%rbx)
+	mov	16(vp,%rax,8), %r8
+	ADCSBB	8(up,%rax,8), %r10
+	sbb	R32(%rbx), R32(%rbx)
+	add	$2, %rax
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	mov	8(vp,%rax,8), %r9
+	mov	%r11, %rbp
+L(lo2):	mov	%r10, 24(rp,%rax,8)	C offset 24
+L(lo1):	shrd	%cl, %r8, %rbp
+	shrd	%cl, %r9, %r8
+	mov	16(vp,%rax,8), %r10
+	mov	24(vp,%rax,8), %r11
+	shrd	%cl, %r10, %r9
+	shrd	%cl, %r11, %r10
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	(up,%rax,8), %rbp
+	ADCSBB	8(up,%rax,8), %r8
+	mov	%r8, 40(rp,%rax,8)	C offset 40
+	ADCSBB	16(up,%rax,8), %r9
+	mov	32(vp,%rax,8), %r8
+	ADCSBB	24(up,%rax,8), %r10
+	sbb	R32(%rbx), R32(%rbx)
+	add	$4, %rax
+	mov	%rbp, (rp,%rax,8)	C offset 32
+L(lo0):
+L(lo3):	mov	%r9, 16(rp,%rax,8)	C offset 48
+	jnz	L(top)
+
+L(end):	mov	%r10, 24(rp,%rax,8)
+L(wd1):	shrd	%cl, %r8, %r11
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	(up,%rax,8), %r11
+	mov	%r11, 32(rp,%rax,8)	C offset 32
+	adc	R32(%rax), R32(%rax)	C rax is zero after loop
+	shr	R8(%rcx), %r8
+	ADDSUB	%r8, %rax
+IFRSB(	neg	%rax)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
+IFDOS(`	mov	64(%rsp), %r9	')	C cy
+	push	%rbx
+	neg	cy
+	sbb	R32(%rbx), R32(%rbx)	C initialise CF save register
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/coreinhm/aorsmul_1.asm
new file mode 100644
index 0000000..1be829f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/aorsmul_1.asm
@@ -0,0 +1,190 @@
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nehalem.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C AMD K8,K9      4.0
+C AMD K10        4.0
+C AMD bull       5.0
+C AMD pile       4.84    5.39
+C AMD steam
+C AMD excavator
+C AMD bobcat     5.56
+C AMD jaguar     5.30
+C Intel P4      15.7    17.2
+C Intel core2    5.15
+C Intel NHM      4.56
+C Intel SBR      3.44
+C Intel HWL      3.03
+C Intel BWL      2.77
+C Intel SKL      2.76
+C Intel atom    21
+C Intel SLM     11
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C N.B.: Be careful when editing: make sure the loop alignment padding does
+C not become large, as we currently fall into it.
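+
+C The 4-way unrolled loop alternates between two carry words (%r9 and %r11)
+C so each mul result can be folded into rp[] without serialising on a single
+C carry register.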
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+
+define(`n',       `%rbx')
+
+ifdef(`OPERATION_addmul_1',`
+  define(`ADDSUB', `add')
+  define(`func',   `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+  define(`ADDSUB', `sub')
+  define(`func',   `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	mov	(up), %rax
+	lea	-8(up,n_param,8), up
+	mov	(rp), %r8
+	lea	-8(rp,n_param,8), rp
+
+	test	$1, R8(n_param)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(n_param)
+	jnz	L(b10)
+
+L(b00):	mov	$3, R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	$0, R32(%r11)
+	mov	%r8, %r10
+	ADDSUB	%rax, %r10
+	mov	-8(up,n,8), %rax
+	adc	%rdx, %r11
+	jmp	L(lo0)
+
+L(b10):	mov	$1, R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	%r8, %r10
+	mov	$0, R32(%r11)
+	ADDSUB	%rax, %r10
+	mov	8(up,n,8), %rax
+	adc	%rdx, %r11
+	jmp	L(lo2)
+
+L(bx1):	test	$2, R8(n_param)
+	jz	L(b01)
+
+L(b11):	mov	$2, R32(n)
+	sub	n_param, n
+	mul	v0
+	ADDSUB	%rax, %r8
+	mov	$0, R32(%r9)
+	mov	(up,n,8), %rax
+	adc	%rdx, %r9
+	jmp	L(lo3)
+
+L(b01):	mov	$0, R32(n)
+	sub	n_param, n
+	xor	%r11, %r11
+	add	$4, n
+	jc	L(end)
+
+	ALIGN(32)
+L(top):	mul	v0
+	ADDSUB	%rax, %r8
+	mov	$0, R32(%r9)
+	mov	-16(up,n,8), %rax
+	adc	%rdx, %r9
+L(lo1):	mul	v0
+	ADDSUB	%r11, %r8
+	mov	$0, R32(%r11)
+	mov	-16(rp,n,8), %r10
+	adc	$0, %r9
+	ADDSUB	%rax, %r10
+	mov	-8(up,n,8), %rax
+	adc	%rdx, %r11
+	mov	%r8, -24(rp,n,8)
+	ADDSUB	%r9, %r10
+	adc	$0, %r11
+L(lo0):	mov	-8(rp,n,8), %r8
+	mul	v0
+	ADDSUB	%rax, %r8
+	mov	$0, R32(%r9)
+	mov	(up,n,8), %rax
+	adc	%rdx, %r9
+	mov	%r10, -16(rp,n,8)
+	ADDSUB	%r11, %r8
+	adc	$0, %r9
+L(lo3):	mul	v0
+	mov	(rp,n,8), %r10
+	mov	$0, R32(%r11)
+	ADDSUB	%rax, %r10
+	mov	8(up,n,8), %rax
+	adc	%rdx, %r11
+	mov	%r8, -8(rp,n,8)
+	ADDSUB	%r9, %r10
+	adc	$0, %r11
+L(lo2):	mov	8(rp,n,8), %r8
+	mov	%r10, (rp,n,8)
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v0
+	ADDSUB	%rax, %r8
+	mov	$0, R32(%rax)
+	adc	%rdx, %rax
+	ADDSUB	%r11, %r8
+	adc	$0, %rax
+	mov	%r8, (rp)
+
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/gmp-mparam.h b/third_party/gmp/mpn/x86_64/coreinhm/gmp-mparam.h
new file mode 100644
index 0000000..f56c128
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/gmp-mparam.h
@@ -0,0 +1,238 @@
+/* Nehalem gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2933-3200 MHz Intel Xeon X3470 Nehalem */
+/* FFT tuning limit = 468,424,931 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
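+
+/* Each *_THRESHOLD below is an operand size, in limbs, at which GMP's
+   algorithm selection switches to the next, asymptotically faster method
+   (e.g. from schoolbook multiplication to Toom-22 at
+   MUL_TOOM22_THRESHOLD).  */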
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        16
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              10
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           17
+
+#define DIV_1_VS_MUL_1_PERCENT             301
+
+#define MUL_TOOM22_THRESHOLD                18
+#define MUL_TOOM33_THRESHOLD                59
+#define MUL_TOOM44_THRESHOLD               169
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     104
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     101
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     147
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                 98
+#define SQR_TOOM4_THRESHOLD                250
+#define SQR_TOOM6_THRESHOLD                351
+#define SQR_TOOM8_THRESHOLD                478
+
+#define MULMID_TOOM42_THRESHOLD             28
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               13
+
+#define MUL_FFT_MODF_THRESHOLD             372  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    372, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     10, 5}, {     21, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31, 8}, {    511,10}, \
+    {    135,11}, {     79,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    319,12}, {     95,11}, {    191,10}, {    383,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,11}, {    319,10}, {    639,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    447,10}, {    895,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    607,12}, {    319,11}, \
+    {    639,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    543,11}, {   1087,12}, {    607,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1215,11}, {   2431,13}, \
+    {    639,12}, {   1279,13}, {    703,12}, {   1407,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    959,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1215,12}, \
+    {   2431,14}, {    639,13}, {   1343,12}, {   2687,13}, \
+    {   1407,12}, {   2815,13}, {   1471,14}, {    767,13}, \
+    {   1663,14}, {    895,13}, {   1791,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,15}, {    767,14}, {   1663,13}, {   3455,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,15}, \
+    {   1535,14}, {   3455,15}, {   1791,14}, {   3839,16}, \
+    {   1023,15}, {   2047,14}, {   4223,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,17}, {   1023,16}, \
+    {   2047,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,14}, {  15359,17}, {   2047,16}, {   4607,15}, \
+    {   9983,16}, {   5631,15}, {  11775,17}, {   3071,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 204
+#define MUL_FFT_THRESHOLD                 4224
+
+#define SQR_FFT_MODF_THRESHOLD             336  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    336, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     47,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255, 8}, {    511,10}, {    135,11}, \
+    {     79, 9}, {    319, 6}, {   2687, 7}, {   1407, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,11}, {    143,10}, \
+    {    287, 9}, {    575,10}, {    303, 9}, {    607,10}, \
+    {    319,12}, {     95,11}, {    191,10}, {    383,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,11}, {    319,10}, {    639,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    447,10}, {    895,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,12}, \
+    {    287,11}, {    607,12}, {    319,11}, {    671,12}, \
+    {    351,11}, {    703,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,12}, {    447,11}, \
+    {    895,12}, {    479,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    671,11}, {   1343,12}, {    703,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,13}, {    447,12}, \
+    {    959,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1343,13}, \
+    {    703,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,12}, {   2431,14}, \
+    {    639,13}, {   1343,12}, {   2687,13}, {   1407,12}, \
+    {   2815,13}, {   1471,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1535,13}, {   3071,14}, {   1663,13}, \
+    {   3455,14}, {   1919,16}, {    511,15}, {   1023,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,15}, {   1535,14}, {   3455,15}, {   1791,14}, \
+    {   3839,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2815,14}, {   5887,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,17}, \
+    {   1023,16}, {   2047,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6655,16}, \
+    {   3583,15}, {   7679,14}, {  15359,17}, {   2047,16}, \
+    {   4607,15}, {   9983,14}, {  19967,16}, {   5631,15}, \
+    {  11775,17}, {   3071,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 218
+#define SQR_FFT_THRESHOLD                 3520
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  49
+#define MULLO_MUL_N_THRESHOLD             8397
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  11
+#define SQRLO_SQR_THRESHOLD               7035
+
+#define DC_DIV_QR_THRESHOLD                 47
+#define DC_DIVAPPR_Q_THRESHOLD             151
+#define DC_BDIV_QR_THRESHOLD                40
+#define DC_BDIV_Q_THRESHOLD                 30
+
+#define INV_MULMOD_BNM1_THRESHOLD           34
+#define INV_NEWTON_THRESHOLD               199
+#define INV_APPR_THRESHOLD                 157
+
+#define BINV_NEWTON_THRESHOLD              254
+#define REDC_1_TO_REDC_N_THRESHOLD          48
+
+#define MU_DIV_QR_THRESHOLD               1334
+#define MU_DIVAPPR_Q_THRESHOLD            1334
+#define MUPI_DIV_QR_THRESHOLD               83
+#define MU_BDIV_QR_THRESHOLD              1142
+#define MU_BDIV_Q_THRESHOLD               1308
+
+#define POWM_SEC_TABLE  1,64,66,452,1486
+
+#define GET_STR_DC_THRESHOLD                11
+#define GET_STR_PRECOMPUTE_THRESHOLD        18
+#define SET_STR_DC_THRESHOLD               141
+#define SET_STR_PRECOMPUTE_THRESHOLD      1023
+
+#define FAC_DSC_THRESHOLD                  182
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    5  /* 2.91% faster than 3 */
+#define HGCD_THRESHOLD                     116
+#define HGCD_APPR_THRESHOLD                164
+#define HGCD_REDUCE_THRESHOLD             2205
+#define GCD_DC_THRESHOLD                   321
+#define GCDEXT_DC_THRESHOLD                358
+#define JACOBI_BASE_METHOD                   4  /* 0.12% faster than 1 */
+
+/* Tuneup completed successfully, took 452116 seconds */
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/hamdist.asm b/third_party/gmp/mpn/x86_64/coreinhm/hamdist.asm
new file mode 100644
index 0000000..a5a63e4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/hamdist.asm
@@ -0,0 +1,196 @@
+dnl  AMD64 mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		    cycles/limb
+C AMD K8,K9		 n/a
+C AMD K10		 3.26
+C AMD bd1		 4.2
+C AMD bd2		 4.2
+C AMD bd3		 ?
+C AMD bd4		 ?
+C AMD zen		 1.15
+C AMD bobcat		 7.29
+C AMD jaguar		 2.53
+C Intel P4		 n/a
+C Intel core2		 n/a
+C Intel NHM		 2.03
+C Intel SBR		 1.66
+C Intel IBR		 1.62
+C Intel HWL		 1.50
+C Intel BWL		 1.50
+C Intel SKL		 1.50
+C Intel atom		 n/a
+C Intel SLM		 2.55
+C VIA nano		 n/a
+
+C TODO
+C  * An AVX pshufb based variant should approach 0.5 c/l on Haswell and later
+C    Intel hardware.  Perhaps mix such a loop with popcnt instructions.
+C  * The random placement of the L0, L1, L2, etc blocks is due to branch
+C    shortening.  More work could be done there.
+C  * Combine the accumulators rax and rcx into one register to save some
+C    bookkeeping and a push/pop pair.  Unfortunately this causes a slight
+C    slowdown for at least NHM and SBR.
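+
+C The popcnt instructions below are emitted as raw .byte sequences,
+C presumably so the file assembles even with assemblers that predate popcnt;
+C each sequence carries the intended instruction in a comment.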
+
+define(`up',		`%rdi')
+define(`vp',		`%rsi')
+define(`n',		`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`sum', `lea	($1,$2), $2')
+define(`sum', `add	$1, $2')
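+C With m4 the later definition wins, so `sum' expands to add; the lea form
+C above is apparently retained only as a drop-in alternative.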
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)
+	FUNC_ENTRY(3)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %r10
+	xor	(vp), %r10
+
+	mov	R32(n), R32(%r8)
+	and	$3, R32(%r8)
+
+	xor	R32(%rcx), R32(%rcx)
+	.byte	0xf3,0x49,0x0f,0xb8,0xc2	C popcnt %r10,%rax
+
+	lea	L(tab)(%rip), %r9
+ifdef(`PIC',`
+	movslq	(%r9,%r8,4), %r8
+	add	%r9, %r8
+	jmp	*%r8
+',`
+	jmp	*(%r9,%r8,8)
+')
+
+L(3):	mov	8(up), %r10
+	mov	16(up), %r11
+	xor	8(vp), %r10
+	xor	16(vp), %r11
+	xor	R32(%rbp), R32(%rbp)
+	sub	$4, n
+	jle	L(x3)
+	mov	24(up), %r8
+	mov	32(up), %r9
+	add	$24, up
+	add	$24, vp
+	jmp	L(e3)
+
+L(0):	mov	8(up), %r9
+	xor	8(vp), %r9
+	mov	16(up), %r10
+	mov	24(up), %r11
+	xor	R32(%rbx), R32(%rbx)
+	xor	16(vp), %r10
+	xor	24(vp), %r11
+	add	$32, up
+	add	$32, vp
+	sub	$4, n
+	jle	L(x4)
+
+	ALIGN(16)
+L(top):
+L(e0):	.byte	0xf3,0x49,0x0f,0xb8,0xe9	C popcnt %r9,%rbp
+	mov	(up), %r8
+	mov	8(up), %r9
+	sum(	%rbx, %rax)
+L(e3):	.byte	0xf3,0x49,0x0f,0xb8,0xda	C popcnt %r10,%rbx
+	xor	(vp), %r8
+	xor	8(vp), %r9
+	sum(	%rbp, %rcx)
+L(e2):	.byte	0xf3,0x49,0x0f,0xb8,0xeb	C popcnt %r11,%rbp
+	mov	16(up), %r10
+	mov	24(up), %r11
+	add	$32, up
+	sum(	%rbx, %rax)
+L(e1):	.byte	0xf3,0x49,0x0f,0xb8,0xd8	C popcnt %r8,%rbx
+	xor	16(vp), %r10
+	xor	24(vp), %r11
+	add	$32, vp
+	sum(	%rbp, %rcx)
+	sub	$4, n
+	jg	L(top)
+
+L(x4):	.byte	0xf3,0x49,0x0f,0xb8,0xe9	C popcnt %r9,%rbp
+	sum(	%rbx, %rax)
+L(x3):	.byte	0xf3,0x49,0x0f,0xb8,0xda	C popcnt %r10,%rbx
+	sum(	%rbp, %rcx)
+	.byte	0xf3,0x49,0x0f,0xb8,0xeb	C popcnt %r11,%rbp
+	sum(	%rbx, %rax)
+	sum(	%rbp, %rcx)
+L(x2):	add	%rcx, %rax
+L(x1):	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(2):	mov	8(up), %r11
+	xor	8(vp), %r11
+	sub	$2, n
+	jle	L(n2)
+	mov	16(up), %r8
+	mov	24(up), %r9
+	xor	R32(%rbx), R32(%rbx)
+	xor	16(vp), %r8
+	xor	24(vp), %r9
+	add	$16, up
+	add	$16, vp
+	jmp	L(e2)
+L(n2):	.byte	0xf3,0x49,0x0f,0xb8,0xcb	C popcnt %r11,%rcx
+	jmp	L(x2)
+
+L(1):	dec	n
+	jle	L(x1)
+	mov	8(up), %r8
+	mov	16(up), %r9
+	xor	8(vp), %r8
+	xor	16(vp), %r9
+	xor	R32(%rbp), R32(%rbp)
+	mov	24(up), %r10
+	mov	32(up), %r11
+	add	$40, up
+	add	$8, vp
+	jmp	L(e1)
+
+EPILOGUE()
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/popcount.asm b/third_party/gmp/mpn/x86_64/coreinhm/popcount.asm
new file mode 100644
index 0000000..0a3c867
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/popcount.asm
@@ -0,0 +1,182 @@
+dnl  AMD64 mpn_popcount -- population count.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C		    cycles/limb
+C AMD K8,K9		 n/a
+C AMD K10		 1.39
+C AMD bd1		 4
+C AMD bd2		 4
+C AMD bd3		 ?
+C AMD bd4		 ?
+C AMD zen		 0.72
+C AMD bobcat		 5.78
+C AMD jaguar		 1.27
+C Intel P4		 n/a
+C Intel core2		 n/a
+C Intel NHM		 1.04
+C Intel SBR		 1.02
+C Intel IBR		 1.0
+C Intel HWL		 1.0
+C Intel BWL		 1.0
+C Intel SKL		 1.0
+C Intel atom		 n/a
+C Intel SLM		 1.34
+C VIA nano		 n/a
+
+C TODO
+C  * We could approach 0.5 c/l for AMD Zen with more unrolling.  That would
+C    not cause any additional feed-in overhead as we already use a jump table.
+C  * An AVX pshufb-based variant should approach 0.5 c/l on Haswell and later
+C    Intel hardware.  Perhaps mix such a loop with popcnt instructions.
+C  * The random placement of the L0, L1, L2, etc blocks is due to branch
+C    shortening.
+
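+C As a sketch of the pshufb idea from the TODO above (SSSE3 intrinsics shown,
+C an AVX2 version is analogous; byte_popcount is a hypothetical helper, not
+C part of GMP):
+C
+C	#include <immintrin.h>
+C	/* Per-byte population count of a 16-byte vector via a nibble LUT. */
+C	static __m128i byte_popcount (__m128i v)
+C	{
+C	  const __m128i lut = _mm_setr_epi8 (0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4);
+C	  const __m128i m4  = _mm_set1_epi8 (0x0f);
+C	  __m128i lo = _mm_and_si128 (v, m4);                      /* low nibbles  */
+C	  __m128i hi = _mm_and_si128 (_mm_srli_epi16 (v, 4), m4);  /* high nibbles */
+C	  return _mm_add_epi8 (_mm_shuffle_epi8 (lut, lo),
+C	                       _mm_shuffle_epi8 (lut, hi));
+C	}
+C
+C The byte counts would then be summed with _mm_sad_epu8 against zero.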
+define(`up',		`%rdi')
+define(`n',		`%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+	FUNC_ENTRY(2)
+
+	mov	R32(n), R32(%r8)
+	and	$7, R32(%r8)
+
+	.byte	0xf3,0x48,0x0f,0xb8,0x07	C popcnt (up), %rax
+	xor	R32(%rcx), R32(%rcx)
+
+	lea	L(tab)(%rip), %r9
+ifdef(`PIC',`
+	movslq	(%r9,%r8,4), %r8
+	add	%r9, %r8
+	jmp	*%r8
+',`
+	jmp	*(%r9,%r8,8)
+')
+
+L(3):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x08	C popcnt 8(up), %r10
+	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x10	C popcnt 16(up), %r11
+	add	$24, up
+	sub	$8, n
+	jg	L(e34)
+	add	%r10, %rax
+	add	%r11, %rax
+L(s1):	FUNC_EXIT()
+	ret
+
+L(1):	sub	$8, n
+	jle	L(s1)
+	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x08	C popcnt 8(up), %r8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x10	C popcnt 16(up), %r9
+	add	$8, up
+	jmp	L(e12)
+
+L(7):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x08	C popcnt 0x8(%rdi),%r10
+	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x10	C popcnt 0x10(%rdi),%r11
+	add	$-8, up
+	jmp	L(e07)
+
+L(0):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
+	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
+	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
+	jmp	L(e07)
+
+L(4):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
+	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
+	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
+	add	$32, up
+	sub	$8, n
+	jle	L(x4)
+
+	ALIGN(16)
+L(top):
+L(e34):	.byte	0xf3,0x4c,0x0f,0xb8,0x07	C popcnt (%rdi),%r8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%r9
+	add	%r10, %rcx
+	add	%r11, %rax
+L(e12):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x10	C popcnt 0x10(%rdi),%r10
+	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x18	C popcnt 0x18(%rdi),%r11
+	add	%r8, %rcx
+	add	%r9, %rax
+L(e07):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x20	C popcnt 0x20(%rdi),%r8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x28	C popcnt 0x28(%rdi),%r9
+	add	%r10, %rcx
+	add	%r11, %rax
+L(e56):	.byte	0xf3,0x4c,0x0f,0xb8,0x57,0x30	C popcnt 0x30(%rdi),%r10
+	.byte	0xf3,0x4c,0x0f,0xb8,0x5f,0x38	C popcnt 0x38(%rdi),%r11
+	add	$64, up
+	add	%r8, %rcx
+	add	%r9, %rax
+	sub	$8, n
+	jg	L(top)
+
+L(x4):	add	%r10, %rcx
+	add	%r11, %rax
+L(x2):	add	%rcx, %rax
+
+	FUNC_EXIT()
+	ret
+
+L(2):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
+	sub	$8, n
+	jle	L(x2)
+	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x10	C popcnt 0x10(%rdi),%r8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x18	C popcnt 0x18(%rdi),%r9
+	add	$16, up
+	jmp	L(e12)
+
+L(5):	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x08	C popcnt 0x8(%rdi),%r8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x10	C popcnt 0x10(%rdi),%r9
+	add	$-24, up
+	jmp	L(e56)
+
+L(6):	.byte	0xf3,0x48,0x0f,0xb8,0x4f,0x08	C popcnt 0x8(%rdi),%rcx
+	.byte	0xf3,0x4c,0x0f,0xb8,0x47,0x10	C popcnt 0x10(%rdi),%r8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4f,0x18	C popcnt 0x18(%rdi),%r9
+	add	$-16, up
+	jmp	L(e56)
+EPILOGUE()
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(4), L(tab))
+	JMPENT(	L(5), L(tab))
+	JMPENT(	L(6), L(tab))
+	JMPENT(	L(7), L(tab))
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/redc_1.asm b/third_party/gmp/mpn/x86_64/coreinhm/redc_1.asm
new file mode 100644
index 0000000..fc71c1b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/redc_1.asm
@@ -0,0 +1,549 @@
+dnl  X86-64 mpn_redc_1 optimised for Intel Nehalem and Westmere.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 ?
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Micro-optimise; none performed thus far.
+C  * Consider inlining mpn_add_n.
+C  * Single out the base cases before the register pushes.
+
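+C For orientation, mpn_redc_1 performs Montgomery reduction (REDC): it
+C computes {up,2n} * 2^(-64n) mod {mp,n}, given u0inv = -1/mp[0] mod 2^64;
+C the caller does the final conditional subtraction using the returned carry.
+C A C sketch along the lines of GMP's generic version:
+C
+C	mp_limb_t
+C	redc_1_ref (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n,
+C		    mp_limb_t u0inv)
+C	{
+C	  for (mp_size_t j = n; j > 0; j--)
+C	    {
+C	      mp_limb_t q0 = up[0] * u0inv;                 /* next quotient limb */
+C	      mp_limb_t cy = mpn_addmul_1 (up, mp, n, q0);  /* zeroes up[0] */
+C	      up[0] = cy;        /* park the carry in the dead low limb */
+C	      up++;
+C	    }
+C	  /* add the n parked carries to the high half, return the carry out */
+C	  return mpn_add_n (rp, up, up - n, n);
+C	}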
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
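+C E.g. I(-8(up),-24(up,i,8)) expands to -8(up) as defined, and to the
+C conservative -24(up,i,8) form when I is redefined to pick its second
+C argument.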
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r14')
+define(`j',           `%r15')
+define(`mp',          `%r12')
+define(`q0',          `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), q0
+	mov	n, j			C outer loop induction var
+	lea	(mp_param,n,8), mp
+	lea	(up,n,8), up
+	neg	n
+	imul	u0inv, q0		C first iteration q0
+
+	test	$1, R8(n)
+	jz	L(bx0)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b3)
+
+L(b1):	cmp	$-1, R32(n)
+	jz	L(n1)
+
+L(otp1):lea	3(n), i
+	mov	(mp,n,8), %rax
+	mov	(up,n,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	8(mp,n,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	mov	$0, R32(%r11)
+	mov	8(up,n,8), %rbx
+	add	%rax, %rbx
+	mov	16(mp,n,8), %rax
+	adc	%rdx, %r11
+	add	%r9, %rbx
+	adc	$0, %r11
+	mov	16(up,n,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	24(mp,n,8), %rax
+	adc	%rdx, %r9
+	mov	%rbx, 8(up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e1)
+
+	ALIGNx
+L(tp1):	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	-16(mp,i,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	%r11, %rbp
+	mov	$0, R32(%r11)
+	mov	-16(up,i,8), %r10
+	adc	$0, %r9
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -24(up,i,8)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	-8(up,i,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r9
+	mov	%r10, -16(up,i,8)
+L(e1):	add	%r11, %rbp
+	adc	$0, %r9
+	mul	q0
+	mov	(up,i,8), %r10
+	mov	$0, R32(%r11)
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -8(up,i,8)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	8(up,i,8), %rbp
+	mov	%r10, (up,i,8)
+	add	$4, i
+	jnc	L(tp1)
+
+L(ed1):	mul	q0
+	add	%rax, %rbp
+	adc	$0, %rdx
+	add	%r11, %rbp
+	adc	$0, %rdx
+	mov	%rbp, I(-8(up),-24(up,i,8))
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp1)
+	jmp	L(cj)
+
+L(b3):	cmp	$-3, R32(n)
+	jz	L(n3)
+
+L(otp3):lea	5(n), i
+	mov	(mp,n,8), %rax
+	mov	(up,n,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	8(mp,n,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	mov	8(up,n,8), %rbx
+	mov	$0, R32(%r11)
+	add	%rax, %rbx
+	mov	16(mp,n,8), %rax
+	adc	%rdx, %r11
+	add	%r9, %rbx
+	adc	$0, %r11
+	mov	16(up,n,8), %rbp
+	mov	%rbx, 8(up,n,8)
+	imul	u0inv, %rbx		C next q limb
+C	jmp	L(tp3)
+
+	ALIGNx
+L(tp3):	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	-16(mp,i,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	%r11, %rbp
+	mov	$0, R32(%r11)
+	mov	-16(up,i,8), %r10
+	adc	$0, %r9
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -24(up,i,8)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	-8(up,i,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r9
+	mov	%r10, -16(up,i,8)
+	add	%r11, %rbp
+	adc	$0, %r9
+	mul	q0
+	mov	(up,i,8), %r10
+	mov	$0, R32(%r11)
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -8(up,i,8)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	8(up,i,8), %rbp
+	mov	%r10, (up,i,8)
+	add	$4, i
+	jnc	L(tp3)
+
+L(ed3):	mul	q0
+	add	%rax, %rbp
+	adc	$0, %rdx
+	add	%r11, %rbp
+	adc	$0, %rdx
+	mov	%rbp, I(-8(up),-24(up,i,8))
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp3)
+C	jmp	L(cj)
+
+L(cj):
+IFSTD(`	lea	(up,n,8), up		C param 2: up
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	lea	(up,n,8), %rdx		C param 2: up
+	lea	(%rdx,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	rp, %rcx	')	C param 1: rp
+
+IFSTD(`	sub	$8, %rsp	')
+IFDOS(`	sub	$40, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b2)
+
+L(b0):
+L(otp0):lea	2(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	$0, R32(%r11)
+	mov	(up,n,8), %r10
+	add	%rax, %r10
+	mov	8(mp,n,8), %rax
+	adc	%rdx, %r11
+	mov	8(up,n,8), %rbx
+	mul	q0
+	add	%rax, %rbx
+	mov	$0, R32(%r9)
+	mov	16(mp,n,8), %rax
+	adc	%rdx, %r9
+	add	%r11, %rbx
+	adc	$0, %r9
+	mul	q0
+	mov	16(up,n,8), %r10
+	mov	$0, R32(%r11)
+	add	%rax, %r10
+	mov	24(mp,n,8), %rax
+	adc	%rdx, %r11
+	mov	%rbx, 8(up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e0)
+
+	ALIGNx
+L(tp0):	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	-16(mp,i,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	%r11, %rbp
+	mov	$0, R32(%r11)
+	mov	-16(up,i,8), %r10
+	adc	$0, %r9
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -24(up,i,8)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	-8(up,i,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r9
+	mov	%r10, -16(up,i,8)
+	add	%r11, %rbp
+	adc	$0, %r9
+	mul	q0
+	mov	(up,i,8), %r10
+	mov	$0, R32(%r11)
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -8(up,i,8)
+L(e0):	add	%r9, %r10
+	adc	$0, %r11
+	mov	8(up,i,8), %rbp
+	mov	%r10, (up,i,8)
+	add	$4, i
+	jnc	L(tp0)
+
+L(ed0):	mul	q0
+	add	%rax, %rbp
+	adc	$0, %rdx
+	add	%r11, %rbp
+	adc	$0, %rdx
+	mov	%rbp, I(-8(up),-24(up,i,8))
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp0)
+	jmp	L(cj)
+
+L(b2):	cmp	$-2, R32(n)
+	jz	L(n2)
+
+L(otp2):lea	4(n), i
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	(up,n,8), %r10
+	mov	$0, R32(%r11)
+	add	%rax, %r10
+	mov	8(mp,n,8), %rax
+	adc	%rdx, %r11
+	mov	8(up,n,8), %rbx
+	mul	q0
+	add	%rax, %rbx
+	mov	$0, R32(%r9)
+	mov	16(mp,n,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	%r11, %rbx
+	mov	$0, R32(%r11)
+	mov	16(up,n,8), %r10
+	adc	$0, %r9
+	add	%rax, %r10
+	mov	24(mp,n,8), %rax
+	adc	%rdx, %r11
+	mov	%rbx, 8(up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e2)
+
+	ALIGNx
+L(tp2):	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	-16(mp,i,8), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	%r11, %rbp
+	mov	$0, R32(%r11)
+	mov	-16(up,i,8), %r10
+	adc	$0, %r9
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -24(up,i,8)
+L(e2):	add	%r9, %r10
+	adc	$0, %r11
+	mov	-8(up,i,8), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	$0, R32(%r9)
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r9
+	mov	%r10, -16(up,i,8)
+	add	%r11, %rbp
+	adc	$0, %r9
+	mul	q0
+	mov	(up,i,8), %r10
+	mov	$0, R32(%r11)
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	%rdx, %r11
+	mov	%rbp, -8(up,i,8)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	8(up,i,8), %rbp
+	mov	%r10, (up,i,8)
+	add	$4, i
+	jnc	L(tp2)
+
+L(ed2):	mul	q0
+	add	%rax, %rbp
+	adc	$0, %rdx
+	add	%r11, %rbp
+	adc	$0, %rdx
+	mov	%rbp, I(-8(up),-24(up,i,8))
+	mov	%rdx, (up,n,8)		C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp2)
+	jmp	L(cj)
+
+L(n1):	mov	(mp_param), %rax
+	mul	q0
+	add	-8(up), %rax
+	adc	(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n2):	mov	(mp_param), %rax
+	mov	-16(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	-8(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, q0
+	imul	u0inv, q0		C next q0
+	mov	-16(mp), %rax
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	mov	(up), %r14
+	mul	q0
+	add	%rax, %r14
+	adc	$0, %rdx
+	add	%r9, %r14
+	adc	$0, %rdx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	8(up), %rdx
+	mov	%r14, (rp)
+	mov	%rdx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n3):	mov	-24(mp), %rax
+	mov	-24(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	-16(mp), %rax
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	-16(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-8(mp), %rax
+	add	%r11, %rbp
+	mov	-8(up), %r10
+	adc	$0, %r9
+	mul	q0
+	mov	%rbp, q0
+	imul	u0inv, q0		C next q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	%rbp, -16(up)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, -8(up)
+	mov	%r11, -24(up)		C up[0]
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n3)
+
+	mov	-48(up), %rdx
+	mov	-40(up), %rbx
+	xor	R32(%rax), R32(%rax)
+	add	%rbp, %rdx
+	adc	%r10, %rbx
+	adc	-8(up), %r11
+	mov	%rdx, (rp)
+	mov	%rbx, 8(rp)
+	mov	%r11, 16(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/coreinhm/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/coreinhm/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreinhm/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sec_tabselect.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/addmul_2.asm b/third_party/gmp/mpn/x86_64/coreisbr/addmul_2.asm
new file mode 100644
index 0000000..21f0bf4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/addmul_2.asm
@@ -0,0 +1,224 @@
+dnl  AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb	best
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR	 2.93		this
+C Intel IBR	 2.66		this
+C Intel HWL	 2.5		 2.15
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C This code is the result of running a code generation and optimisation tool
+C suite written by David Harvey and Torbjorn Granlund.
+
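+C Functionally, mpn_addmul_2 adds {up,n} times the two-limb multiplier {vp,2}
+C into {rp,n+1} and returns the carry limb.  A C sketch in terms of simpler
+C mpn primitives (addmul_2_ref is a hypothetical helper, not part of GMP):
+C
+C	mp_limb_t
+C	addmul_2_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
+C	{
+C	  mp_limb_t c0 = mpn_addmul_1 (rp, up, n, vp[0]);    /* up * vp[0] */
+C	  mp_limb_t cy = mpn_add_1 (rp + n, rp + n, 1, c0);  /* fold c0 into rp[n] */
+C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]) + cy;   /* up * vp[1], shifted */
+C	}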
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`n',	  `%rcx')
+define(`v0',      `%rbx')
+define(`v1',      `%rbp')
+define(`w0',      `%r8')
+define(`w1',      `%r9')
+define(`w2',      `%r10')
+define(`w3',      `%r11')
+define(`X0',      `%r12')
+define(`X1',      `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_addmul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	(up), %rax
+
+	mov	n_param, n
+	neg	n
+
+	lea	(up,n_param,8), up
+	lea	8(rp,n_param,8), rp
+	mul	v0
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	mov	-8(rp,n,8), X0
+	mov	%rdx, w1
+	add	%rax, X0
+	adc	$0, w1
+	mov	(up,n,8), %rax
+	xor	w0, w0
+	xor	w3, w3
+	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	nop				C this nop makes the loop go faster on SBR!
+	mul	v1
+	mov	(rp,n,8), X1
+	jmp	L(lo0)
+
+L(b10):	lea	-2(n), n
+	jmp	L(lo2)
+
+L(bx1):	mov	-8(rp,n,8), X1
+	mov	%rdx, w3
+	add	%rax, X1
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	xor	w1, w1
+	xor	w2, w2
+	test	$2, R8(n)
+	jz	L(b11)
+
+L(b01):	mov	(rp,n,8), X0
+	inc	n
+	jmp	L(lo1)
+
+L(b11):	dec	n
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):
+L(lo1):	mul	v1
+	mov	%rdx, w0		C 1
+	add	%rax, X0		C 0
+	adc	$0, w0			C 1
+	add	w1, X1			C 3
+	adc	$0, w3			C 0
+	add	w2, X0			C 0
+	adc	$0, w0			C 1
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, X0		C 0
+	mov	%rdx, w1		C 1
+	adc	$0, w1			C 1
+	mov	(up,n,8), %rax
+	mul	v1
+	mov	X1, -16(rp,n,8)		C 3
+	mov	(rp,n,8), X1		C 1
+	add	w3, X0			C 0
+	adc	$0, w1			C 1
+L(lo0):	mov	%rdx, w2		C 2
+	mov	X0, -8(rp,n,8)		C 0
+	add	%rax, X1		C 1
+	adc	$0, w2			C 2
+	mov	8(up,n,8), %rax
+	add	w0, X1			C 1
+	adc	$0, w2			C 2
+	mul	v0
+	add	%rax, X1		C 1
+	mov	%rdx, w3		C 2
+	adc	$0, w3			C 2
+	mov	8(up,n,8), %rax
+L(lo3):	mul	v1
+	add	w1, X1			C 1
+	mov	8(rp,n,8), X0		C 2
+	adc	$0, w3			C 2
+	mov	%rdx, w0		C 3
+	add	%rax, X0		C 2
+	adc	$0, w0			C 3
+	mov	16(up,n,8), %rax
+	mul	v0
+	add	w2, X0			C 2
+	mov	X1, (rp,n,8)		C 1
+	mov	%rdx, w1		C 3
+	adc	$0, w0			C 3
+	add	%rax, X0		C 2
+	adc	$0, w1			C 3
+	mov	16(up,n,8), %rax
+	add	w3, X0			C 2
+	adc	$0, w1			C 3
+L(lo2):	mul	v1
+	mov	16(rp,n,8), X1		C 3
+	add	%rax, X1		C 3
+	mov	%rdx, w2		C 4
+	adc	$0, w2			C 4
+	mov	24(up,n,8), %rax
+	mov	X0, 8(rp,n,8)		C 2
+	mul	v0
+	add	w0, X1			C 3
+	mov	%rdx, w3		C 4
+	adc	$0, w2			C 4
+	add	%rax, X1		C 3
+	mov	24(up,n,8), %rax
+	mov	24(rp,n,8), X0		C 0	useless but harmless final read
+	adc	$0, w3			C 4
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v1
+	add	w1, X1
+	adc	$0, w3
+	add	w2, %rax
+	adc	$0, %rdx
+	mov	X1, I(-16(rp),-16(rp,n,8))
+	add	w3, %rax
+	adc	$0, %rdx
+	mov	%rax, I(-8(rp),-8(rp,n,8))
+	mov	%rdx, %rax
+
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm
new file mode 100644
index 0000000..2319a80
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh1_n.asm
@@ -0,0 +1,54 @@
+dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
+dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func_n,	mpn_addlsh1_n)
+	define(func_nc,	mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func_n,	mpn_rsblsh1_n)
+	define(func_nc,	mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm
new file mode 100644
index 0000000..3b7bb22
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh2_n.asm
@@ -0,0 +1,56 @@
+dnl  AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl  AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func_n,	mpn_addlsh2_n)
+	define(func_nc,	mpn_addlsh2_nc)')
+ifdef(`OPERATION_rsblsh2_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func_n,	mpn_rsblsh2_n)
+	define(func_nc,	mpn_rsblsh2_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_rsblsh2_nc is omitted from the list below; its idea of carry-in is
+C inconsistent with refmpn_rsblsh2_nc.
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_addlsh2_nc mpn_rsblsh2_n)
+include_mpn(`x86_64/coreisbr/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm
new file mode 100644
index 0000000..23ace41
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/aorrlshC_n.asm
@@ -0,0 +1,173 @@
+dnl  AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
+dnl  AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
+
+dnl  Copyright 2009-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C Intel P4	 ?
+C Intel core2	 3.25
+C Intel NHM	 4
+C Intel SBR	 2	(or 1.95 when L(top)'s alignment = 16 (mod 32))
+C Intel atom	 ?
+C VIA nano	 ?
+
+C This code probably runs close to optimally on Sandy Bridge (using 4-way
+C unrolling).  It also runs reasonably well on Core 2, but it runs poorly on
+C all other processors, including Nehalem.
+
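+C In C terms, the operation (for the fixed shift LSH, with RSH = 64 - LSH) is
+C as follows (addlshC_ref is a hypothetical helper, not part of GMP; assumes
+C 64-bit limbs and unsigned __int128):
+C
+C	mp_limb_t
+C	addlshC_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+C	{
+C	  mp_limb_t out = 0, cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t s = (vp[i] << LSH) | out;  /* low part of vp[i] << LSH */
+C	      out = vp[i] >> RSH;                  /* bits moving up one limb */
+C	      unsigned __int128 t = (unsigned __int128) up[i] + s + cy;
+C	      rp[i] = (mp_limb_t) t;
+C	      cy = (mp_limb_t) (t >> 64);
+C	    }
+C	  return out + cy;                         /* cannot overflow a limb */
+C	}
+C
+C The rsblsh variant instead computes (vp[] << LSH) - up[], with borrows.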
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cy',	`%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbp
+	mov	cy, %rax
+	neg	%rax			C set msb on carry
+	xor	R32(%rbp), R32(%rbp)	C limb carry
+	mov	(vp), %r8
+	shrd	$RSH, %r8, %rbp
+	mov	R32(n), R32(%r9)
+	and	$3, R32(%r9)
+	je	L(b00)
+	cmp	$2, R32(%r9)
+	jc	L(b01)
+	je	L(b10)
+	jmp	L(b11)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbp
+	xor	R32(%rbp), R32(%rbp)	C limb carry
+	mov	(vp), %r8
+	shrd	$RSH, %r8, %rbp
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	mov	8(vp), %r9
+	shrd	$RSH, %r9, %r8
+	mov	16(vp), %r10
+	shrd	$RSH, %r10, %r9
+	add	R32(%rax), R32(%rax)	C init carry flag
+	ADCSBB	(up), %rbp
+	ADCSBB	8(up), %r8
+	ADCSBB	16(up), %r9
+	mov	%rbp, (rp)
+	mov	%r8, 8(rp)
+	mov	%r9, 16(rp)
+	mov	%r10, %rbp
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	sbb	R32(%rax), R32(%rax)	C save carry flag
+	sub	$3, n
+	ja	L(top)
+	jmp	L(end)
+
+L(b01):	add	R32(%rax), R32(%rax)	C init carry flag
+	ADCSBB	(up), %rbp
+	mov	%rbp, (rp)
+	mov	%r8, %rbp
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	sbb	R32(%rax), R32(%rax)	C save carry flag
+	sub	$1, n
+	ja	L(top)
+	jmp	L(end)
+
+L(b10):	mov	8(vp), %r9
+	shrd	$RSH, %r9, %r8
+	add	R32(%rax), R32(%rax)	C init carry flag
+	ADCSBB	(up), %rbp
+	ADCSBB	8(up), %r8
+	mov	%rbp, (rp)
+	mov	%r8, 8(rp)
+	mov	%r9, %rbp
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	sbb	R32(%rax), R32(%rax)	C save carry flag
+	sub	$2, n
+	ja	L(top)
+	jmp	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp), %r8
+	shrd	$RSH, %r8, %rbp
+L(b00):	mov	8(vp), %r9
+	shrd	$RSH, %r9, %r8
+	mov	16(vp), %r10
+	shrd	$RSH, %r10, %r9
+	mov	24(vp), %r11
+	shrd	$RSH, %r11, %r10
+	lea	32(vp), vp
+	add	R32(%rax), R32(%rax)	C restore carry flag
+	ADCSBB	(up), %rbp
+	ADCSBB	8(up), %r8
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	lea	32(up), up
+	mov	%rbp, (rp)
+	mov	%r8, 8(rp)
+	mov	%r9, 16(rp)
+	mov	%r10, 24(rp)
+	mov	%r11, %rbp
+	lea	32(rp), rp
+	sbb	R32(%rax), R32(%rax)	C save carry flag
+	sub	$4, n
+	jnz	L(top)
+
+L(end):	shr	$RSH, %rbp
+	add	R32(%rax), R32(%rax)	C restore carry flag
+	ADCSBB	$0, %rbp
+	mov	%rbp, %rax
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm
new file mode 100644
index 0000000..db8ee68
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/aorrlsh_n.asm
@@ -0,0 +1,215 @@
+dnl  AMD64 mpn_addlsh_n -- rp[] = up[] + (vp[] << k)
+dnl  AMD64 mpn_rsblsh_n -- rp[] = (vp[] << k) - up[]
+dnl  Optimised for Sandy Bridge.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 5.25
+C Intel P4	 ?
+C Intel core2	 3.1
+C Intel NHM	 3.95
+C Intel SBR	 2.75
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner-loop probably runs close to optimally on Sandy Bridge (using 4-way
+C unrolling).  The rest of the code is quite crude, and could perhaps be made
+C both smaller and faster.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnt',	`%r8')
+define(`cy',	`%r9')			C for _nc variant
+
+ifdef(`OPERATION_addlsh_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(IFRSB,	)
+	define(func_n,	mpn_addlsh_n)
+	define(func_nc,	mpn_addlsh_nc)')
+ifdef(`OPERATION_rsblsh_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(IFRSB,	`$1')
+	define(func_n,	mpn_rsblsh_n)
+	define(func_nc,	mpn_rsblsh_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+C mpn_rsblsh_nc is omitted from the list below; its idea of carry-in is
+C inconsistent with refmpn_rsblsh_nc.
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_addlsh_nc mpn_rsblsh_n)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
+	push	%rbx
+	xor	R32(%rbx), R32(%rbx)	C clear CF save register
+L(ent):	push	%rbp
+	mov	R32(n), R32(%rbp)
+	mov	n, %rax
+	mov	R32(cnt), R32(%rcx)
+	neg	R32(%rcx)
+	and	$3, R32(%rbp)
+	jz	L(b0)
+	lea	-32(vp,%rbp,8), vp
+	lea	-32(up,%rbp,8), up
+	lea	-32(rp,%rbp,8), rp
+	cmp	$2, R32(%rbp)
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	xor	%r8, %r8
+	mov	8(vp), %r9
+	mov	16(vp), %r10
+	shrd	R8(%rcx), %r9, %r8
+	shrd	R8(%rcx), %r10, %r9
+	mov	24(vp), %r11
+	shrd	R8(%rcx), %r11, %r10
+	sub	$3, %rax
+	jz	L(3)
+	add	R32(%rbx), R32(%rbx)
+	lea	32(vp), vp
+	ADCSBB	8(up), %r8
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	lea	32(up), up
+	jmp	L(lo3)
+L(3):	add	R32(%rbx), R32(%rbx)
+	lea	32(vp), vp
+	ADCSBB	8(up), %r8
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	jmp	L(wd3)
+
+L(b0):	mov	(vp), %r8
+	mov	8(vp), %r9
+	xor	R32(%rbp), R32(%rbp)
+	jmp	L(lo0)
+
+L(b1):	xor	%r10, %r10
+	mov	24(vp), %r11
+	shrd	R8(%rcx), %r11, %r10
+	sub	$1, %rax
+	jz	L(1)
+	add	R32(%rbx), R32(%rbx)
+	lea	32(vp), vp
+	ADCSBB	24(up), %r10
+	lea	32(up), up
+	mov	(vp), %r8
+	jmp	L(lo1)
+L(1):	add	R32(%rbx), R32(%rbx)
+	ADCSBB	24(up), %r10
+	jmp	L(wd1)
+
+L(b2):	xor	%r9, %r9
+	mov	16(vp), %r10
+	shrd	R8(%rcx), %r10, %r9
+	mov	24(vp), %r11
+	shrd	R8(%rcx), %r11, %r10
+	sub	$2, %rax
+	jz	L(2)
+	add	R32(%rbx), R32(%rbx)
+	lea	32(vp), vp
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	lea	32(up), up
+	jmp	L(lo2)
+L(2):	add	R32(%rbx), R32(%rbx)
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	jmp	L(wd2)
+
+	ALIGN(32)			C 16-byte alignment is not enough!
+L(top):	shrd	R8(%rcx), %r11, %r10
+	add	R32(%rbx), R32(%rbx)
+	lea	32(vp), vp
+	ADCSBB	(up), %rbp
+	ADCSBB	8(up), %r8
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	mov	%rbp, (rp)
+	lea	32(up), up
+L(lo3):	mov	%r8, 8(rp)
+L(lo2):	mov	%r9, 16(rp)
+	mov	(vp), %r8
+L(lo1):	mov	%r10, 24(rp)
+	mov	8(vp), %r9
+	mov	%r11, %rbp
+	lea	32(rp), rp
+	sbb	R32(%rbx), R32(%rbx)
+L(lo0):	shrd	R8(%rcx), %r8, %rbp
+	mov	16(vp), %r10
+	shrd	R8(%rcx), %r9, %r8
+	shrd	R8(%rcx), %r10, %r9
+	mov	24(vp), %r11
+	sub	$4, %rax
+	jg	L(top)
+
+	shrd	R8(%rcx), %r11, %r10
+	add	R32(%rbx), R32(%rbx)
+	ADCSBB	(up), %rbp
+	ADCSBB	8(up), %r8
+	ADCSBB	16(up), %r9
+	ADCSBB	24(up), %r10
+	mov	%rbp, (rp)
+L(wd3):	mov	%r8, 8(rp)
+L(wd2):	mov	%r9, 16(rp)
+L(wd1):	mov	%r10, 24(rp)
+	adc	R32(%rax), R32(%rax)	C rax is zero after loop
+	shr	R8(%rcx), %r11
+	ADDSUB	%r11, %rax
+IFRSB(	neg	%rax)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')	C cnt
+IFDOS(`	mov	64(%rsp), %r9	')	C cy
+	push	%rbx
+	neg	cy
+	sbb	R32(%rbx), R32(%rbx)	C initialise CF save register
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/aors_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/aors_n.asm
new file mode 100644
index 0000000..61fee3e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/aors_n.asm
@@ -0,0 +1,203 @@
+dnl  AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
+dnl  Haswell.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C AMD K8,K9	 1.75\2.52
+C AMD K10	 1.5
+C AMD bd1	 1.69\2.25
+C AMD bd2	 1.65
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 1.5
+C AMD bt1	 2.67
+C AMD bt2	 2.16
+C Intel P4	11.54
+C Intel PNR	 5
+C Intel NHM	 5.5
+C Intel SBR	 1.54
+C Intel IBR	 1.5
+C Intel HWL	 1.32
+C Intel BWL	 1.07
+C Intel SKL	 1.21
+C Intel atom	 4.3
+C Intel SLM	 3
+C VIA nano	 ?
+
+C The loop of this code was manually written.  As far as we know it runs close
+C to optimally on Intel SBR, IBR, and HWL, except for the fluctuation problems.
+C It also runs slightly faster on average on AMD bd1 and bd2.
+C
+C No micro-optimisation has been done.
+C
+C N.B.!  The loop alignment padding insns are executed.  If editing the code,
+C make sure the padding does not become excessive.  It is now a 4-byte nop.
+
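+C For reference, the operation itself is plain limb-wise addition with carry
+C (add_n_ref is a hypothetical helper, not GMP's generic code verbatim;
+C assumes 64-bit limbs and unsigned __int128):
+C
+C	mp_limb_t
+C	add_n_ref (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+C	{
+C	  mp_limb_t cy = 0;                             /* carry in */
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i] + vp[i] + cy;
+C	      rp[i] = (mp_limb_t) t;
+C	      cy = (mp_limb_t) (t >> 64);               /* carry out */
+C	    }
+C	  return cy;
+C	}
+C
+C mpn_sub_n is identical with borrow in place of carry.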
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
+
+ifdef(`OPERATION_add_n', `
+  define(ADCSBB,    adc)
+  define(func,      mpn_add_n)
+  define(func_nc,   mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(ADCSBB,    sbb)
+  define(func,      mpn_sub_n)
+  define(func_nc,   mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	xor	%r8, %r8
+
+L(ent):	mov	R32(n), R32(%rax)
+	shr	$2, n
+
+	test	$1, R8(%rax)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(%rax)
+	jnz	L(b10)
+
+L(b00):	neg	%r8
+	mov	(up), %r8
+	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	ADCSBB	16(vp), %r10
+	ADCSBB	24(vp), %r11
+	lea	32(vp), vp
+	lea	-16(rp), rp
+	jmp	L(lo0)
+
+L(b10):	neg	%r8
+	mov	(up), %r10
+	mov	8(up), %r11
+	ADCSBB	0(vp), %r10
+	ADCSBB	8(vp), %r11
+	jrcxz	L(e2)
+	mov	16(up), %r8
+	mov	24(up), %r9
+	lea	16(up), up
+	ADCSBB	16(vp), %r8
+	ADCSBB	24(vp), %r9
+	lea	16(vp), vp
+C	lea	(rp), rp
+	jmp	L(lo2)
+
+L(e2):	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	setc	R8(%rax)
+	FUNC_EXIT()
+	ret
+
+L(bx1):	test	$2, R8(%rax)
+	jnz	L(b11)
+
+L(b01):	neg	%r8
+	mov	(up), %r11
+	ADCSBB	(vp), %r11
+	jrcxz	L(e1)
+	mov	8(up), %r8
+	mov	16(up), %r9
+	lea	8(up), up
+	lea	-8(rp), rp
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	lea	8(vp), vp
+	jmp	L(lo1)
+
+L(e1):	mov	%r11, (rp)
+	setc	R8(%rax)
+	FUNC_EXIT()
+	ret
+
+L(b11):	neg	%r8
+	mov	(up), %r9
+	ADCSBB	(vp), %r9
+	mov	8(up), %r10
+	mov	16(up), %r11
+	lea	24(up), up
+	ADCSBB	8(vp), %r10
+	ADCSBB	16(vp), %r11
+	lea	24(vp), vp
+	mov	%r9, (rp)
+	lea	8(rp), rp
+	jrcxz	L(end)
+
+	ALIGN(32)
+L(top):	mov	(up), %r8
+	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+L(lo2):	mov	%r10, (rp)
+L(lo1):	mov	%r11, 8(rp)
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	ADCSBB	16(vp), %r10
+	ADCSBB	24(vp), %r11
+	lea	32(vp), vp
+L(lo0):	mov	%r8, 16(rp)
+L(lo3):	mov	%r9, 24(rp)
+	lea	32(rp), rp
+	dec	n
+	jnz	L(top)
+
+L(end):	mov	R32(n), R32(%rax)	C zero rax
+	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	setc	R8(%rax)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	jmp	L(ent)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/coreisbr/aorsmul_1.asm
new file mode 100644
index 0000000..b4c1572
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/aorsmul_1.asm
@@ -0,0 +1,212 @@
+dnl  X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Sandy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C AMD K8,K9      4.27
+C AMD K10        4.27    4.54
+C AMD bull       4.76
+C AMD pile       4.55
+C AMD steam
+C AMD excavator
+C AMD bobcat     5.30
+C AMD jaguar     5.28
+C Intel P4      16.2    17.1
+C Intel core2    5.26
+C Intel NHM      5.09
+C Intel SBR      3.21
+C Intel IBR      2.96
+C Intel HWL      2.81
+C Intel BWL      2.76
+C Intel SKL      2.76
+C Intel atom    21.5
+C Intel SLM      9.5
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjörn Granlund.
+
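+C Functionally (addmul_1_ref is a hypothetical helper, not part of GMP;
+C assumes a 64-bit limb and the GCC/Clang unsigned __int128 extension):
+C
+C	mp_limb_t
+C	addmul_1_ref (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
+C	{
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      /* 128-bit product plus old limb plus carry; cannot overflow */
+C	      unsigned __int128 t = (unsigned __int128) up[i] * v0 + rp[i] + cy;
+C	      rp[i] = (mp_limb_t) t;
+C	      cy = (mp_limb_t) (t >> 64);
+C	    }
+C	  return cy;
+C	}
+C
+C mpn_submul_1 subtracts the product instead, returning the borrow.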
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+
+define(`n',       `%rbx')
+
+define(`I',`$1')
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`func',  `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+IFDOS(`	define(`up',     ``%rsi'')') dnl
+IFDOS(`	define(`rp',     ``%rcx'')') dnl
+IFDOS(`	define(`v0',     ``%r9'')') dnl
+IFDOS(`	define(`r9',     ``rdi'')') dnl
+IFDOS(`	define(`n_param',``%r8'')') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
+	mov	(up), %rax
+	push	%rbx
+	lea	(up,n_param,8), up
+	lea	(rp,n_param,8), rp
+
+	test	$1, R8(n_param)
+	jnz	L(b13)
+
+L(b02):	xor	R32(%r11), R32(%r11)
+	test	$2, R8(n_param)
+	jnz	L(b2)
+
+L(b0):	mov	$1, R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	%rdx, %r9
+	mov	-8(rp,n,8), %r8
+	jmp	L(e0)
+
+	ALIGN(16)
+L(b2):	mov	$-1, n
+	sub	n_param, n
+	mul	v0
+	mov	8(rp,n,8), %r8
+	mov	%rdx, %r9
+	jmp	L(e2)
+
+	ALIGN(16)
+L(b13):	xor	R32(%r9), R32(%r9)
+	test	$2, R8(n_param)
+	jnz	L(b3)
+
+L(b1):	mov	$2, R32(n)
+	sub	n_param, n
+	jns	L(1)
+	mul	v0
+	mov	-16(rp,n,8), %r10
+	mov	%rdx, %r11
+	jmp	L(e1)
+
+	ALIGN(16)
+L(b3):	xor	R32(n), R32(n)
+	sub	n_param, n
+	mul	v0
+	mov	(rp,n,8), %r10
+	jmp	L(e3)
+
+	ALIGN(32)
+L(top):	mul	v0
+	mov	-16(rp,n,8), %r10
+	ADDSUB	%r11, %r8
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%r8, -24(rp,n,8)
+L(e1):	ADDSUB	%rax, %r10
+	mov	-8(up,n,8), %rax
+	adc	$0, %r11
+	mul	v0
+	ADDSUB	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(rp,n,8), %r8
+	adc	$0, %r11
+	mov	%r10, -16(rp,n,8)
+L(e0):	ADDSUB	%rax, %r8
+	adc	$0, %r9
+	mov	(up,n,8), %rax
+	mul	v0
+	mov	(rp,n,8), %r10
+	ADDSUB	%r11, %r8
+	mov	%r8, -8(rp,n,8)
+	adc	$0, %r9
+L(e3):	mov	%rdx, %r11
+	ADDSUB	%rax, %r10
+	mov	8(up,n,8), %rax
+	adc	$0, %r11
+	mul	v0
+	mov	8(rp,n,8), %r8
+	ADDSUB	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (rp,n,8)
+	adc	$0, %r11
+L(e2):	ADDSUB	%rax, %r8
+	adc	$0, %r9
+	mov	16(up,n,8), %rax
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v0
+	mov	I(-8(rp),-16(rp,n,8)), %r10
+	ADDSUB	%r11, %r8
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%r8, I(-16(rp),-24(rp,n,8))
+	ADDSUB	%rax, %r10
+	adc	$0, %r11
+	ADDSUB	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, I(-8(rp),-16(rp,n,8))
+	mov	%r11, %rax
+
+	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
+	ret
+
+	ALIGN(16)
+L(1):	mul	v0
+	ADDSUB	%rax, -8(rp)
+	mov	%rdx, %rax
+	adc	$0, %rax
+	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm
new file mode 100644
index 0000000..43abcc8
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/cnd_add_n.asm
@@ -0,0 +1,174 @@
+dnl  AMD64 mpn_cnd_add_n.
+
+dnl  Copyright 2011-2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR	 3.0
+C Intel NHM	 3.75
+C Intel SBR	 1.93
+C Intel IBR	 1.89
+C Intel HWL	 1.78
+C Intel BWL	 1.50
+C Intel SKL	 1.50
+C Intel atom
+C Intel SLM	 4.0
+C VIA nano
+
+C NOTES
+C  * It might seem natural to use the cmov insn here, but since this function
+C    is supposed to have the exact same execution pattern for cnd true and
+C    false, and since cmov's documentation is not clear about whether it
+C    actually reads both source operands and writes the register for a false
+C    condition, we cannot use it.
+
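+C The mask trick used instead of cmov, in C (cnd_add_n_ref is a hypothetical
+C helper, not part of GMP; a compiler may reintroduce branches, the asm below
+C cannot):
+C
+C	mp_limb_t
+C	cnd_add_n_ref (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp,
+C		       mp_size_t n)
+C	{
+C	  mp_limb_t mask = -(mp_limb_t) (cnd != 0);     /* 0 or all ones */
+C	  mp_limb_t cy = 0;
+C	  for (mp_size_t i = 0; i < n; i++)
+C	    {
+C	      unsigned __int128 t = (unsigned __int128) up[i]
+C				    + (vp[i] & mask) + cy;
+C	      rp[i] = (mp_limb_t) t;
+C	      cy = (mp_limb_t) (t >> 64);
+C	    }
+C	  return cy;            /* always 0 when cnd is false */
+C	}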
+C INPUT PARAMETERS
+define(`cnd_arg', `%rdi')	dnl rcx
+define(`rp',	  `%rsi')	dnl rdx
+define(`up',	  `%rdx')	dnl r8
+define(`vp',	  `%rcx')	dnl r9
+define(`n',	  `%r8')	dnl rsp+40
+
+define(`cnd',     `%rbx')
+
+define(ADDSUB,	add)
+define(ADCSBB,	adc)
+define(func,	mpn_cnd_add_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_cnd_add_n)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), R32(%r8)')
+	push	%rbx
+
+	neg	cnd_arg
+	sbb	cnd, cnd		C make cnd mask
+
+	test	$1, R8(n)
+	jz	L(x0)
+L(x1):	test	$2, R8(n)
+	jz	L(b1)
+
+L(b3):	mov	(vp), %rdi
+	mov	8(vp), %r9
+	mov	16(vp), %r10
+	and	cnd, %rdi
+	and	cnd, %r9
+	and	cnd, %r10
+	ADDSUB	(up), %rdi
+	mov	%rdi, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	ADCSBB	16(up), %r10
+	mov	%r10, 16(rp)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	sub	$3, n
+	jnz	L(top)
+	jmp	L(end)
+
+L(x0):	xor	R32(%rax), R32(%rax)
+	test	$2, R8(n)
+	jz	L(top)
+
+L(b2):	mov	(vp), %rdi
+	mov	8(vp), %r9
+	and	cnd, %rdi
+	and	cnd, %r9
+	ADDSUB	(up), %rdi
+	mov	%rdi, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	sub	$2, n
+	jnz	L(top)
+	jmp	L(end)
+
+L(b1):	mov	(vp), %rdi
+	and	cnd, %rdi
+	ADDSUB	(up), %rdi
+	mov	%rdi, (rp)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	dec	n
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp), %rdi
+	mov	8(vp), %r9
+	mov	16(vp), %r10
+	mov	24(vp), %r11
+	lea	32(vp), vp
+	and	cnd, %rdi
+	and	cnd, %r9
+	and	cnd, %r10
+	and	cnd, %r11
+	add	R32(%rax), R32(%rax)	C restore carry
+	ADCSBB	(up), %rdi
+	mov	%rdi, (rp)
+	ADCSBB	8(up), %r9
+	mov	%r9, 8(rp)
+	ADCSBB	16(up), %r10
+	mov	%r10, 16(rp)
+	ADCSBB	24(up), %r11
+	lea	32(up), up
+	mov	%r11, 24(rp)
+	lea	32(rp), rp
+	sbb	R32(%rax), R32(%rax)	C save carry
+	sub	$4, n
+	jnz	L(top)
+
+L(end):	neg	R32(%rax)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm
new file mode 100644
index 0000000..f55492b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/cnd_sub_n.asm
@@ -0,0 +1,200 @@
+dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n
+
+dnl  Copyright 2011-2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR	 3.0
+C Intel NHM	 2.75
+C Intel SBR	 2.15
+C Intel IBR	 1.96
+C Intel HWL	 2.0
+C Intel BWL	 1.65
+C Intel SKL	 1.65
+C Intel atom
+C Intel SLM	 4.5
+C VIA nano
+
+C NOTES
+C  * It might seem natural to use the cmov insn here, but since this function
+C    is supposed to have the exact same execution pattern for cnd true and
+C    false, and since cmov's documentation is not clear about whether it
+C    actually reads both source operands and writes the register for a false
+C    condition, we cannot use it.
+C  * Given that we have a dedicated cnd_add_n, it might look strange that this
+C    file provides cnd_add_n and not just cnd_sub_n.  But that's harmless, and
+C    this file's generality might come in handy for some pipeline.
+
+C INPUT PARAMETERS
+define(`cnd_arg', `%rdi')	dnl rcx
+define(`rp',	  `%rsi')	dnl rdx
+define(`up',	  `%rdx')	dnl r8
+define(`vp',	  `%rcx')	dnl r9
+define(`n',	  `%r8')	dnl rsp+40
+
+define(`cnd',     `%rbx')
+
+ifdef(`OPERATION_cnd_add_n',`
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func,	mpn_cnd_add_n)')
+ifdef(`OPERATION_cnd_sub_n',`
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func,	mpn_cnd_sub_n)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), R32(%r8)')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+
+	neg	cnd_arg
+	sbb	cnd, cnd		C make cnd mask
+
+	test	$1, R8(n)
+	jz	L(x0)
+L(x1):	test	$2, R8(n)
+	jz	L(b1)
+
+L(b3):	mov	(vp), %rdi
+	mov	8(vp), %r9
+	mov	16(vp), %r10
+	and	cnd, %rdi
+	mov	(up), %r12
+	and	cnd, %r9
+	mov	8(up), %r13
+	and	cnd, %r10
+	mov	16(up), %rbp
+	ADDSUB	%rdi, %r12
+	mov	%r12, (rp)
+	ADCSBB	%r9, %r13
+	mov	%r13, 8(rp)
+	ADCSBB	%r10, %rbp
+	mov	%rbp, 16(rp)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	sub	$3, n
+	jnz	L(top)
+	jmp	L(end)
+
+L(x0):	xor	R32(%rax), R32(%rax)
+	test	$2, R8(n)
+	jz	L(top)
+
+L(b2):	mov	(vp), %rdi
+	mov	8(vp), %r9
+	mov	(up), %r12
+	and	cnd, %rdi
+	mov	8(up), %r13
+	and	cnd, %r9
+	ADDSUB	%rdi, %r12
+	mov	%r12, (rp)
+	ADCSBB	%r9, %r13
+	mov	%r13, 8(rp)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	sub	$2, n
+	jnz	L(top)
+	jmp	L(end)
+
+L(b1):	mov	(vp), %rdi
+	mov	(up), %r12
+	and	cnd, %rdi
+	ADDSUB	%rdi, %r12
+	mov	%r12, (rp)
+	sbb	R32(%rax), R32(%rax)	C save carry
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	dec	n
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	mov	(vp), %rdi
+	mov	8(vp), %r9
+	mov	16(vp), %r10
+	mov	24(vp), %r11
+	lea	32(vp), vp
+	and	cnd, %rdi
+	mov	(up), %r12
+	and	cnd, %r9
+	mov	8(up), %r13
+	and	cnd, %r10
+	mov	16(up), %rbp
+	and	cnd, %r11
+	add	R32(%rax), R32(%rax)	C restore carry
+	mov	24(up), %rax
+	lea	32(up), up
+	ADCSBB	%rdi, %r12
+	mov	%r12, (rp)
+	ADCSBB	%r9, %r13
+	mov	%r13, 8(rp)
+	ADCSBB	%r10, %rbp
+	mov	%rbp, 16(rp)
+	ADCSBB	%r11, %rax
+	mov	%rax, 24(rp)
+	lea	32(rp), rp
+	sbb	R32(%rax), R32(%rax)	C save carry
+	sub	$4, n
+	jnz	L(top)
+
+L(end):	neg	R32(%rax)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/divrem_1.asm b/third_party/gmp/mpn/x86_64/coreisbr/divrem_1.asm
new file mode 100644
index 0000000..d9f371f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/divrem_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_divrem_1
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_divrem_1 mpn_preinv_divrem_1)
+include_mpn(`x86_64/divrem_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/gcd_11.asm b/third_party/gmp/mpn/x86_64/coreisbr/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_11.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/gmp-mparam.h b/third_party/gmp/mpn/x86_64/coreisbr/gmp-mparam.h
new file mode 100644
index 0000000..36f4512
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/gmp-mparam.h
@@ -0,0 +1,241 @@
+/* Sandy Bridge gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 3400-3800 MHz Intel Xeon E3-1270 Sandy Bridge */
+/* FFT tuning limit = 468,152,320 */
+/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */
+
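+/* The *_THRESHOLD values below are operand sizes in limbs: GMP switches to
+   the named algorithm once an operand reaches the threshold.  For example,
+   MUL_TOOM22_THRESHOLD 20 selects Toom-22 (Karatsuba) multiplication from
+   20 limbs upward, with the basecase used below that.  */
+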
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         9
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        24
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           30
+
+#define DIV_1_VS_MUL_1_PERCENT             298
+
+#define MUL_TOOM22_THRESHOLD                20
+#define MUL_TOOM33_THRESHOLD                65
+#define MUL_TOOM44_THRESHOLD               154
+#define MUL_TOOM6H_THRESHOLD               254
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     148
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 28
+#define SQR_TOOM3_THRESHOLD                 93
+#define SQR_TOOM4_THRESHOLD                248
+#define SQR_TOOM6_THRESHOLD                342
+#define SQR_TOOM8_THRESHOLD                462
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    396, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     21, 8}, \
+    {     11, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     35, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     79,11}, {     47,10}, \
+    {     95,12}, {     31,11}, {     63,10}, {    135,11}, \
+    {     79,10}, {    159, 9}, {    319,10}, {    167,11}, \
+    {     95, 7}, {   1535, 8}, {    831,10}, {    223, 9}, \
+    {    447,11}, {    127,10}, {    255, 9}, {    511,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,10}, {    895,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    543,12}, \
+    {    287,11}, {    607,12}, {    319,11}, {    639,12}, \
+    {    351,11}, {    703,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1087,12}, {    607,13}, {    319,12}, {    735,13}, \
+    {    383,12}, {    831,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1215,14}, \
+    {    639,13}, {   1343,12}, {   2687,13}, {   1407,12}, \
+    {   2815,13}, {   1471,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1919,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1535,13}, {   3071,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1919,16}, {    511,15}, \
+    {   1023,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2943,13}, {   5887,15}, {   1535,14}, {   3455,13}, \
+    {   6911,15}, {   1791,14}, {   3839,13}, {   7679,16}, \
+    {   1023,15}, {   2047,14}, {   4223,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,17}, {   2047,16}, \
+    {   4095,15}, {   8191,16}, {   4607,15}, {   9983,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 219
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             336  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    336, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     25, 8}, {     13, 7}, {     28, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 6}, \
+    {   4351, 7}, {   2303, 8}, {   1215,12}, {     95,11}, \
+    {    191,10}, {    383,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,10}, {    575,11}, {    303,10}, {    607,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,12}, \
+    {    223,11}, {    447,10}, {    895,11}, {    479,13}, \
+    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    607,13}, \
+    {    319,12}, {    703,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1215,13}, {    639,12}, \
+    {   1279,13}, {    703,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1663,13}, {    959,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1215,14}, \
+    {    639,13}, {   1343,12}, {   2687,13}, {   1407,12}, \
+    {   2815,13}, {   1471,14}, {    767,13}, {   1599,12}, \
+    {   3199,13}, {   1663,14}, {    895,13}, {   1791,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2687,14}, \
+    {   1407,13}, {   2815,15}, {    767,14}, {   1535,13}, \
+    {   3199,14}, {   1663,13}, {   3455,12}, {   6911,14}, \
+    {   1791,16}, {    511,15}, {   1023,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2815,14}, {   5887,16}, \
+    {   1535,15}, {   3327,14}, {   6911,15}, {   3839,17}, \
+    {   1023,16}, {   2047,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,17}, {   2047,16}, \
+    {   4607,15}, {   9983,14}, {  19967,16}, {   5631,15}, \
+    {  11775,17}, {   3071,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 210
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                  66
+#define SQRLO_SQR_THRESHOLD               6440
+
+#define DC_DIV_QR_THRESHOLD                 52
+#define DC_DIVAPPR_Q_THRESHOLD             172
+#define DC_BDIV_QR_THRESHOLD                46
+#define DC_BDIV_Q_THRESHOLD                 92
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               170
+#define INV_APPR_THRESHOLD                 167
+
+#define BINV_NEWTON_THRESHOLD              228
+#define REDC_1_TO_REDC_2_THRESHOLD          36
+#define REDC_2_TO_REDC_N_THRESHOLD          55
+
+#define MU_DIV_QR_THRESHOLD               1387
+#define MU_DIVAPPR_Q_THRESHOLD            1387
+#define MUPI_DIV_QR_THRESHOLD               77
+#define MU_BDIV_QR_THRESHOLD              1187
+#define MU_BDIV_Q_THRESHOLD               1442
+
+#define POWM_SEC_TABLE  1,16,191,452,1297
+
+#define GET_STR_DC_THRESHOLD                14
+#define GET_STR_PRECOMPUTE_THRESHOLD        21
+#define SET_STR_DC_THRESHOLD              1160
+#define SET_STR_PRECOMPUTE_THRESHOLD      2043
+
+#define FAC_DSC_THRESHOLD                  426
+#define FAC_ODD_THRESHOLD                   24
+
+#define MATRIX22_STRASSEN_THRESHOLD         14
+#define HGCD2_DIV1_METHOD                    5  /* 0.74% faster than 3 */
+#define HGCD_THRESHOLD                      96
+#define HGCD_APPR_THRESHOLD                 60
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   465
+#define GCDEXT_DC_THRESHOLD                345
+#define JACOBI_BASE_METHOD                   1  /* 32.22% faster than 4 */
+
+/* Tuneup completed successfully, took 276198 seconds */
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/lshift.asm b/third_party/gmp/mpn/x86_64/coreisbr/lshift.asm
new file mode 100644
index 0000000..a1cbc31
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/lshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshift optimised for Intel Sandy Bridge.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/lshiftc.asm b/third_party/gmp/mpn/x86_64/coreisbr/lshiftc.asm
new file mode 100644
index 0000000..ac90edb
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshiftc optimised for Intel Sandy Bridge.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/mul_1.asm b/third_party/gmp/mpn/x86_64/coreisbr/mul_1.asm
new file mode 100644
index 0000000..a43a117
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/mul_1.asm
@@ -0,0 +1,199 @@
+dnl  X86-64 mpn_mul_1 optimised for Intel Sandy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD excavator
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core2
+C Intel NHM
+C Intel SBR      2.49
+C Intel IBR      2.32
+C Intel HWL      2.44
+C Intel BWL      2.43
+C Intel SKL      2.47
+C Intel atom
+C Intel SLM
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',      `%rdi')   C rcx
+define(`up_param',`%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0',      `%rcx')   C r9
+define(`cin',     `%r8')    C stack
+
+define(`up',      `%rsi')   C same as up_param
+define(`n',	  `%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(`	define(`rp',      `%rcx')')
+IFDOS(`	define(`up_param',`%rdx')')
+IFDOS(`	define(`n_param', `%r8')')
+IFDOS(`	define(`v0',      `%r9')')
+IFDOS(`	define(`cin',     `48(%rsp)')')
+
+IFDOS(`	define(`up',      `%rsi')')
+IFDOS(`	define(`n',       `%r8')')
+
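+C mpn_mul_1(rp, up, n, v0) stores the low n limbs of {up,n} * v0 at rp and
+C returns the carry limb; mpn_mul_1c also adds the carry-in limb cin.
+C Both entry points point up and rp at the operand ends and negate n, so
+C (up,n,8) walks from the least significant limb upward while n counts
+C toward zero, letting one register serve as both index and loop counter.
+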
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+IFDOS(`	push	%rsi		')
+	mov	(up_param), %rax
+IFSTD(`	mov	n_param, n	')
+	lea	(up_param,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+	neg	n
+	mul	v0
+
+	test	$1, R8(n)
+	jz	L(x0)
+L(x1):	mov	%rax, %r11
+	mov	%rdx, %r10
+	test	$2, R8(n)
+	jnz	L(01)
+
+L(11):	mov	8(up,n,8), %rax
+	dec	n
+	jmp	L(L3)
+
+L(01):	inc	n
+	jnz	L(L1)
+	mov	%rax, (rp)
+	mov	%rdx, %rax
+IFDOS(`	pop	%rsi		')
+	ret
+
+L(x0):	mov	%rax, %r10
+	mov	%rdx, %r11
+	mov	8(up,n,8), %rax
+	test	$2, R8(n)
+	jz	L(L0)
+
+L(10):	add	$-2, n
+	jmp	L(L2)
+
+	ALIGN(8)
+L(top):	mov	%rdx, %r10
+	add	%rax, %r11
+L(L1):	mov	0(up,n,8), %rax
+	adc	$0, %r10
+	mul	v0
+	add	%rax, %r10
+	mov	%r11, 0(rp,n,8)
+	mov	8(up,n,8), %rax
+	mov	%rdx, %r11
+L(L0c):	adc	$0, %r11
+L(L0):	mul	v0
+	mov	%r10, 8(rp,n,8)
+	add	%rax, %r11
+	mov	%rdx, %r10
+L(L3c):	mov	16(up,n,8), %rax
+	adc	$0, %r10
+L(L3):	mul	v0
+	mov	%r11, 16(rp,n,8)
+	mov	%rdx, %r11
+	add	%rax, %r10
+L(L2c):	mov	24(up,n,8), %rax
+	adc	$0, %r11
+L(L2):	mul	v0
+	mov	%r10, 24(rp,n,8)
+	add	$4, n
+	jnc	L(top)
+
+L(end):	add	%rax, %r11
+	mov	%rdx, %rax
+	adc	$0, %rax
+	mov	%r11, (rp)
+
+IFDOS(`	pop	%rsi		')
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(`	push	%rsi		')
+	mov	(up_param), %rax
+IFSTD(`	mov	n_param, n	')
+	lea	(up_param,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+	neg	n
+	mul	v0
+
+	test	$1, R8(n)
+	jz	L(x0c)
+L(x1c):	mov	%rax, %r11
+	mov	%rdx, %r10
+	test	$2, R8(n)
+	jnz	L(01c)
+
+L(11c):	add	cin, %r11
+	dec	n
+	jmp	L(L3c)
+
+L(01c):	add	cin, %r11
+	inc	n
+	jnz	L(L1)
+	mov	%r11, (rp)
+	mov	%rdx, %rax
+	adc	$0, %rax
+IFDOS(`	pop	%rsi		')
+	ret
+
+L(x0c):	mov	%rax, %r10
+	mov	%rdx, %r11
+	test	$2, R8(n)
+	jz	L(00c)
+
+L(10c):	add	$-2, n
+	add	cin, %r10
+	jmp	L(L2c)
+
+L(00c):	add	cin, %r10
+	mov	8(up,n,8), %rax
+	jmp	L(L0c)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/mul_2.asm b/third_party/gmp/mpn/x86_64/coreisbr/mul_2.asm
new file mode 100644
index 0000000..781534d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/mul_2.asm
@@ -0,0 +1,167 @@
+dnl  AMD64 mpn_mul_2 optimised for Intel Sandy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb	best
+C AMD K8,K9      8.03
+C AMD K10        8.03
+C AMD bull       9.19
+C AMD pile       9.16
+C AMD steam
+C AMD excavator
+C AMD bobcat    10.6
+C AMD jaguar    11.0
+C Intel P4      26.0
+C Intel core2    8.73
+C Intel NHM      8.55
+C Intel SBR      5.15
+C Intel IBR      4.57
+C Intel HWL      4.08
+C Intel BWL      4.10
+C Intel SKL      4.14
+C Intel atom    39.5
+C Intel SLM     26.3
+C VIA nano
+
+C This code is the result of running a code generation and optimisation tool
+C suite written by David Harvey and Torbjorn Granlund.
+
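+C mpn_mul_2(rp, up, n, vp) multiplies {up,n} by the two-limb number {vp,2},
+C stores the n+1 low limbs of the product at rp, and returns the most
+C significant limb.  The main loop handles two u limbs per iteration, with
+C w0-w3 serving as a rotating window of partial-product accumulators.
+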
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
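+
+C With the definition above, I(-8(rp),-8(rp,n,8)) expands to its first
+C argument, -8(rp); redefining I as `$2' selects the second, fully indexed
+C form used by the conservative wind-down variant.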
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`n',	  `%rcx')
+define(`v0',      `%rbx')
+define(`v1',      `%rbp')
+
+define(`w0',	`%r8')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	(up), %rax
+	lea	(up,n_param,8), up
+	lea	(rp,n_param,8), rp
+
+	test	$1, R8(n_param)
+	jnz	L(b1)
+
+L(b0):	mov	$0, R32(n)
+	sub	n_param, n
+	xor	w0, w0
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w1
+	mov	(up,n,8), %rax
+	jmp	L(lo0)
+
+L(b1):	mov	$1, R32(n)
+	sub	n_param, n
+	xor	w2, w2
+	mul	v0
+	mov	%rax, w0
+	mov	%rdx, w3
+	mov	-8(up,n,8), %rax
+	mul	v1
+	jmp	L(lo1)
+
+	ALIGN(32)
+L(top):	mul	v0
+	add	%rax, w0		C 1
+	mov	%rdx, w3		C 2
+	adc	$0, w3			C 2
+	mov	-8(up,n,8), %rax
+	mul	v1
+	add	w1, w0			C 1
+	adc	$0, w3			C 2
+L(lo1):	add	%rax, w2		C 2
+	mov	w0, -8(rp,n,8)		C 1
+	mov	%rdx, w0		C 3
+	adc	$0, w0			C 3
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w2		C 2
+	mov	%rdx, w1		C 3
+	adc	$0, w1			C 3
+	add	w3, w2			C 2
+	mov	(up,n,8), %rax
+	adc	$0, w1			C 1
+L(lo0):	mul	v1
+	mov	w2, (rp,n,8)		C 2
+	add	%rax, w0		C 3
+	mov	%rdx, w2		C 4
+	mov	8(up,n,8), %rax
+	adc	$0, w2			C 4
+	add	$2, n
+	jnc	L(top)
+
+L(end):	mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	I(-8(up),-8(up,n,8)), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, I(-8(rp),-8(rp,n,8))
+	adc	$0, %rdx
+	add	w3, w2
+	mov	w2, I((rp),(rp,n,8))
+	adc	$0, %rdx
+	mov	%rdx, %rax
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/mul_basecase.asm b/third_party/gmp/mpn/x86_64/coreisbr/mul_basecase.asm
new file mode 100644
index 0000000..35fd1cc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/mul_basecase.asm
@@ -0,0 +1,407 @@
+dnl  AMD64 mpn_mul_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		mul_2		mul_3		addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR	 2.5		 2.5		 -		 2.95
+C Intel IBR	 2.4		 2.3		 -		 2.68
+C Intel HWL	 2.35		 2.0		 -		 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Fix the addmul_2 fluctuation affecting SBR.
+C  * Improve feed-in code, avoiding zeroing of many registers and dummy adds in
+C    the loops at the expense of code size.
+C  * Adjoin a mul_3, avoiding slow mul_1 for odd vn.
+C  * Consider replacing the 2-way mul_2 code with 4-way code, for a very slight
+C    speedup.
+C  * Further micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
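+
+C Dispatch outline: an odd vn is started with a mul_1-style pass and an
+C even vn with a mul_2-style pass; either way the remaining v limbs are
+C then consumed two at a time by the addmul_2 outer loop at L(outer).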
+
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp',      `%rcx')
+define(`vn',      `%r8')
+
+define(`un',      `%rbx')
+
+define(`w0',	`%r10')
+define(`w1',	`%r11')
+define(`w2',	`%r12')
+define(`w3',	`%r13')
+define(`n',	`%rbp')
+define(`v0',	`%r9')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx
+	push	%rbp
+	mov	un_param, un		C free up rdx
+	neg	un
+
+	mov	(up), %rax		C shared for mul_1 and mul_2
+	lea	(up,un_param,8), up	C point at operand end
+	lea	(rp,un_param,8), rp	C point at rp[un-1]
+
+	mov	(vp), v0		C shared for mul_1 and mul_2
+	mul	v0			C shared for mul_1 and mul_2
+
+	test	$1, R8(vn)
+	jz	L(do_mul_2)
+
+L(do_mul_1):
+	test	$1, R8(un)
+	jnz	L(m1x1)
+
+L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
+	mov	%rdx, w1
+	mov	8(up,un,8), %rax
+	test	$2, R8(un)
+	jnz	L(m110)
+
+L(m100):lea	2(un), n		C un = 4, 8, 12, ...
+	jmp	L(m1l0)
+
+L(m110):lea	(un), n			C un = 2, 6, 10, ...
+	jmp	L(m1l2)
+
+L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
+	mov	%rdx, w0
+	test	$2, R8(un)
+	jz	L(m111)
+
+L(m101):lea	3(un), n		C un = 1, 5, 9, ...
+	test	n, n
+	js	L(m1l1)
+	mov	%rax, -8(rp)
+	mov	%rdx, (rp)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(m111):lea	1(un), n		C un = 3, 7, 11, ...
+	mov	8(up,un,8), %rax
+	jmp	L(m1l3)
+
+	ALIGN(16)		C FIXME
+L(m1tp):mov	%rdx, w0
+	add	%rax, w1
+L(m1l1):mov	-16(up,n,8), %rax
+	adc	$0, w0
+	mul	v0
+	add	%rax, w0
+	mov	w1, -24(rp,n,8)
+	mov	-8(up,n,8), %rax
+	mov	%rdx, w1
+	adc	$0, w1
+L(m1l0):mul	v0
+	mov	w0, -16(rp,n,8)
+	add	%rax, w1
+	mov	%rdx, w0
+	mov	(up,n,8), %rax
+	adc	$0, w0
+L(m1l3):mul	v0
+	mov	w1, -8(rp,n,8)
+	mov	%rdx, w1
+	add	%rax, w0
+	mov	8(up,n,8), %rax
+	adc	$0, w1
+L(m1l2):mul	v0
+	mov	w0, (rp,n,8)
+	add	$4, n
+	jnc	L(m1tp)
+
+L(m1ed):add	%rax, w1
+	adc	$0, %rdx
+	mov	w1, I(-8(rp),-24(rp,n,8))
+	mov	%rdx, I((rp),-16(rp,n,8))
+
+	dec	R32(vn)
+	jz	L(ret2)
+
+	lea	8(vp), vp
+	lea	8(rp), rp
+	push	%r12
+	push	%r13
+	push	%r14
+	jmp	L(do_addmul)
+
+L(do_mul_2):
+define(`v1',	`%r14')
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	8(vp), v1
+
+	test	$1, R8(un)
+	jnz	L(m2b1)
+
+L(m2b0):lea	(un), n
+	xor	w0, w0
+	mov	%rax, w2
+	mov	%rdx, w1
+	jmp	L(m2l0)
+
+L(m2b1):lea	1(un), n
+	xor	w1, w1
+	xor	w2, w2
+	mov	%rax, w0
+	mov	%rdx, w3
+	jmp	L(m2l1)
+
+	ALIGN(32)
+L(m2tp):mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+L(m2l1):mov	-8(up,n,8), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, -8(rp,n,8)
+	mov	%rdx, w0
+	adc	$0, w0
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	%rdx, w1
+	adc	$0, w1
+	add	w3, w2
+L(m2l0):mov	(up,n,8), %rax
+	adc	$0, w1
+	mul	v1
+	mov	w2, (rp,n,8)
+	add	%rax, w0
+	mov	%rdx, w2
+	mov	8(up,n,8), %rax
+	adc	$0, w2
+	add	$2, n
+	jnc	L(m2tp)
+
+L(m2ed):mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	I(-8(up),-8(up,n,8)), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, I(-8(rp),-8(rp,n,8))
+	adc	$0, %rdx
+	add	w3, w2
+	mov	w2, I((rp),(rp,n,8))
+	adc	$0, %rdx
+	mov	%rdx, I(8(rp),8(rp,n,8))
+
+	add	$-2, R32(vn)
+	jz	L(ret5)
+	lea	16(vp), vp
+	lea	16(rp), rp
+
+
+L(do_addmul):
+	push	%r15
+	push	vn			C save vn in new stack slot
+define(`vn',	`(%rsp)')
+define(`X0',	`%r14')
+define(`X1',	`%r15')
+define(`v1',	`%r8')
+
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+	mov	(up,un,8), %rax
+	mul	v0
+	test	$1, R8(un)
+	jnz	L(a1x1)
+
+L(a1x0):mov	(rp,un,8), X0
+	xor	w0, w0
+	mov	%rdx, w1
+	test	$2, R8(un)
+	jnz	L(a110)
+
+L(a100):lea	2(un), n		C un = 4, 8, 12, ...
+	add	%rax, X0
+	adc	$0, w1
+	mov	(up,un,8), %rax
+	mul	v1
+	mov	8(rp,un,8), X1
+	jmp	L(lo0)
+
+L(a110):lea	(un), n			C un = 2, 6, 10, ...
+	xor	w3, w3
+	jmp	L(lo2)
+
+L(a1x1):mov	(rp,un,8), X1
+	xor	w2, w2
+	xor	w1, w1
+	test	$2, R8(un)
+	jz	L(a111)
+
+L(a101):lea	3(un), n		C un = 1, 5, 9, ...
+	mov	%rdx, w3
+	add	%rax, X1
+	mov	(up,un,8), %rax
+	mov	8(rp,un,8), X0
+	adc	$0, w3
+	jmp	L(top)
+
+L(a111):lea	1(un), n		C un = 3, 7, 11, ...
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):	mul	v1
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	adc	$0, w3
+	add	w2, X0
+	adc	$0, w0
+	mov	-16(up,n,8), %rax
+	mul	v0
+	add	%rax, X0
+	mov	%rdx, w1
+	adc	$0, w1
+	mov	-16(up,n,8), %rax
+	mul	v1
+	mov	X1, -24(rp,n,8)
+	mov	-8(rp,n,8), X1
+	add	w3, X0
+	adc	$0, w1
+L(lo0):	mov	%rdx, w2
+	mov	X0, -16(rp,n,8)
+	add	%rax, X1
+	adc	$0, w2
+	mov	-8(up,n,8), %rax
+	add	w0, X1
+	adc	$0, w2
+	mul	v0
+L(lo3):	add	%rax, X1
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	-8(up,n,8), %rax
+	mul	v1
+	add	w1, X1
+	mov	(rp,n,8), X0
+	adc	$0, w3
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	mov	(up,n,8), %rax
+	mul	v0
+	add	w2, X0
+	mov	X1, -8(rp,n,8)
+	mov	%rdx, w1
+	adc	$0, w0
+L(lo2):	add	%rax, X0
+	adc	$0, w1
+	mov	(up,n,8), %rax
+	add	w3, X0
+	adc	$0, w1
+	mul	v1
+	mov	8(rp,n,8), X1
+	add	%rax, X1
+	mov	%rdx, w2
+	adc	$0, w2
+	mov	8(up,n,8), %rax
+	mov	X0, (rp,n,8)
+	mul	v0
+	add	w0, X1
+	mov	%rdx, w3
+	adc	$0, w2
+	add	%rax, X1
+	mov	8(up,n,8), %rax
+	mov	16(rp,n,8), X0		C useless but harmless in final iter
+	adc	$0, w3
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v1
+	add	w1, X1
+	adc	$0, w3
+	add	w2, %rax
+	adc	$0, %rdx
+	mov	X1, I(-8(rp),-24(rp,n,8))
+	add	w3, %rax
+	adc	$0, %rdx
+	mov	%rax, I((rp),-16(rp,n,8))
+	mov	%rdx, I(8(rp),-8(rp,n,8))
+
+	addl	$-2, vn
+	lea	16(vp), vp
+	lea	16(rp), rp
+	jnz	L(outer)
+
+	pop	%rax		C deallocate vn slot
+	pop	%r15
+L(ret5):pop	%r14
+	pop	%r13
+	pop	%r12
+L(ret2):pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/coreisbr/mullo_basecase.asm
new file mode 100644
index 0000000..a41a8ac
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/mullo_basecase.asm
@@ -0,0 +1,384 @@
+dnl  AMD64 mpn_mullo_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_2		addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR	 2.5		 2.95
+C Intel IBR	 2.3		 2.68
+C Intel HWL	 2.0		 2.5
+C Intel BWL
+C Intel atom
+C VIA nano
+
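+C mpn_mullo_basecase(rp, up, vp, n) computes only the n least significant
+C limbs of {up,n} * {vp,n}; successive addmul_2-style passes therefore work
+C on ever shorter suffixes, and the uppermost limb of each pass is formed
+C with plain imul (see L(m2ed), L(end) and the L(cor1)/L(cor0) blocks).
+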
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C   * Implement proper cor2, replacing current cor0.
+C   * Offset n by 2 in order to avoid the outer loop cmp.  (And sqr_basecase?)
+C   * Micro-optimise.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp_param', `%rdx')
+define(`n',        `%rcx')
+
+define(`vp',       `%r8')
+define(`X0',       `%r14')
+define(`X1',       `%r15')
+
+define(`w0',       `%r10')
+define(`w1',       `%r11')
+define(`w2',       `%r12')
+define(`w3',       `%r13')
+define(`i',        `%rbp')
+define(`v0',       `%r9')
+define(`v1',       `%rbx')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+
+	mov	(up), %rax
+	mov	vp_param, vp
+
+	cmp	$4, n
+	jb	L(small)
+
+	mov	(vp_param), v0
+	push	%rbx
+	lea	(rp,n,8), rp		C point rp at R[un]
+	push	%rbp
+	lea	(up,n,8), up		C point up right after U's end
+	push	%r12
+	neg	n
+	push	%r13
+	mul	v0
+	mov	8(vp), v1
+
+	test	$1, R8(n)
+	jnz	L(m2b1)
+
+L(m2b0):lea	(n), i
+	xor	w0, w0
+	mov	%rax, w2
+	mov	%rdx, w1
+	jmp	L(m2l0)
+
+L(m2b1):lea	1(n), i
+	xor	w1, w1
+	xor	w2, w2
+	mov	%rax, w0
+	mov	%rdx, w3
+	jmp	L(m2l1)
+
+	ALIGN(32)
+L(m2tp):mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+L(m2l1):mov	-8(up,i,8), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, -8(rp,i,8)
+	mov	%rdx, w0
+	adc	$0, w0
+	mov	(up,i,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	%rdx, w1
+	adc	$0, w1
+	add	w3, w2
+L(m2l0):mov	(up,i,8), %rax
+	adc	$0, w1
+	mul	v1
+	mov	w2, (rp,i,8)
+	add	%rax, w0
+	mov	%rdx, w2		C FIXME: dead in last iteration
+	mov	8(up,i,8), %rax
+	adc	$0, w2			C FIXME: dead in last iteration
+	add	$2, i
+	jnc	L(m2tp)
+
+L(m2ed):imul	v0, %rax
+	add	w0, %rax
+	add	w1, %rax
+	mov	%rax, I(-8(rp),-8(rp,i,8))
+
+	add	$2, n
+	lea	16(vp), vp
+	lea	-16(up), up
+	cmp	$-2, n
+	jge	L(cor1)
+
+	push	%r14
+	push	%r15
+
+L(outer):
+	mov	(vp), v0
+	mov	8(vp), v1
+	mov	(up,n,8), %rax
+	mul	v0
+	test	$1, R8(n)
+	jnz	L(a1x1)
+
+L(a1x0):mov	(rp,n,8), X1
+	xor	w2, w2
+	xor	w1, w1
+	test	$2, R8(n)
+	jnz	L(a110)
+
+L(a100):lea	1(n), i
+	jmp	L(lo0)
+
+L(a110):lea	3(n), i
+	mov	%rdx, w3
+	add	%rax, X1
+	mov	(up,n,8), %rax
+	mov	8(rp,n,8), X0
+	adc	$0, w3
+	jmp	L(lo2)
+
+L(a1x1):mov	(rp,n,8), X0
+	xor	w0, w0
+	mov	%rdx, w1
+	test	$2, R8(n)
+	jz	L(a111)
+
+L(a101):lea	2(n), i
+	add	%rax, X0
+	adc	$0, w1
+	mov	(up,n,8), %rax
+	mul	v1
+	mov	8(rp,n,8), X1
+	jmp	L(lo1)
+
+L(a111):lea	(n), i
+	xor	w3, w3
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):
+L(lo2):	mul	v1
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	adc	$0, w3
+	add	w2, X0
+	adc	$0, w0
+	mov	-16(up,i,8), %rax
+	mul	v0
+	add	%rax, X0
+	mov	%rdx, w1
+	adc	$0, w1
+	mov	-16(up,i,8), %rax
+	mul	v1
+	mov	X1, -24(rp,i,8)
+	mov	-8(rp,i,8), X1
+	add	w3, X0
+	adc	$0, w1
+L(lo1):	mov	%rdx, w2
+	mov	X0, -16(rp,i,8)
+	add	%rax, X1
+	adc	$0, w2
+	mov	-8(up,i,8), %rax
+	add	w0, X1
+	adc	$0, w2
+	mul	v0
+L(lo0):	add	%rax, X1
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	-8(up,i,8), %rax
+	mul	v1
+	add	w1, X1
+	mov	(rp,i,8), X0
+	adc	$0, w3
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	mov	(up,i,8), %rax
+	mul	v0
+	add	w2, X0
+	mov	X1, -8(rp,i,8)
+	mov	%rdx, w1
+	adc	$0, w0
+L(lo3):	add	%rax, X0
+	adc	$0, w1
+	mov	(up,i,8), %rax
+	add	w3, X0
+	adc	$0, w1
+	mul	v1
+	mov	8(rp,i,8), X1
+	add	%rax, X1
+	mov	%rdx, w2
+	adc	$0, w2
+	mov	8(up,i,8), %rax
+	mov	X0, (rp,i,8)
+	mul	v0
+	add	w0, X1
+	mov	%rdx, w3
+	adc	$0, w2
+	add	%rax, X1
+	mov	8(up,i,8), %rax
+	mov	16(rp,i,8), X0
+	adc	$0, w3
+	add	$4, i
+	jnc	L(top)
+
+L(end):	imul	v1, %rax
+	add	%rax, X0
+	add	w1, X1
+	adc	$0, w3
+	add	w2, X0
+	mov	I(-8(up),-16(up,i,8)), %rax
+	imul	v0, %rax
+	add	X0, %rax
+	mov	X1, I(-16(rp),-24(rp,i,8))
+	add	w3, %rax
+	mov	%rax, I(-8(rp),-16(rp,i,8))
+
+	add	$2, n
+	lea	16(vp), vp
+	lea	-16(up), up
+	cmp	$-2, n
+	jl	L(outer)
+
+	pop	%r15
+	pop	%r14
+
+	jnz	L(cor0)
+
+L(cor1):mov	(vp), v0
+	mov	8(vp), v1
+	mov	-16(up), %rax
+	mul	v0			C u0 x v2
+	add	-16(rp), %rax		C FIXME: rp[0] still available in reg?
+	adc	-8(rp), %rdx		C FIXME: rp[1] still available in reg?
+	mov	-8(up), %r10
+	imul	v0, %r10
+	mov	-16(up), %r11
+	imul	v1, %r11
+	mov	%rax, -16(rp)
+	add	%r10, %r11
+	add	%rdx, %r11
+	mov	%r11, -8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(cor0):mov	(vp), %r11
+	imul	-8(up), %r11
+	add	%rax, %r11
+	mov	%r11, -8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(small):
+	cmp	$2, n
+	jae	L(gt1)
+L(n1):	imul	(vp_param), %rax
+	mov	%rax, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
+L(n2):	mov	(vp_param), %r9
+	mul	%r9
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp_param), %r9
+	mul	%r9		C u0 x v0
+	mov	%rax, (rp)
+	mov	%rdx, %r10
+	mov	8(up), %rax
+	mul	%r9		C u1 x v0
+	imul	16(up), %r9	C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r11
+	mov	(up), %rax
+	mul	%r11		C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r11	C u1 x v1
+	add	%r11, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10	C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/redc_1.asm b/third_party/gmp/mpn/x86_64/coreisbr/redc_1.asm
new file mode 100644
index 0000000..f0dbe07
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/redc_1.asm
@@ -0,0 +1,546 @@
+dnl  X86-64 mpn_redc_1 optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 ?
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 ?
+C Intel NHM	 ?
+C Intel SBR	 3.24
+C Intel IBR	 3.04
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Micro-optimise, none performed thus far.
+C  * Consider inlining mpn_add_n.
+C  * Single basecases out before the pushes.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r14')
+define(`j',           `%r15')
+define(`mp',          `%r12')
+define(`q0',          `%r13')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
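+C Montgomery reduction in outline: each of the n outer iterations computes
+C q0 = up[0] * u0inv mod 2^64, adds q0 times the modulus {mp_param,n} to up
+C so that the low limb cancels to zero, then advances up by one limb.  The
+C final mpn_add_n call folds the high half of up back in, and its carry-out
+C is the return value.
+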
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), q0
+	mov	n, j			C outer loop induction var
+	lea	8(mp_param,n,8), mp
+	lea	8(up,n,8), up
+	neg	n
+	imul	u0inv, q0		C first iteration q0
+
+	test	$1, R8(n)
+	jz	L(bx0)
+
+L(bx1):	test	$2, R8(n)
+	jz	L(b3)
+
+L(b1):	cmp	$-1, R32(n)
+	jz	L(n1)
+
+L(otp1):lea	1(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	-8(up,n,8), %r10
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	%rdx, %r9
+	mov	(up,n,8), %rbx
+	add	%rax, %rbx
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbx
+	mov	%rbx, -8(up,i,8)	C next low remainder limb
+	adc	$0, %r9
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e1)
+
+	ALIGNx
+L(tp1):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+L(e1):	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp1)
+
+L(ed1):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp1)
+	jmp	L(cj)
+
+L(b3):	cmp	$-3, R32(n)
+	jz	L(n3)
+
+L(otp3):lea	3(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	-8(up,n,8), %r10
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	(up,n,8), %rbx
+	mov	%rdx, %r9
+	add	%rax, %rbx
+	adc	$0, %r9
+	mov	8(mp,n,8), %rax
+	mul	q0
+	mov	8(up,n,8), %r10
+	add	%r11, %rbx
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbx, (up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e3)
+
+	ALIGNx
+L(tp3):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+L(e3):	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp3)
+
+L(ed3):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp3)
+C	jmp	L(cj)
+
+L(cj):
+IFSTD(`	lea	-8(up,n,8), up		C param 2: up
+	lea	(up,n,8), %rdx		C param 3: up - n
+	neg	R32(n)		')	C param 4: n
+
+IFDOS(`	lea	-8(up,n,8), %rdx	C param 2: up
+	lea	(%rdx,n,8), %r8		C param 3: up - n
+	neg	R32(n)
+	mov	n, %r9			C param 4: n
+	mov	rp, %rcx	')	C param 1: rp
+
+IFSTD(`	sub	$8, %rsp	')
+IFDOS(`	sub	$40, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_add_n)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(bx0):	test	$2, R8(n)
+	jnz	L(b2)
+
+L(b0):
+L(otp0):lea	(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	%rdx, %r9
+	mov	-8(up,n,8), %rbp
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	(up,n,8), %rbx
+	mov	%rdx, %r11
+	add	%rax, %rbx
+	mov	8(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,n,8), %rbp
+	add	%r9, %rbx
+	mov	%rdx, %r9
+	mov	%rbx, (up,n,8)
+	adc	$0, %r11
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e0)
+
+	ALIGNx
+L(tp0):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+L(e0):	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp0)
+
+L(ed0):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp0)
+	jmp	L(cj)
+
+L(b2):	cmp	$-2, R32(n)
+	jz	L(n2)
+
+L(otp2):lea	2(n), i
+	mov	-8(mp,n,8), %rax
+	mul	q0
+	mov	-8(up,n,8), %rbp
+	mov	%rdx, %r9
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,n,8), %rax
+	mul	q0
+	mov	(up,n,8), %rbx
+	mov	%rdx, %r11
+	add	%rax, %rbx
+	mov	8(mp,n,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %rbx
+	mov	%rdx, %r9
+	mov	8(up,n,8), %rbp
+	adc	$0, %r11
+	mov	%rbx, (up,n,8)
+	imul	u0inv, %rbx		C next q limb
+	jmp	L(e2)
+
+	ALIGNx
+L(tp2):	mul	q0
+	mov	-16(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rdx, %r11
+	adc	$0, %r9
+	mov	%rbp, -24(up,i,8)
+	add	%rax, %r10
+	mov	-8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	-8(up,i,8), %rbp
+	adc	$0, %r11
+	mov	%r10, -16(up,i,8)
+L(e2):	add	%rax, %rbp
+	adc	$0, %r9
+	mov	(mp,i,8), %rax
+	mul	q0
+	mov	(up,i,8), %r10
+	add	%r11, %rbp
+	mov	%rbp, -8(up,i,8)
+	adc	$0, %r9
+	mov	%rdx, %r11
+	add	%rax, %r10
+	mov	8(mp,i,8), %rax
+	adc	$0, %r11
+	mul	q0
+	mov	8(up,i,8), %rbp
+	add	%r9, %r10
+	mov	%rdx, %r9
+	mov	%r10, (up,i,8)
+	adc	$0, %r11
+	add	%rax, %rbp
+	adc	$0, %r9
+	mov	16(mp,i,8), %rax
+	add	$4, i
+	jnc	L(tp2)
+
+L(ed2):	mul	q0
+	mov	I(-16(up),-16(up,i,8)), %r10
+	add	%r11, %rbp
+	adc	$0, %r9
+	mov	%rbp, I(-24(up),-24(up,i,8))
+	add	%rax, %r10
+	adc	$0, %rdx
+	add	%r9, %r10
+	adc	$0, %rdx
+	mov	%r10, I(-16(up),-16(up,i,8))
+	mov	%rdx, -8(up,n,8)	C up[0]
+	mov	%rbx, q0		C previously computed q limb -> q0
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(otp2)
+	jmp	L(cj)
+
+L(n1):	mov	(mp_param), %rax
+	mul	q0
+	add	-16(up), %rax
+	adc	-8(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+L(n2):	mov	(mp_param), %rax
+	mov	-24(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-16(mp), %rax
+	mov	-16(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, q0
+	imul	u0inv, q0		C next q0
+	mov	-24(mp), %rax
+	mul	q0
+	add	%rax, %r10
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-16(mp), %rax
+	mov	-8(up), %r14
+	mul	q0
+	add	%rax, %r14
+	adc	$0, %rdx
+	add	%r9, %r14
+	adc	$0, %rdx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	(up), %rdx
+	mov	%r14, (rp)
+	mov	%rdx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+	ALIGNx
+L(n3):	mov	-32(mp), %rax
+	mov	-32(up), %r10
+	mul	q0
+	add	%rax, %r10
+	mov	-24(mp), %rax
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	-24(up), %rbp
+	mul	q0
+	add	%rax, %rbp
+	mov	%rdx, %r9
+	adc	$0, %r9
+	mov	-16(mp), %rax
+	add	%r11, %rbp
+	mov	-16(up), %r10
+	adc	$0, %r9
+	mul	q0
+	mov	%rbp, q0
+	imul	u0inv, q0		C next q0
+	add	%rax, %r10
+	mov	%rdx, %r11
+	adc	$0, %r11
+	mov	%rbp, -24(up)
+	add	%r9, %r10
+	adc	$0, %r11
+	mov	%r10, -16(up)
+	mov	%r11, -32(up)		C up[0]
+	lea	8(up), up		C up++
+	dec	j
+	jnz	L(n3)
+	jmp	L(cj)
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm b/third_party/gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm
new file mode 100644
index 0000000..fd2eaea
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/rsh1aors_n.asm
@@ -0,0 +1,193 @@
+dnl  X86-64 mpn_rsh1add_n, mpn_rsh1sub_n optimised for Intel Sandy Bridge.
+
+dnl  Copyright 2003, 2005, 2009-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 4.25
+C Intel P4	 21.5
+C Intel core2	 3.2
+C Intel NHM	 3.87
+C Intel SBR	 2.05
+C Intel atom	 ?
+C VIA nano	 44.9
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n',  `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func_n,	      mpn_rsh1add_n)
+	define(func_nc,	      mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func_n,	      mpn_rsh1sub_n)
+	define(func_nc,	      mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
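+
+C Both operations compute {rp,n} = ({up,n} +- {vp,n}) >> 1 and return the
+C bit shifted out at the low end; the _nc entry points additionally take a
+C carry-in for the initial add or subtract.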
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+	push	%rbp
+
+	neg	%r8			C set C flag from parameter
+	mov	(up), %rbp
+	ADCSBB	(vp), %rbp
+
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(up), %rbp
+	ADDSUB	(vp), %rbp
+L(ent):
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+	mov	R32(%rbp), R32(%rax)
+	and	$1, R32(%rax)		C return value
+
+	mov	R32(n), R32(%r11)
+	and	$3, R32(%r11)
+
+	cmp	$1, R32(%r11)
+	je	L(do)			C jump if n = 1 5 9 ...
+
+L(n1):	cmp	$2, R32(%r11)
+	jne	L(n2)			C jump unless n = 2 6 10 ...
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up), %r10
+	ADCSBB	8(vp), %r10
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r10, %rbp
+	mov	%rbp, -8(rp)
+	jmp	L(cj1)
+
+L(n2):	cmp	$3, R32(%r11)
+	jne	L(n3)			C jump unless n = 3 7 11 ...
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up), %r9
+	mov	16(up), %r10
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r9, %rbp
+	mov	%rbp, -16(rp)
+	jmp	L(cj2)
+
+L(n3):	dec	n			C come here for n = 4 8 12 ...
+	add	R32(%rbx), R32(%rbx)	C restore cy
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	mov	24(up), %r10
+	ADCSBB	24(vp), %r10
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r8, %rbp
+	mov	%rbp, -24(rp)
+	shrd	$1, %r9, %r8
+	mov	%r8, -16(rp)
+L(cj2):	shrd	$1, %r10, %r9
+	mov	%r9, -8(rp)
+L(cj1):	mov	%r10, %rbp
+
+L(do):
+	shr	$2, n			C				4
+	je	L(end)			C				2
+	ALIGN(16)
+L(top):	add	R32(%rbx), R32(%rbx)		C restore cy
+
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	mov	24(up), %r10
+	mov	32(up), %r11
+	ADCSBB	24(vp), %r10
+	ADCSBB	32(vp), %r11
+
+	lea	32(up), up
+	lea	32(vp), vp
+
+	sbb	R32(%rbx), R32(%rbx)	C save cy
+
+	shrd	$1, %r8, %rbp
+	mov	%rbp, (rp)
+	shrd	$1, %r9, %r8
+	mov	%r8, 8(rp)
+	shrd	$1, %r10, %r9
+	mov	%r9, 16(rp)
+	shrd	$1, %r11, %r10
+	mov	%r10, 24(rp)
+
+	dec	n
+	mov	%r11, %rbp
+	lea	32(rp), rp
+	jne	L(top)
+
+L(end):	shrd	$1, %rbx, %rbp
+	mov	%rbp, (rp)
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/rshift.asm b/third_party/gmp/mpn/x86_64/coreisbr/rshift.asm
new file mode 100644
index 0000000..4c1c0d4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/rshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_rshift optimised for Intel Sandy Bridge.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/coreisbr/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sec_tabselect.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/third_party/gmp/mpn/x86_64/coreisbr/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/coreisbr/sqr_basecase.asm
new file mode 100644
index 0000000..46a3612
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/coreisbr/sqr_basecase.asm
@@ -0,0 +1,484 @@
+dnl  AMD64 mpn_sqr_basecase optimised for Intel Sandy Bridge and Ivy Bridge.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2008, 2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_2		addmul_2	sqr_diag_addlsh1
+C AMD K8,K9	 ?		 ?			 ?
+C AMD K10	 ?		 ?			 ?
+C AMD bull	 ?		 ?			 ?
+C AMD pile	 ?		 ?			 ?
+C AMD steam	 ?		 ?			 ?
+C AMD bobcat	 ?		 ?			 ?
+C AMD jaguar	 ?		 ?			 ?
+C Intel P4	 ?		 ?			 ?
+C Intel core	 ?		 ?			 ?
+C Intel NHM	 ?		 ?			 ?
+C Intel SBR	 2.57		 2.93			 3.0
+C Intel IBR	 2.35		 2.66			 3.0
+C Intel HWL	 2.02		 2.5			 2.5
+C Intel BWL	 ?		 ?			 ?
+C Intel atom	 ?		 ?			 ?
+C VIA nano	 ?		 ?			 ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund, except
+C that the sqr_diag_addlsh1 loop was manually written.
+
+C TODO
+C  * Replace current unoptimised sqr_diag_addlsh1 loop, 2.5 c/l should be easy.
+C  * Streamline pointer updates.
+C  * Perhaps suppress a few more xor insns in feed-in code.
+C  * Make sure we write no dead registers in feed-in code.
+C  * We might use 32-bit size ops, since the code is non-terminating for
+C    n >= 2^32 anyway.  Watch out for negative sizes being zero-extended, though.
+C  * The straight-line code for n <= 3 comes from the K8 code, and might be
+C    quite sub-optimal here.  Write specific code, and add code for n = 4.
+C  * The mul_2 loop has a 10 insn common sequence in the loop start and the
+C    wind-down code.  Try re-rolling it.
+C  * This file has been subjected to only basic micro-optimisation.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`un_param',`%rdx')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$2, un_param
+	jae	L(gt1)
+
+	mov	(up), %rax
+	mul	%rax
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	jne	L(gt2)
+
+	mov	(up), %rax
+	mov	%rax, %r8
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, %r9
+	mul	%rax
+	mov	%rax, %r10
+	mov	%r11, %rax
+	mov	%rdx, %r11
+	mul	%r8
+	xor	%r8, %r8
+	add	%rax, %r9
+	adc	%rdx, %r10
+	adc	%r8, %r11
+	add	%rax, %r9
+	mov	%r9, 8(rp)
+	adc	%rdx, %r10
+	mov	%r10, 16(rp)
+	adc	%r8, %r11
+	mov	%r11, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt2):	cmp	$4, un_param
+	jae	L(gt3)
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w2', `%r11')
+
+	mov	(up), %rax
+	mov	%rax, %r10
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax
+	mov	16(up), %rcx
+	mov	%rax, 16(rp)
+	mov	%rcx, %rax
+	mov	%rdx, 24(rp)
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+
+	mov	%r11, %rax
+	mul	%r10
+	mov	%rax, %r8
+	mov	%rcx, %rax
+	mov	%rdx, %r9
+	mul	%r10
+	xor	%r10, %r10
+	add	%rax, %r9
+	mov	%r11, %rax
+	mov	%r10, %r11
+	adc	%rdx, %r10
+
+	mul	%rcx
+	add	%rax, %r10
+	adc	%r11, %rdx
+	add	%r8, %r8
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%rdx, %rdx
+	adc	%r11, %r11
+	add	%r8, 8(rp)
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%rdx, 32(rp)
+	adc	%r11, 40(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt3):
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%r10')
+define(`w1', `%r11')
+define(`w2', `%rbx')
+define(`w3', `%rbp')
+define(`un', `%r12')
+define(`n',  `%rcx')
+
+define(`X0', `%r13')
+define(`X1', `%r14')
+
+L(do_mul_2):
+	mov	(up), v0
+	push	%rbx
+	lea	(rp,un_param,8), rp	C point rp at R[un]
+	mov	8(up), %rax
+	push	%rbp
+	lea	(up,un_param,8), up	C point up right after U's end
+	mov	%rax, v1
+	push	%r12
+	mov	$1, R32(un)		C free up rdx
+	push	%r13
+	sub	un_param, un
+	push	%r14
+	push	un
+	mul	v0
+	mov	%rax, (rp,un,8)
+	mov	8(up,un,8), %rax
+	test	$1, R8(un)
+	jnz	L(m2b1)
+
+L(m2b0):lea	2(un), n
+	xor	R32(w1), R32(w1)	C FIXME
+	xor	R32(w2), R32(w2)	C FIXME
+	mov	%rdx, w0
+	jmp	L(m2l0)
+
+L(m2b1):lea	1(un), n
+	xor	R32(w3), R32(w3)	C FIXME
+	xor	R32(w0), R32(w0)	C FIXME
+	mov	%rdx, w2
+	jmp	L(m2l1)
+
+	ALIGN(32)
+L(m2tp):
+L(m2l0):mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	-8(up,n,8), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, -8(rp,n,8)
+	mov	%rdx, w0
+	adc	$0, w0
+	mov	(up,n,8), %rax
+L(m2l1):mul	v0
+	add	%rax, w2
+	mov	%rdx, w1
+	adc	$0, w1
+	add	w3, w2
+	mov	(up,n,8), %rax
+	adc	$0, w1
+	mul	v1
+	mov	w2, (rp,n,8)
+	add	%rax, w0
+	mov	%rdx, w2
+	mov	8(up,n,8), %rax
+	adc	$0, w2
+	add	$2, n
+	jnc	L(m2tp)
+
+L(m2ed):mul	v0
+	add	%rax, w0
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	I(-8(up),-8(up,n,8)), %rax
+	mul	v1
+	add	w1, w0
+	adc	$0, w3
+	add	%rax, w2
+	mov	w0, I(-8(rp),-8(rp,n,8))
+	adc	$0, %rdx
+	add	w3, w2
+	mov	w2, I((rp),(rp,n,8))
+	adc	$0, %rdx
+	mov	%rdx, I(8(rp),8(rp,n,8))
+
+	add	$2, un			C decrease |un|
+
+L(do_addmul_2):
+L(outer):
+	lea	16(rp), rp
+	cmp	$-2, R32(un)		C jump if un in {-1,0}  FIXME jump if un in {-2,1}
+	jge	L(corner)		C FIXME: move to before the lea above
+
+	mov	-8(up,un,8), v0
+	mov	(up,un,8), %rax
+	mov	%rax, v1
+	mul	v0
+	test	$1, R8(un)
+	jnz	L(a1x1)
+
+L(a1x0):mov	(rp,un,8), X0
+	xor	w0, w0
+	mov	8(rp,un,8), X1
+	add	%rax, X0
+	mov	%rdx, w1
+	adc	$0, w1
+	xor	w2, w2
+	mov	X0, (rp,un,8)
+	mov	8(up,un,8), %rax
+	test	$2, R8(un)
+	jnz	L(a110)
+
+L(a100):lea	2(un), n		C un = 4, 8, 12, ...
+	jmp	L(lo0)
+
+L(a110):lea	(un), n			C un = 2, 6, 10, ...
+	jmp	L(lo2)
+
+L(a1x1):mov	(rp,un,8), X1
+	xor	w2, w2
+	mov	8(rp,un,8), X0
+	add	%rax, X1
+	mov	%rdx, w3
+	adc	$0, w3
+	xor	w0, w0
+	mov	8(up,un,8), %rax
+	test	$2, R8(un)
+	jz	L(a111)
+
+L(a101):lea	3(un), n		C un = 1, 5, 9, ...
+	jmp	L(lo1)
+
+L(a111):lea	1(un), n		C un = 3, 7, 11, ...
+	jmp	L(lo3)
+
+	ALIGN(32)
+L(top):	mul	v1
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	add	w1, X1
+	adc	$0, w3
+	add	w2, X0
+	adc	$0, w0
+	mov	-16(up,n,8), %rax
+L(lo1):	mul	v0
+	add	%rax, X0
+	mov	%rdx, w1
+	adc	$0, w1
+	mov	-16(up,n,8), %rax
+	mul	v1
+	mov	X1, -24(rp,n,8)
+	mov	-8(rp,n,8), X1
+	add	w3, X0
+	adc	$0, w1
+	mov	%rdx, w2
+	mov	X0, -16(rp,n,8)
+	add	%rax, X1
+	adc	$0, w2
+	mov	-8(up,n,8), %rax
+	add	w0, X1
+	adc	$0, w2
+L(lo0):	mul	v0
+	add	%rax, X1
+	mov	%rdx, w3
+	adc	$0, w3
+	mov	-8(up,n,8), %rax
+	mul	v1
+	add	w1, X1
+	mov	(rp,n,8), X0
+	adc	$0, w3
+	mov	%rdx, w0
+	add	%rax, X0
+	adc	$0, w0
+	mov	(up,n,8), %rax
+L(lo3):	mul	v0
+	add	w2, X0
+	mov	X1, -8(rp,n,8)
+	mov	%rdx, w1
+	adc	$0, w0
+	add	%rax, X0
+	adc	$0, w1
+	mov	(up,n,8), %rax
+	add	w3, X0
+	adc	$0, w1
+	mul	v1
+	mov	8(rp,n,8), X1
+	add	%rax, X1
+	mov	%rdx, w2
+	adc	$0, w2
+	mov	8(up,n,8), %rax
+	mov	X0, (rp,n,8)
+L(lo2):	mul	v0
+	add	w0, X1
+	mov	%rdx, w3
+	adc	$0, w2
+	add	%rax, X1
+	mov	8(up,n,8), %rax
+	mov	16(rp,n,8), X0
+	adc	$0, w3
+	add	$4, n
+	jnc	L(top)
+
+L(end):	mul	v1
+	add	w1, X1
+	adc	$0, w3
+	add	w2, %rax
+	adc	$0, %rdx
+	mov	X1, I(-8(rp),-24(rp,n,8))
+	add	w3, %rax
+	adc	$0, %rdx
+	mov	%rax, I((rp),-16(rp,n,8))
+	mov	%rdx, I(8(rp),-8(rp,n,8))
+
+	add	$2, un			C decrease |un|
+	jmp	L(outer)		C loop until a small corner remains
+
+L(corner):
+	pop	n
+	jg	L(small_corner)
+
+	lea	8(rp), rp
+	mov	-24(up), v0
+	mov	-16(up), %rax
+	mov	%rax, v1
+	mul	v0
+	mov	-24(rp), X0
+	mov	-16(rp), X1
+	add	%rax, X0
+	mov	%rdx, w1
+	adc	$0, w1
+	xor	w2, w2
+	mov	X0, -24(rp)
+	mov	-8(up), %rax
+	mul	v0
+	add	$0, X1
+	mov	%rdx, w3
+	adc	$0, w2
+	add	%rax, X1
+	mov	-8(up), %rax
+	adc	$0, w3
+	mul	v1
+	add	w1, X1
+	adc	$0, w3
+	add	w2, %rax
+	adc	$0, %rdx
+	mov	X1, -16(rp)
+	jmp	L(com)
+
+L(small_corner):
+	mov	-8(rp), w3
+	mov	-16(up), v0
+	mov	-8(up), %rax
+	mul	v0
+L(com):	add	w3, %rax
+	adc	$0, %rdx
+	mov	%rax, -8(rp)
+	mov	%rdx, (rp)
+
+L(sqr_diag_addlsh1):
+	mov	-8(up,n,8), %rax
+	shl	n
+	mul	%rax
+	mov	%rax, (rp,n,8)
+
+	xor	R32(%rbx), R32(%rbx)
+	mov	8(rp,n,8), %r8
+	mov	16(rp,n,8), %r9
+	jmp	L(dm)
+
+	ALIGN(32)
+L(dtop):add	%r8, %r10
+	adc	%r9, %rax
+	mov	8(rp,n,8), %r8
+	mov	16(rp,n,8), %r9
+	mov	%r10, -8(rp,n,8)
+	mov	%rax, (rp,n,8)
+L(dm):	adc	%r8, %r8
+	adc	%r9, %r9
+	mov	(up,n,4), %rax
+	lea	(%rdx,%rbx), %r10
+	setc	R8(%rbx)
+	mul	%rax
+	add	$2, n
+	js	L(dtop)
+
+L(dend):add	%r8, %r10
+	adc	%r9, %rax
+	mov	%r10, I(-8(rp),-8(rp,n,8))
+	mov	%rax, I((rp),(rp,n,8))
+	adc	%rbx, %rdx
+	mov	%rdx, I(8(rp),8(rp,n,8))
+
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
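
In outline, the decomposition the comments above describe: the mul_2/addmul_2 passes accumulate each cross product u[i]*u[j] with i < j exactly once, and sqr_diag_addlsh1 then doubles that triangle while folding in the diagonal squares.  A hedged C reference of the same shape (quadratic, 64-bit limbs via unsigned __int128; purely illustrative, not the fused register scheduling above):

#include <stdint.h>
#include <string.h>

typedef uint64_t limb;

static void
ref_sqr_basecase (limb *rp, const limb *up, long n)
{
  memset (rp, 0, 2 * n * sizeof (limb));

  /* Triangle of cross products, each counted once.  */
  for (long i = 0; i < n; i++)
    {
      limb cy = 0;
      for (long j = i + 1; j < n; j++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * up[j]
                                + rp[i + j] + cy;
          rp[i + j] = (limb) t;
          cy = (limb) (t >> 64);
        }
      rp[i + n] = cy;                   /* this column is still zero here */
    }

  /* Double the triangle and add in the diagonal squares, the
     "addlsh1" step done by the L(dtop) loop above.  */
  limb cy = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 sq = (unsigned __int128) up[i] * up[i];
      unsigned __int128 lo = ((unsigned __int128) rp[2*i] << 1)
                             + (limb) sq + cy;
      unsigned __int128 hi = ((unsigned __int128) rp[2*i+1] << 1)
                             + (limb) (sq >> 64) + (limb) (lo >> 64);
      rp[2*i] = (limb) lo;
      rp[2*i+1] = (limb) hi;
      cy = (limb) (hi >> 64);
    }
}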
diff --git a/third_party/gmp/mpn/x86_64/darwin.m4 b/third_party/gmp/mpn/x86_64/darwin.m4
new file mode 100644
index 0000000..7771476
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/darwin.m4
@@ -0,0 +1,82 @@
+divert(-1)
+dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`DARWIN')
+
+define(`LEA',`dnl
+ifdef(`PIC',
+	`lea	$1(%rip), $2'
+,
+	`movabs	`$'$1, $2')
+')
+
+dnl  Usage: CALL(funcname)
+dnl
+dnl  Simply override the definition in x86_64-defs.m4.
+
+define(`CALL',`call	GSYM_PREFIX`'$1')
+define(`TCALL',`jmp	GSYM_PREFIX`'$1')
+
+
+dnl  Usage: JUMPTABSECT
+dnl
+dnl  CAUTION: Do not put anything sensible here, like RODATA.  That works with
+dnl  some Darwin tool chains, but silently breaks with others.  (Note that
+dnl  putting jump tables in the text segment is a really poor idea for many PC
+dnl  processors, since they cannot cache the same thing in both L1D and L1I.)
+
+define(`JUMPTABSECT', `.text')
+
+
+dnl  Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT',`dnl
+ifdef(`PIC',
+	`.set	$1_tmp, $1-$2
+	.long	$1_tmp'
+,
+	`.quad	$1'
+)')
+
+dnl  Target ABI macros.  For Darwin we override IFELF (and leave default for
+dnl  IFDOS and IFSTD).
+
+define(`IFELF',   `')
+
+
+dnl  Usage: PROTECT(symbol)
+dnl
+dnl  Used for private GMP symbols that should never be overridden by users.
+dnl  This can save reloc entries and improve shlib sharing as well as
+dnl  application startup times.
+
+define(`PROTECT',  `.private_extern $1')
+
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/x86_64/div_qr_1n_pi1.asm b/third_party/gmp/mpn/x86_64/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..b3d45e2
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/div_qr_1n_pi1.asm
@@ -0,0 +1,247 @@
+dnl  x86-64 mpn_div_qr_1n_pi1
+dnl  -- Divide an mpn number by a normalized single-limb number,
+dnl     using a single-limb inverse.
+
+dnl  Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C AMD K8,K9	13
+C AMD K10	13
+C AMD bull	16.5
+C AMD pile	15
+C AMD steam	 ?
+C AMD bobcat	16
+C AMD jaguar	 ?
+C Intel P4	47	poor
+C Intel core	19.25
+C Intel NHM	18
+C Intel SBR	15	poor
+C Intel IBR	13
+C Intel HWL	11.7
+C Intel BWL	 ?
+C Intel atom	52	very poor
+C VIA nano	19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx')	C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8')	C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+	ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	dec	UN_INPUT
+	jnz	L(first)
+
+	C Just a single 2/1 division.
+	C T, U0 are allocated in scratch registers
+	lea	1(U1), T
+	mov	U1, %rax
+	mul	DINV
+	mov	(UP), U0
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(single_div_done)
+	sub	D, %rax
+	add	$1, T
+L(single_div_done):
+	mov	T, (QP)
+	FUNC_EXIT()
+	ret
+L(first):
+	C FIXME: Could delay some of these until we enter the loop.
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbx
+	push	%rbp
+
+	mov	D, B2
+	imul	DINV, B2
+	neg	B2
+	mov	B2, B2md
+	sub	D, B2md
+
+	C D not needed until final reduction
+	push	D
+	mov	UN_INPUT, UN	C Clobbers D
+
+	mov	DINV, %rax
+	mul	U1
+	mov	%rax, Q0
+	add	U1, %rdx
+	mov	%rdx, T
+
+	mov	B2, %rax
+	mul	U1
+	mov	-8(UP, UN, 8), U0
+	mov	(UP, UN, 8), U1
+	mov	T, (QP, UN, 8)
+	add	%rax, U0
+	adc	%rdx, U1
+	sbb	U2, U2
+	dec	UN
+	mov	U1, %rax
+	jz	L(final)
+
+	ALIGN(16)
+
+	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
+	C At entry, %rax holds an extra copy of U1
+L(loop):
+	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+	C Remains to add in B (U1 + c)
+	mov	DINV, Q1
+	mov	U2, Q2
+	and	U2, Q1
+	neg	Q2
+	mul	DINV
+	add	%rdx, Q1
+	adc	$0, Q2
+	add	Q0, Q1
+	mov	%rax, Q0
+	mov	B2, %rax
+	lea	(B2md, U0), T
+	adc	$0, Q2
+
+	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+	mul	U1
+	and	B2, U2
+	add	U2, U0
+	cmovnc	U0, T
+
+	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+	adc	U1, Q1
+	mov	-8(UP, UN, 8), U0
+	adc	Q2, 8(QP, UN, 8)
+	jc	L(q_incr)
+L(q_incr_done):
+	add	%rax, U0
+	mov	T, %rax
+	adc	%rdx, %rax
+	mov	Q1, (QP, UN, 8)
+	sbb	U2, U2
+	dec	UN
+	mov	%rax, U1
+	jnz	L(loop)
+
+L(final):
+	pop	D
+
+	mov	U2, Q1
+	and	D, U2
+	sub	U2, %rax
+	neg	Q1
+
+	mov	%rax, U1
+	sub	D, %rax
+	cmovc	U1, %rax
+	sbb	$-1, Q1
+
+	lea	1(%rax), T
+	mul	DINV
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(div_done)
+	sub	D, %rax
+	add	$1, T
+L(div_done):
+	add	T, Q0
+	mov	Q0, (QP)
+	adc	Q1, 8(QP)
+	jnc	L(done)
+L(final_q_incr):
+	addq	$1, 16(QP)
+	lea	8(QP), QP
+	jc	L(final_q_incr)
+
+L(done):
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(q_incr):
+	C U1 is not live, so use it for indexing
+	lea	16(QP, UN, 8), U1
+L(q_incr_loop):
+	addq	$1, (U1)
+	jnc	L(q_incr_done)
+	lea	8(U1), U1
+	jmp	L(q_incr_loop)
+EPILOGUE()
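
The single 2/1 division at the top of the function (before L(first)) is the standard Möller-Granlund step: estimate the quotient with the precomputed inverse, then correct by at most one in each direction.  A C sketch under the usual assumptions, with d normalised (top bit set), dinv = floor((B^2-1)/d) - B as returned by mpn_invert_limb, B = 2^64, and unsigned __int128 available:

#include <stdint.h>

typedef uint64_t limb;

/* Divide (u1,u0) by d; quotient to *qp, remainder returned.  */
static limb
ref_udiv_qr_2by1 (limb *qp, limb u1, limb u0, limb d, limb dinv)
{
  unsigned __int128 p = (unsigned __int128) dinv * u1;
  limb q0 = (limb) p + u0;
  limb q1 = (limb) (p >> 64) + u1 + (q0 < u0) + 1;  /* the "lea 1(U1)" bias */

  limb r = u0 - q1 * d;                 /* candidate remainder, mod B */
  if (r > q0)                           /* estimate was one too large */
    {
      q1--;
      r += d;
    }
  if (r >= d)                           /* rare: one too small */
    {
      q1++;
      r -= d;
    }
  *qp = q1;
  return r;
}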
diff --git a/third_party/gmp/mpn/x86_64/div_qr_2n_pi1.asm b/third_party/gmp/mpn/x86_64/div_qr_2n_pi1.asm
new file mode 100644
index 0000000..5e59a0a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/div_qr_2n_pi1.asm
@@ -0,0 +1,158 @@
+dnl  x86-64 mpn_div_qr_2n_pi1
+dnl  -- Divide an mpn number by a normalized 2-limb number,
+dnl     using a single-limb inverse.
+
+dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C INPUT PARAMETERS
+define(`qp',		`%rdi')
+define(`rp',		`%rsi')
+define(`up_param',	`%rdx')
+define(`un',		`%rcx')
+define(`d1',		`%r8')
+define(`d0',		`%r9')
+define(`di_param',	`8(%rsp)')
+
+define(`di',		`%r10')
+define(`up',		`%r11')
+define(`u2',		`%rbx')
+define(`u1',		`%r12')
+define(`t1',		`%r13')
+define(`t0',		`%r14')
+define(`md1',		`%r15')
+
+C TODO
+C * Store qh in the same stack slot as di_param, instead of pushing
+C   it.  (We could put it in register %rbp, but then we would need to
+C   save and restore that instead, which doesn't seem like a win.)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_div_qr_2n_pi1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+IFDOS(`define(`di_param', `72(%rsp)')')
+	mov	di_param, di
+	mov	up_param, up
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbx
+
+	mov	-16(up, un, 8), u1
+	mov	-8(up, un, 8), u2
+
+	mov	u1, t0
+	mov	u2, t1
+	sub	d0, t0
+	sbb	d1, t1
+	cmovnc  t0, u1
+	cmovnc	t1, u2
+	C push qh which is !carry
+	sbb	%rax, %rax
+	inc	%rax
+	push	%rax
+	lea	-2(un), un
+	mov	d1, md1
+	neg	md1
+
+	jmp	L(next)
+
+	ALIGN(16)
+L(loop):
+	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
+	C Based on the optimized divrem_2.asm code.
+
+	mov	di, %rax
+	mul	u2
+	mov	u1, t0
+	add	%rax, t0	C q0 in t0
+	adc	u2, %rdx
+	mov	%rdx, t1	C q in t1
+	imul	md1, %rdx
+	mov	d0, %rax
+	lea	(%rdx, u1), u2
+	mul	t1
+	mov	(up, un, 8), u1
+	sub	d0, u1
+	sbb	d1, u2
+	sub	%rax, u1
+	sbb	%rdx, u2
+	xor	R32(%rax), R32(%rax)
+	xor	R32(%rdx), R32(%rdx)
+	cmp	t0, u2
+	cmovnc	d0, %rax
+	cmovnc	d1, %rdx
+	adc	$0, t1
+	nop
+	add	%rax, u1
+	adc	%rdx, u2
+	cmp	d1, u2
+	jae	L(fix)
+L(bck):
+	mov	t1, (qp, un, 8)
+L(next):
+	sub	$1, un
+	jnc	L(loop)
+L(end):
+	mov	u2, 8(rp)
+	mov	u1, (rp)
+
+	C qh on stack
+	pop	%rax
+
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(fix):	C Unlikely update. u2 >= d1
+	seta	%dl
+	cmp	d0, u1
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck)
+	inc	t1
+	sub	d0, u1
+	sbb	d1, u2
+	jmp	L(bck)
+EPILOGUE()
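
The loop body is the udiv_qr_3by2 primitive its comment names: one quotient limb of a three-limb dividend by the normalised two-limb divisor (d1,d0), with di = mpn_invert_limb(d1).  A C sketch following the published Möller-Granlund algorithm (unsigned __int128 assumed; the seta/setae sequence at L(fix) corresponds to the unlikely final fix-up here):

#include <stdint.h>

typedef uint64_t limb;

static limb
ref_udiv_qr_3by2 (limb *r1p, limb *r0p, limb u2, limb u1, limb u0,
                  limb d1, limb d0, limb di)
{
  unsigned __int128 dd = ((unsigned __int128) d1 << 64) | d0;
  unsigned __int128 p = (unsigned __int128) di * u2;
  limb q0 = (limb) p + u1;
  limb q = (limb) (p >> 64) + u2 + (q0 < u1);

  /* Two high limbs of u - q*d, computed mod B^2.  */
  limb r1 = u1 - q * d1;
  unsigned __int128 r = (((unsigned __int128) r1 << 64) | u0)
                        - (unsigned __int128) d0 * q - dd;
  q++;

  if ((limb) (r >> 64) >= q0)           /* adjust down (cmov, not a branch, above) */
    {
      q--;
      r += dd;
    }
  if (r >= dd)                          /* unlikely fix-up, cf. L(fix) */
    {
      q++;
      r -= dd;
    }
  *r1p = (limb) (r >> 64);
  *r0p = (limb) r;
  return q;
}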
diff --git a/third_party/gmp/mpn/x86_64/div_qr_2u_pi1.asm b/third_party/gmp/mpn/x86_64/div_qr_2u_pi1.asm
new file mode 100644
index 0000000..85af96f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/div_qr_2u_pi1.asm
@@ -0,0 +1,200 @@
+dnl  x86-64 mpn_div_qr_2u_pi1
+dnl  -- Divide an mpn number by an unnormalized 2-limb number,
+dnl     using a single-limb inverse and shifting the dividend on the fly.
+
+dnl  Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C INPUT PARAMETERS
+define(`qp',		`%rdi')
+define(`rp',		`%rsi')
+define(`up_param',	`%rdx')
+define(`un_param',	`%rcx') dnl %rcx needed for shift count
+define(`d1',		`%r8')
+define(`d0',		`%r9')
+define(`shift_param',	`FRAME+8(%rsp)')
+define(`di_param',	`FRAME+16(%rsp)')
+
+define(`di',		`%r10')
+define(`up',		`%r11')
+define(`un',		`%rbp')
+define(`u2',		`%rbx')
+define(`u1',		`%r12')
+define(`u0',		`%rsi') dnl Same as rp, which is saved and restored.
+define(`t1',		`%r13')
+define(`t0',		`%r14')
+define(`md1',		`%r15')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME', 0)
+PROLOGUE(mpn_div_qr_2u_pi1)
+	mov	di_param, di
+	mov	up_param, up
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbx
+	push	%rbp
+	push	rp
+deflit(`FRAME', 56)
+	lea	-2(un_param), un
+	mov	d1, md1
+	neg	md1
+
+	C int parameter, 32 bits only
+	movl	shift_param, R32(%rcx)
+
+	C FIXME: Different code for SHLD_SLOW
+
+	xor	R32(u2), R32(u2)
+	mov	8(up, un, 8), u1
+	shld	%cl, u1, u2
+	C Remains to read (up, un, 8) and shift u1, u0
+	C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di)
+	mov	di, %rax
+	mul	u2
+	mov	(up, un, 8), u0
+	shld	%cl, u0, u1
+	mov	u1, t0
+	add	%rax, t0	C q0 in t0
+	adc	u2, %rdx
+	mov	%rdx, t1	C q in t1
+	imul	md1, %rdx
+	mov	d0, %rax
+	lea	(%rdx, u1), u2
+	mul	t1
+	mov	u0, u1
+	shl	%cl, u1
+	sub	d0, u1
+	sbb	d1, u2
+	sub	%rax, u1
+	sbb	%rdx, u2
+	xor	R32(%rax), R32(%rax)
+	xor	R32(%rdx), R32(%rdx)
+	cmp	t0, u2
+	cmovnc	d0, %rax
+	cmovnc	d1, %rdx
+	adc	$0, t1
+	nop
+	add	%rax, u1
+	adc	%rdx, u2
+	cmp	d1, u2
+	jae	L(fix_qh)
+L(bck_qh):
+	push	t1	C push qh on stack
+
+	jmp	L(next)
+
+	ALIGN(16)
+L(loop):
+	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
+	C Based on the optimized divrem_2.asm code.
+
+	mov	di, %rax
+	mul	u2
+	mov	(up, un, 8), u0
+	xor	R32(t1), R32(t1)
+	shld	%cl, u0, t1
+	or	t1, u1
+	mov	u1, t0
+	add	%rax, t0	C q0 in t0
+	adc	u2, %rdx
+	mov	%rdx, t1	C q in t1
+	imul	md1, %rdx
+	mov	d0, %rax
+	lea	(%rdx, u1), u2
+	mul	t1
+	mov	u0, u1
+	shl	%cl, u1
+	sub	d0, u1
+	sbb	d1, u2
+	sub	%rax, u1
+	sbb	%rdx, u2
+	xor	R32(%rax), R32(%rax)
+	xor	R32(%rdx), R32(%rdx)
+	cmp	t0, u2
+	cmovnc	d0, %rax
+	cmovnc	d1, %rdx
+	adc	$0, t1
+	nop
+	add	%rax, u1
+	adc	%rdx, u2
+	cmp	d1, u2
+	jae	L(fix)
+L(bck):
+	mov	t1, (qp, un, 8)
+L(next):
+	sub	$1, un
+	jnc	L(loop)
+L(end):
+	C qh on stack
+	pop	%rax
+	pop	rp
+	shrd	%cl, u2, u1
+	shr	%cl, u2
+	mov	u2, 8(rp)
+	mov	u1, (rp)
+
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	ret
+
+L(fix):	C Unlikely update. u2 >= d1
+	seta	%dl
+	cmp	d0, u1
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck)
+	inc	t1
+	sub	d0, u1
+	sbb	d1, u2
+	jmp	L(bck)
+
+C Duplicated, just jumping back to a different address.
+L(fix_qh):	C Unlikely update. u2 >= d1
+	seta	%dl
+	cmp	d0, u1
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck_qh)
+	inc	t1
+	sub	d0, u1
+	sbb	d1, u2
+	jmp	L(bck_qh)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/dive_1.asm b/third_party/gmp/mpn/x86_64/dive_1.asm
new file mode 100644
index 0000000..988bdab
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/dive_1.asm
@@ -0,0 +1,158 @@
+dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	10
+C AMD K10	10
+C Intel P4	33
+C Intel core2	13.25
+C Intel corei	14
+C Intel atom	42
+C VIA nano	43
+
+C A quick adaptation of the 32-bit K7 code.
+
+
+C INPUT PARAMETERS
+C rp		rdi
+C up		rsi
+C n		rdx
+C divisor	rcx
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	mov	%rcx, %rax
+	xor	R32(%rcx), R32(%rcx)	C shift count
+	mov	%rdx, %r8
+
+	bt	$0, R32(%rax)
+	jnc	L(evn)			C skip bsfq unless divisor is even
+
+L(odd):	mov	%rax, %rbx
+	shr	R32(%rax)
+	and	$127, R32(%rax)		C d/2, 7 bits
+
+	LEA(	binvert_limb_table, %rdx)
+
+	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
+
+	mov	%rbx, %r11		C d without twos
+
+	lea	(%rax,%rax), R32(%rdx)	C 2*inv
+	imul	R32(%rax), R32(%rax)	C inv*inv
+	imul	R32(%rbx), R32(%rax)	C inv*inv*d
+	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
+	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
+
+	lea	(%rax,%rax), %r10	C 2*inv
+	imul	%rax, %rax		C inv*inv
+	imul	%rbx, %rax		C inv*inv*d
+	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits
+
+	lea	(%rsi,%r8,8), %rsi	C up end
+	lea	-8(%rdi,%r8,8), %rdi	C rp end
+	neg	%r8			C -n
+
+	mov	(%rsi,%r8,8), %rax	C up[0]
+
+	inc	%r8
+	jz	L(one)
+
+	mov	(%rsi,%r8,8), %rdx	C up[1]
+
+	shrd	R8(%rcx), %rdx, %rax
+
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(ent)
+
+L(evn):	bsf	%rax, %rcx
+	shr	R8(%rcx), %rax
+	jmp	L(odd)
+
+	ALIGN(8)
+L(top):
+	C rax	q
+	C rbx	carry bit, 0 or 1
+	C rcx	shift
+	C rdx
+	C rsi	up end
+	C rdi	rp end
+	C r8	counter, limbs, negative
+	C r10	d^(-1) mod 2^64
+	C r11	d, shifted down
+
+	mul	%r11			C carry limb in rdx	0 10
+	mov	-8(%rsi,%r8,8), %rax	C
+	mov	(%rsi,%r8,8), %r9	C
+	shrd	R8(%rcx), %r9, %rax	C
+	nop				C
+	sub	%rbx, %rax		C apply carry bit
+	setc	%bl			C
+	sub	%rdx, %rax		C apply carry limb	5
+	adc	$0, %rbx		C			6
+L(ent):	imul	%r10, %rax		C			6
+	mov	%rax, (%rdi,%r8,8)	C
+	inc	%r8			C
+	jnz	L(top)
+
+	mul	%r11			C carry limb in rdx
+	mov	-8(%rsi), %rax		C up high limb
+	shr	R8(%rcx), %rax
+	sub	%rbx, %rax		C apply carry bit
+	sub	%rdx, %rax		C apply carry limb
+	imul	%r10, %rax
+	mov	%rax, (%rdi)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(one):	shr	R8(%rcx), %rax
+	imul	%r10, %rax
+	mov	%rax, (%rdi)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+EPILOGUE()
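
Two pieces above deserve spelling out.  The prologue lifts an 8-bit inverse of the odd divisor (a table lookup via binvert_limb_table) to 64 bits, each lea/imul/imul/sub group computing inv = 2*inv - inv*inv*d and doubling the number of correct low bits.  The main loop then emits each exact quotient limb with a single multiply and uses the high limb of q*d as the borrow into the next step.  A C sketch of both, restricted to odd d (the asm also handles even d by shifting with shrd) and seeded arithmetically instead of by table; unsigned __int128 assumed:

#include <stdint.h>

typedef uint64_t limb;

static limb
binvert_limb_sketch (limb d)            /* d must be odd */
{
  limb inv = (3 * d) ^ 2;               /* correct to 5 low bits */
  inv = 2 * inv - inv * inv * d;        /* 10 bits */
  inv = 2 * inv - inv * inv * d;        /* 20 bits */
  inv = 2 * inv - inv * inv * d;        /* 40 bits */
  inv = 2 * inv - inv * inv * d;        /* 80 >= 64 bits: done */
  return inv;                           /* inv * d == 1 (mod 2^64) */
}

/* Exact division {up,n} / d for odd d.  The low limb of q*d equals s
   by construction, so its high limb is the borrow into the next step,
   the "apply carry bit / apply carry limb" pair in the loop above.  */
static void
ref_divexact_odd (limb *qp, const limb *up, long n, limb d)
{
  limb dinv = binvert_limb_sketch (d);
  limb b = 0;                           /* combined borrow */
  for (long i = 0; i < n; i++)
    {
      limb s = up[i] - b;
      limb bit = up[i] < b;             /* borrow out of the subtract */
      limb q = s * dinv;                /* quotient limb, exact mod B */
      qp[i] = q;
      b = (limb) (((unsigned __int128) q * d) >> 64) + bit;
    }
}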
diff --git a/third_party/gmp/mpn/x86_64/divrem_1.asm b/third_party/gmp/mpn/x86_64/divrem_1.asm
new file mode 100644
index 0000000..d4d61ad
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/divrem_1.asm
@@ -0,0 +1,314 @@
+dnl  x86-64 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 2004, 2005, 2007-2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		norm	unorm	frac
+C AMD K8,K9	13	13	12
+C AMD K10	13	13	12
+C Intel P4	43	44	43
+C Intel core2	24.5	24.5	19.5
+C Intel corei	20.5	19.5	18
+C Intel atom	43	46	36
+C VIA nano	25.5	25.5	24
+
+C mp_limb_t
+C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
+C               mp_srcptr np, mp_size_t nn, mp_limb_t d)
+
+C mp_limb_t
+C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
+C                      mp_srcptr np, mp_size_t nn, mp_limb_t d,
+C                      mp_limb_t dinv, int cnt)
+
+C INPUT PARAMETERS
+define(`qp',		`%rdi')
+define(`fn_param',	`%rsi')
+define(`up_param',	`%rdx')
+define(`un_param',	`%rcx')
+define(`d',		`%r8')
+define(`dinv',		`%r9')		C only for mpn_preinv_divrem_1
+C       shift passed on stack		C only for mpn_preinv_divrem_1
+
+define(`cnt',		`%rcx')
+define(`up',		`%rsi')
+define(`fn',		`%r12')
+define(`un',		`%rbx')
+
+
+C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
+C         cnt         qp      d  dinv
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFSTD(`define(`CNTOFF',		`40($1)')')
+IFDOS(`define(`CNTOFF',		`104($1)')')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	xor	R32(%rax), R32(%rax)
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	fn_param, fn
+	mov	un_param, un
+	add	fn_param, un_param
+	mov	up_param, up
+
+	lea	-8(qp,un_param,8), qp
+
+	test	d, d
+	js	L(nent)
+
+	mov	CNTOFF(%rsp), R8(cnt)
+	shl	R8(cnt), d
+	jmp	L(uent)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	xor	R32(%rax), R32(%rax)
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	fn_param, fn
+	mov	un_param, un
+	add	fn_param, un_param
+	mov	up_param, up
+	je	L(ret)
+
+	lea	-8(qp,un_param,8), qp
+	xor	R32(%rbp), R32(%rbp)
+
+	test	d, d
+	jns	L(unnormalized)
+
+L(normalized):
+	test	un, un
+	je	L(8)			C un == 0
+	mov	-8(up,un,8), %rbp
+	dec	un
+	mov	%rbp, %rax
+	sub	d, %rbp
+	cmovc	%rax, %rbp
+	sbb	R32(%rax), R32(%rax)
+	inc	R32(%rax)
+	mov	%rax, (qp)
+	lea	-8(qp), qp
+L(8):
+IFSTD(`	push	%rdi		')
+IFSTD(`	push	%rsi		')
+	push	%r8
+IFSTD(`	mov	d, %rdi		')
+IFDOS(`	sub	$32, %rsp	')
+IFDOS(`	mov	d, %rcx		')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFDOS(`	add	$32, %rsp	')
+	pop	%r8
+IFSTD(`	pop	%rsi		')
+IFSTD(`	pop	%rdi		')
+
+	mov	%rax, dinv
+	mov	%rbp, %rax
+	jmp	L(nent)
+
+	ALIGN(16)
+L(ntop):mov	(up,un,8), %r10		C	    K8-K10  P6-CNR P6-NHM  P4
+	mul	dinv			C	      0,13   0,20   0,18   0,45
+	add	%r10, %rax		C	      4      8      3     12
+	adc	%rbp, %rdx		C	      5      9     10     13
+	mov	%rax, %rbp		C	      5      9      4     13
+	mov	%rdx, %r13		C	      6     11     12     23
+	imul	d, %rdx			C	      6     11     11     23
+	sub	%rdx, %r10		C	     10     16     14     33
+	mov	d, %rax			C
+	add	%r10, %rax		C	     11     17     15     34
+	cmp	%rbp, %r10		C	     11     17     15     34
+	cmovc	%r10, %rax		C	     12     18     16     35
+	adc	$-1, %r13		C
+	cmp	d, %rax			C
+	jae	L(nfx)			C
+L(nok):	mov	%r13, (qp)		C
+	sub	$8, qp			C
+L(nent):lea	1(%rax), %rbp		C
+	dec	un			C
+	jns	L(ntop)			C
+
+	xor	R32(%rcx), R32(%rcx)
+	jmp	L(frac)
+
+L(nfx):	sub	d, %rax
+	inc	%r13
+	jmp	L(nok)
+
+L(unnormalized):
+	test	un, un
+	je	L(44)
+	mov	-8(up,un,8), %rax
+	cmp	d, %rax
+	jae	L(44)
+	mov	%rbp, (qp)
+	mov	%rax, %rbp
+	lea	-8(qp), qp
+	je	L(ret)
+	dec	un
+L(44):
+	bsr	d, %rcx
+	not	R32(%rcx)
+	shl	R8(%rcx), d
+	shl	R8(%rcx), %rbp
+
+	push	%rcx
+IFSTD(`	push	%rdi		')
+IFSTD(`	push	%rsi		')
+	push	%r8
+IFSTD(`	sub	$8, %rsp	')
+IFSTD(`	mov	d, %rdi		')
+IFDOS(`	sub	$40, %rsp	')
+IFDOS(`	mov	d, %rcx		')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFSTD(`	add	$8, %rsp	')
+IFDOS(`	add	$40, %rsp	')
+	pop	%r8
+IFSTD(`	pop	%rsi		')
+IFSTD(`	pop	%rdi		')
+	pop	%rcx
+
+	mov	%rax, dinv
+	mov	%rbp, %rax
+	test	un, un
+	je	L(frac)
+
+L(uent):dec	un
+	mov	(up,un,8), %rbp
+	neg	R32(%rcx)
+	shr	R8(%rcx), %rbp
+	neg	R32(%rcx)
+	or	%rbp, %rax
+	jmp	L(ent)
+
+	ALIGN(16)
+L(utop):mov	(up,un,8), %r10
+	shl	R8(%rcx), %rbp
+	neg	R32(%rcx)
+	shr	R8(%rcx), %r10
+	neg	R32(%rcx)
+	or	%r10, %rbp
+	mul	dinv
+	add	%rbp, %rax
+	adc	%r11, %rdx
+	mov	%rax, %r11
+	mov	%rdx, %r13
+	imul	d, %rdx
+	sub	%rdx, %rbp
+	mov	d, %rax
+	add	%rbp, %rax
+	cmp	%r11, %rbp
+	cmovc	%rbp, %rax
+	adc	$-1, %r13
+	cmp	d, %rax
+	jae	L(ufx)
+L(uok):	mov	%r13, (qp)
+	sub	$8, qp
+L(ent):	mov	(up,un,8), %rbp
+	dec	un
+	lea	1(%rax), %r11
+	jns	L(utop)
+
+L(uend):shl	R8(%rcx), %rbp
+	mul	dinv
+	add	%rbp, %rax
+	adc	%r11, %rdx
+	mov	%rax, %r11
+	mov	%rdx, %r13
+	imul	d, %rdx
+	sub	%rdx, %rbp
+	mov	d, %rax
+	add	%rbp, %rax
+	cmp	%r11, %rbp
+	cmovc	%rbp, %rax
+	adc	$-1, %r13
+	cmp	d, %rax
+	jae	L(efx)
+L(eok):	mov	%r13, (qp)
+	sub	$8, qp
+	jmp	L(frac)
+
+L(ufx):	sub	d, %rax
+	inc	%r13
+	jmp	L(uok)
+L(efx):	sub	d, %rax
+	inc	%r13
+	jmp	L(eok)
+
+L(frac):mov	d, %rbp
+	neg	%rbp
+	jmp	L(fent)
+
+	ALIGN(16)			C	    K8-K10  P6-CNR P6-NHM  P4
+L(ftop):mul	dinv			C	      0,12   0,17   0,17
+	add	%r11, %rdx		C	      5      8     10
+	mov	%rax, %r11		C	      4      8      3
+	mov	%rdx, %r13		C	      6      9     11
+	imul	%rbp, %rdx		C	      6      9     11
+	mov	d, %rax			C
+	add	%rdx, %rax		C	     10     14     14
+	cmp	%r11, %rdx		C	     10     14     14
+	cmovc	%rdx, %rax		C	     11     15     15
+	adc	$-1, %r13		C
+	mov	%r13, (qp)		C
+	sub	$8, qp			C
+L(fent):lea	1(%rax), %r11		C
+	dec	fn			C
+	jns	L(ftop)			C
+
+	shr	R8(%rcx), %rax
+L(ret):	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	FUNC_EXIT()
+	ret
+EPILOGUE()
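
As the two entry points indicate, the routine develops nn integer quotient limbs and then fn further fraction limbs, radix-2^64 digits of remainder/d, which is what the L(frac) loop produces.  A reference for those semantics with hardware division standing in for the multiply-by-inverse steps (a sketch assuming unsigned __int128, not the actual implementation):

#include <stdint.h>

typedef uint64_t limb;

/* Quotient limbs at qp[fn..fn+nn-1], fraction limbs at qp[0..fn-1],
   remainder returned.  */
static limb
ref_divrem_1 (limb *qp, long fn, const limb *np, long nn, limb d)
{
  limb r = 0;
  for (long i = nn - 1; i >= 0; i--)    /* integer part, high to low */
    {
      unsigned __int128 u = ((unsigned __int128) r << 64) | np[i];
      qp[fn + i] = (limb) (u / d);      /* fits in a limb since r < d */
      r = (limb) (u % d);
    }
  for (long i = fn - 1; i >= 0; i--)    /* fraction development */
    {
      unsigned __int128 u = (unsigned __int128) r << 64;
      qp[i] = (limb) (u / d);
      r = (limb) (u % d);
    }
  return r;
}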
diff --git a/third_party/gmp/mpn/x86_64/divrem_2.asm b/third_party/gmp/mpn/x86_64/divrem_2.asm
new file mode 100644
index 0000000..20811cc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/divrem_2.asm
@@ -0,0 +1,192 @@
+dnl  x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb	best
+C AMD K8,K9	18
+C AMD K10	18
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4	68
+C Intel core	34
+C Intel NHM	30.25
+C Intel SBR	21.3
+C Intel IBR	21.4
+C Intel HWL	20.6
+C Intel BWL
+C Intel atom	73
+C VIA nano	33
+
+
+C INPUT PARAMETERS
+define(`qp',		`%rdi')
+define(`fn',		`%rsi')
+define(`up_param',	`%rdx')
+define(`un_param',	`%rcx')
+define(`dp',		`%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_divrem_2)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	lea	-24(%rdx,%rcx,8), %r12	C r12 = &up[un-1]
+	mov	%rsi, %r13
+	push	%rbp
+	mov	%rdi, %rbp
+	push	%rbx
+	mov	8(%r8), %r11		C d1
+	mov	16(%r12), %rbx
+	mov	(%r8), %r8		C d0
+	mov	8(%r12), %r10
+
+	xor	R32(%r15), R32(%r15)
+	cmp	%rbx, %r11
+	ja	L(2)
+	setb	%dl
+	cmp	%r10, %r8
+	setbe	%al
+	orb	%al, %dl		C "orb" form to placate Sun tools
+	je	L(2)
+	inc	R32(%r15)
+	sub	%r8, %r10
+	sbb	%r11, %rbx
+L(2):
+	lea	-3(%rcx,%r13), %r14	C un + fn - 3
+	test	%r14, %r14
+	js	L(end)
+
+	push	%r8
+	push	%r10
+	push	%r11
+IFSTD(`	mov	%r11, %rdi	')
+IFDOS(`	mov	%r11, %rcx	')
+IFDOS(`	sub	$32, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFDOS(`	add	$32, %rsp	')
+	pop	%r11
+	pop	%r10
+	pop	%r8
+
+	mov	%r11, %rdx
+	mov	%rax, %rdi
+	imul	%rax, %rdx
+	mov	%rdx, %r9
+	mul	%r8
+	xor	R32(%rcx), R32(%rcx)
+	add	%r8, %r9
+	adc	$-1, %rcx
+	add	%rdx, %r9
+	adc	$0, %rcx
+	js	2f
+1:	dec	%rdi
+	sub	%r11, %r9
+	sbb	$0, %rcx
+	jns	1b
+2:
+
+	lea	(%rbp,%r14,8), %rbp
+	mov	%r11, %rsi
+	neg	%rsi			C -d1
+
+C rax rbx rcx rdx rsi rdi  rbp r8 r9 r10 r11 r12 r13 r14 r15
+C     n2  un      -d1 dinv qp  d0 q0     d1  up  fn      msl
+
+	ALIGN(16)
+L(top):	mov	%rdi, %rax		C di		ncp
+	mul	%rbx			C		0, 17
+	mov	%r10, %rcx		C
+	add	%rax, %rcx		C		4
+	adc	%rbx, %rdx		C		5
+	mov	%rdx, %r9		C q		6
+	imul	%rsi, %rdx		C		6
+	mov	%r8, %rax		C		ncp
+	lea	(%rdx, %r10), %rbx	C n1 -= ...	10
+	xor	R32(%r10), R32(%r10)	C
+	mul	%r9			C		7
+	cmp	%r14, %r13		C
+	jg	L(19)			C
+	mov	(%r12), %r10		C
+	sub	$8, %r12		C
+L(19):	sub	%r8, %r10		C		ncp
+	sbb	%r11, %rbx		C		11
+	sub	%rax, %r10		C		11
+	sbb	%rdx, %rbx		C		12
+	xor	R32(%rax), R32(%rax)	C
+	xor	R32(%rdx), R32(%rdx)	C
+	cmp	%rcx, %rbx		C		13
+	cmovnc	%r8, %rax		C		14
+	cmovnc	%r11, %rdx		C		14
+	adc	$0, %r9			C adjust q	14
+	nop
+	add	%rax, %r10		C		15
+	adc	%rdx, %rbx		C		16
+	cmp	%r11, %rbx		C
+	jae	L(fix)			C
+L(bck):	mov	%r9, (%rbp)		C
+	sub	$8, %rbp		C
+	dec	%r14
+	jns	L(top)
+
+L(end):	mov	%r10, 8(%r12)
+	mov	%rbx, 16(%r12)
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	mov	%r15, %rax
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(fix):	seta	%dl
+	cmp	%r8, %r10
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck)
+	inc	%r9
+	sub	%r8, %r10
+	sbb	%r11, %rbx
+	jmp	L(bck)
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/dos64.m4 b/third_party/gmp/mpn/x86_64/dos64.m4
new file mode 100644
index 0000000..0da1b36
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/dos64.m4
@@ -0,0 +1,101 @@
+divert(-1)
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+define(`HOST_DOS64')
+
+
+dnl  On DOS64 we always generate position-independent code.
+dnl
+
+define(`PIC')
+
+
+define(`LEA',`
+	lea	$1(%rip), $2
+')
+
+
+dnl  Usage: CALL(funcname)
+dnl
+dnl  Simply override the definition in x86_64-defs.m4.
+
+define(`CALL',`call	GSYM_PREFIX`'$1')
+define(`TCALL',`jmp	GSYM_PREFIX`'$1')
+
+
+dnl  Usage: JUMPTABSECT
+
+define(`JUMPTABSECT', `RODATA')
+
+
+dnl  Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT', `.long	$1-$2')
+
+
+dnl  Usage: FUNC_ENTRY(nregparams)
+dnl  Usage: FUNC_EXIT()
+
+dnl  FUNC_ENTRY and FUNC_EXIT provide an easy path for adapting standard-ABI
+dnl  assembly to the DOS64 ABI.
+
+define(`FUNC_ENTRY',
+	`push	%rdi
+	push	%rsi
+	mov	%rcx, %rdi
+ifelse(eval($1>=2),1,`dnl
+	mov	%rdx, %rsi
+ifelse(eval($1>=3),1,`dnl
+	mov	%r8, %rdx
+ifelse(eval($1>=4),1,`dnl
+	mov	%r9, %rcx
+')')')')
+
+define(`FUNC_EXIT',
+	`pop	%rsi
+	pop	%rdi')
+
+
+dnl  Target ABI macros.  For DOS64 we override the defaults.
+
+define(`IFDOS',   `$1')
+define(`IFSTD',   `')
+define(`IFELF',   `')
+
+
+dnl  Usage: PROTECT(symbol)
+dnl
+dnl  Used for private GMP symbols that should never be overridden by users.
+dnl  This can save reloc entries and improve shlib sharing as well as
+dnl  application startup times.
+
+define(`PROTECT',  `')
+
+
+divert`'dnl
diff --git a/third_party/gmp/mpn/x86_64/fastavx/copyd.asm b/third_party/gmp/mpn/x86_64/fastavx/copyd.asm
new file mode 100644
index 0000000..56d472f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastavx/copyd.asm
@@ -0,0 +1,172 @@
+dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.
+
+dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bull	n/a
+C AMD pile	 4.87		 4.87				N
+C AMD steam	 ?		 ?
+C AMD bobcat	n/a
+C AMD jaguar	n/a
+C Intel P4	n/a
+C Intel core	n/a
+C Intel NHM	n/a
+C Intel SBR	 0.50		 0.91				N
+C Intel IBR	 0.50		 0.65				N
+C Intel HWL	 0.25		 0.30				Y
+C Intel BWL	 0.28		 0.37				Y
+C Intel atom	n/a
+C VIA nano	n/a
+
+C We try to do as many 32-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyd)
+	FUNC_ENTRY(3)
+
+	lea	-32(rp,n,8), rp
+	lea	-32(up,n,8), up
+
+	cmp	$7, n			C basecase needed for correctness
+	jbe	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(a2)			C jump if rp aligned
+	mov	24(up), %rax
+	lea	-8(up), up
+	mov	%rax, 24(rp)
+	lea	-8(rp), rp
+	dec	n
+L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
+	jz	L(a3)			C jump if rp aligned
+	vmovdqu	16(up), %xmm0
+	lea	-16(up), up
+	vmovdqa	%xmm0, 16(rp)
+	lea	-16(rp), rp
+	sub	$2, n
+L(a3):	sub	$16, n
+	jc	L(sma)
+
+	ALIGN(16)
+L(top):	vmovdqu	(up), %ymm0
+	vmovdqu	-32(up), %ymm1
+	vmovdqu	-64(up), %ymm2
+	vmovdqu	-96(up), %ymm3
+	lea	-128(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, -32(rp)
+	vmovdqa	%ymm2, -64(rp)
+	vmovdqa	%ymm3, -96(rp)
+	lea	-128(rp), rp
+L(ali):	sub	$16, n
+	jnc	L(top)
+
+L(sma):	test	$8, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	vmovdqu	-32(up), %ymm1
+	lea	-64(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, -32(rp)
+	lea	-64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	lea	-32(up), up
+	vmovdqa	%ymm0, (rp)
+	lea	-32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	vmovdqu	16(up), %xmm0
+	lea	-16(up), up
+	vmovdqa	%xmm0, 16(rp)
+	lea	-16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	24(up), %r8
+	mov	%r8, 24(rp)
+1:
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(bc):	test	$4, R8(n)
+	jz	1f
+	mov	24(up), %rax
+	mov	16(up), %rcx
+	mov	8(up), %r8
+	mov	(up), %r9
+	lea	-32(up), up
+	mov	%rax, 24(rp)
+	mov	%rcx, 16(rp)
+	mov	%r8, 8(rp)
+	mov	%r9, (rp)
+	lea	-32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	mov	24(up), %rax
+	mov	16(up), %rcx
+	lea	-16(up), up
+	mov	%rax, 24(rp)
+	mov	%rcx, 16(rp)
+	lea	-16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	24(up), %rax
+	mov	%rax, 24(rp)
+1:
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastavx/copyi.asm b/third_party/gmp/mpn/x86_64/fastavx/copyi.asm
new file mode 100644
index 0000000..7607747
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastavx/copyi.asm
@@ -0,0 +1,169 @@
+dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.
+
+dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bull	n/a
+C AMD pile	 4.87		 4.87				N
+C AMD steam	 ?		 ?
+C AMD bobcat	n/a
+C AMD jaguar	n/a
+C Intel P4	n/a
+C Intel core	n/a
+C Intel NHM	n/a
+C Intel SBR	 0.50		 0.91				N
+C Intel IBR	 0.50		 0.65				N
+C Intel HWL	 0.25		 0.30				Y
+C Intel BWL	 0.28		 0.37				Y
+C Intel atom	n/a
+C VIA nano	n/a
+
+C We try to do as many 32-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyi)
+	FUNC_ENTRY(3)
+
+	cmp	$7, n
+	jbe	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(a2)			C jump if rp aligned
+	mov	(up), %rax
+	lea	8(up), up
+	mov	%rax, (rp)
+	lea	8(rp), rp
+	dec	n
+L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
+	jz	L(a3)			C jump if rp aligned
+	vmovdqu	(up), %xmm0
+	lea	16(up), up
+	vmovdqa	%xmm0, (rp)
+	lea	16(rp), rp
+	sub	$2, n
+L(a3):	sub	$16, n
+	jc	L(sma)
+
+	ALIGN(16)
+L(top):	vmovdqu	(up), %ymm0
+	vmovdqu	32(up), %ymm1
+	vmovdqu	64(up), %ymm2
+	vmovdqu	96(up), %ymm3
+	lea	128(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, 32(rp)
+	vmovdqa	%ymm2, 64(rp)
+	vmovdqa	%ymm3, 96(rp)
+	lea	128(rp), rp
+L(ali):	sub	$16, n
+	jnc	L(top)
+
+L(sma):	test	$8, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	vmovdqu	32(up), %ymm1
+	lea	64(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, 32(rp)
+	lea	64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	lea	32(up), up
+	vmovdqa	%ymm0, (rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	vmovdqu	(up), %xmm0
+	lea	16(up), up
+	vmovdqa	%xmm0, (rp)
+	lea	16(rp), rp
+1:
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+1:
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(bc):	test	$4, R8(n)
+	jz	1f
+	mov	(up), %rax
+	mov	8(up), %rcx
+	mov	16(up), %r8
+	mov	24(up), %r9
+	lea	32(up), up
+	mov	%rax, (rp)
+	mov	%rcx, 8(rp)
+	mov	%r8, 16(rp)
+	mov	%r9, 24(rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	mov	(up), %rax
+	mov	8(up), %rcx
+	lea	16(up), up
+	mov	%rax, (rp)
+	mov	%rcx, 8(rp)
+	lea	16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	(up), %rax
+	mov	%rax, (rp)
+1:
+	FUNC_EXIT()
+	ret
+EPILOGUE()
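
Both fastavx copy routines follow the plan their header comments describe: spend at most one 8-byte and one 16-byte move getting the destination 32-byte aligned, then do the bulk work with unaligned loads and aligned stores.  A compiler-intrinsics rendering of the forward (copyi) case, simplified to align one limb at a time and move one 32-byte vector per iteration where the asm moves four (needs -mavx; a sketch, not the tuned tail handling above):

#include <immintrin.h>
#include <stdint.h>

static void
copyi_avx_sketch (uint64_t *rp, const uint64_t *up, long n)
{
  /* Limb moves until the destination is 32-byte aligned.  */
  while (n > 0 && ((uintptr_t) rp & 31) != 0)
    {
      *rp++ = *up++;
      n--;
    }
  /* Bulk: unaligned 32-byte loads, aligned 32-byte stores.  */
  for (; n >= 4; n -= 4, up += 4, rp += 4)
    {
      __m256i t = _mm256_loadu_si256 ((const __m256i *) up);
      _mm256_store_si256 ((__m256i *) rp, t);
    }
  while (n-- > 0)                       /* trailing limbs */
    *rp++ = *up++;
}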
diff --git a/third_party/gmp/mpn/x86_64/fastsse/README b/third_party/gmp/mpn/x86_64/fastsse/README
new file mode 100644
index 0000000..5538b2d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/README
@@ -0,0 +1,22 @@
+This directory contains code for x86-64 processors with fast
+implementations of SSE operations, hence the name "fastsse".
+
+Current processors that might benefit from this code are:
+
+  AMD K10
+  AMD Bulldozer/Piledriver/Steamroller/Excavator
+  Intel Nocona
+  Intel Nehalem/Westmere
+  Intel Sandybridge/Ivybridge
+  Intel Haswell/Broadwell
+  VIA Nano
+
+Current processors that do not benefit from this code are:
+
+  AMD K8
+  AMD Bobcat
+  Intel Atom
+
+Intel Conroe/Penryn is a border case; its handling of non-aligned
+128-bit memory operands is poor.  VIA Nano also has poor handling of
+non-aligned operands.
diff --git a/third_party/gmp/mpn/x86_64/fastsse/com-palignr.asm b/third_party/gmp/mpn/x86_64/fastsse/com-palignr.asm
new file mode 100644
index 0000000..69027bc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/com-palignr.asm
@@ -0,0 +1,311 @@
+dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9	 2.0		 illop		1.0/1.0		N
+C AMD K10	 0.85		 illop				Y/N
+C AMD bd1	 1.39		 ? 1.45				Y/N
+C AMD bd2     0.8-1.4	       0.7-1.4				Y
+C AMD bd3
+C AMD bd4
+C AMD bobcat	 1.97		 ? 8.17		1.5/1.5		N
+C AMD jaguar	 1.02		 1.02		0.91/0.91	N
+C Intel P4	 2.26		 illop				Y/N
+C Intel core	 0.58		 0.87		opt/0.74	Y
+C Intel NHM	 0.64		 1.14		opt/bad		Y
+C Intel SBR	 0.51		 0.65		opt/opt		Y
+C Intel IBR	 0.50		 0.64		opt/0.57	Y
+C Intel HWL	 0.51		 0.58		opt/opt		Y
+C Intel BWL	 0.52		 0.64		opt/opt		Y
+C Intel SKL	 0.51		 0.63		opt/opt		Y
+C Intel atom	 1.16		 1.70		opt/opt		Y
+C Intel SLM	 1.02		 1.52				N
+C VIA nano	 1.09		 1.10		opt/opt		Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
+C instruction is better adapted to mpn_copyd's needs, so we need to contort the
+C code to use it here.
+C
+C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
+C from the x86_64 default code.
+
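For orientation, mpn_com(rp, up, n) stores the one's complement of each of the
n limbs at up into rp.  A minimal C reference loop for those semantics (a
sketch under a hypothetical name; the assembly below is the optimised
implementation):

    #include <gmp.h>

    /* Reference semantics of mpn_com: complement each limb. */
    static void
    ref_com (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      for (mp_size_t i = 0; i < n; i++)
        rp[i] = ~up[i];
    }
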
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity.  We use
+C movaps, since it has the shortest coding.
+define(`movdqa', ``movaps'')
+
+ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_com)
+	FUNC_ENTRY(3)
+
+	cmp	$COM_SSE_THRESHOLD, n
+	jbe	L(bc)
+
+	pcmpeqb	%xmm5, %xmm5		C set to 111...111
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(rp_aligned)		C jump if rp aligned
+
+	mov	(up), %r8
+	lea	8(up), up
+	not	%r8
+	mov	%r8, (rp)
+	lea	8(rp), rp
+	dec	n
+
+L(rp_aligned):
+	test	$8, R8(up)
+	jnz	L(uent)
+
+ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
+`	sub	$8, n',
+`	jmp	L(am)')
+
+	ALIGN(16)
+L(atop):movdqa	0(up), %xmm0
+	movdqa	16(up), %xmm1
+	movdqa	32(up), %xmm2
+	movdqa	48(up), %xmm3
+	lea	64(up), up
+	pxor	%xmm5, %xmm0
+	pxor	%xmm5, %xmm1
+	pxor	%xmm5, %xmm2
+	pxor	%xmm5, %xmm3
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	lea	64(rp), rp
+L(am):	sub	$8, n
+	jnc	L(atop)
+
+	test	$4, R8(n)
+	jz	1f
+	movdqa	(up), %xmm0
+	movdqa	16(up), %xmm1
+	lea	32(up), up
+	pxor	%xmm5, %xmm0
+	pxor	%xmm5, %xmm1
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	lea	32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	(up), %xmm0
+	lea	16(up), up
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	not	%r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+C FIXME: The code below only handles overlap if it is close to complete, or
+C quite separate: up-rp < 5 or up-rp > 15 limbs
+	lea	-40(up), %rax		C 40 = 5 * GMP_LIMB_BYTES
+	sub	rp, %rax
+	cmp	$80, %rax		C 80 = (15-5) * GMP_LIMB_BYTES
+	jbe	L(bc)			C deflect to plain loop
+
+	sub	$16, n
+	jc	L(uend)
+
+	movdqa	120(up), %xmm3
+
+	sub	$16, n
+	jmp	L(um)
+
+	ALIGN(16)
+L(utop):movdqa	120(up), %xmm3
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, -128(rp)
+	sub	$16, n
+L(um):	movdqa	104(up), %xmm2
+	palignr($8, %xmm2, %xmm3)
+	movdqa	88(up), %xmm1
+	pxor	%xmm5, %xmm3
+	movdqa	%xmm3, 112(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	72(up), %xmm0
+	pxor	%xmm5, %xmm2
+	movdqa	%xmm2, 96(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	56(up), %xmm3
+	pxor	%xmm5, %xmm1
+	movdqa	%xmm1, 80(rp)
+	palignr($8, %xmm3, %xmm0)
+	movdqa	40(up), %xmm2
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, 64(rp)
+	palignr($8, %xmm2, %xmm3)
+	movdqa	24(up), %xmm1
+	pxor	%xmm5, %xmm3
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	8(up), %xmm0
+	pxor	%xmm5, %xmm2
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	-8(up), %xmm3
+	pxor	%xmm5, %xmm1
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm3, %xmm0)
+	lea	128(up), up
+	lea	128(rp), rp
+	jnc	L(utop)
+
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, -128(rp)
+
+L(uend):test	$8, R8(n)
+	jz	1f
+	movdqa	56(up), %xmm3
+	movdqa	40(up), %xmm2
+	palignr($8, %xmm2, %xmm3)
+	movdqa	24(up), %xmm1
+	pxor	%xmm5, %xmm3
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	8(up), %xmm0
+	pxor	%xmm5, %xmm2
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	-8(up), %xmm3
+	pxor	%xmm5, %xmm1
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm3, %xmm0)
+	lea	64(up), up
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, (rp)
+	lea	64(rp), rp
+
+1:	test	$4, R8(n)
+	jz	1f
+	movdqa	24(up), %xmm1
+	movdqa	8(up), %xmm0
+	palignr($8, %xmm0, %xmm1)
+	movdqa	-8(up), %xmm3
+	pxor	%xmm5, %xmm1
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm3, %xmm0)
+	lea	32(up), up
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, (rp)
+	lea	32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	8(up), %xmm0
+	movdqa	-8(up), %xmm3
+	palignr($8, %xmm3, %xmm0)
+	lea	16(up), up
+	pxor	%xmm5, %xmm0
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	not	%r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+C Basecase code.  Needed for good speed on small operands, not for
+C correctness as the above code is currently written.
+
+L(bc):	lea	-8(rp), rp
+	sub	$4, R32(n)
+	jc	L(end)
+
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+`	ALIGN(16)')
+L(top):	mov	(up), %r8
+	mov	8(up), %r9
+	lea	32(rp), rp
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	not	%r8
+	not	%r9
+	not	%r10
+	not	%r11
+	mov	%r8, -24(rp)
+	mov	%r9, -16(rp)
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+`	sub	$4, R32(n)')
+	mov	%r10, -8(rp)
+	mov	%r11, (rp)
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+`	jnc	L(top)')
+
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	not	%r8
+	mov	%r8, 8(rp)
+	lea	8(rp), rp
+	lea	8(up), up
+1:	test	$2, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	8(up), %r9
+	not	%r8
+	not	%r9
+	mov	%r8, 8(rp)
+	mov	%r9, 16(rp)
+1:	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/com.asm b/third_party/gmp/mpn/x86_64/fastsse/com.asm
new file mode 100644
index 0000000..c867222
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/com.asm
@@ -0,0 +1,175 @@
+dnl  AMD64 mpn_com optimised for CPUs with fast SSE.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl  Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9	 2.0		 2.0				N
+C AMD K10	 0.85		 1.3				Y/N
+C AMD bull	 1.40		 1.40				Y
+C AMD pile     0.9-1.4	       0.9-1.4				Y
+C AMD steam
+C AMD excavator
+C AMD bobcat	 3.1		 3.1				N
+C AMD jaguar	 0.91		 0.91		opt/opt		Y
+C Intel P4	 2.28		 illop				Y
+C Intel core2	 1.02		 1.02				N
+C Intel NHM	 0.53		 0.68				Y
+C Intel SBR	 0.51		 0.75		opt/0.65	Y/N
+C Intel IBR	 0.50		 0.57		opt/opt		Y
+C Intel HWL	 0.51		 0.64		opt/0.58	Y
+C Intel BWL	 0.61		 0.65		0.57/opt	Y
+C Intel atom	 3.68		 3.68				N
+C Intel SLM	 1.09		 1.35				N
+C VIA nano	 1.17		 5.09				Y/N
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU.  This seems to work great except for core2, where performance
+C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
+C best handle the unaligned case there.
+
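The loop structure just described can be sketched with SSE2 intrinsics:
unaligned 16-byte loads (movdqu), xor against an all-ones register (pxor),
aligned 16-byte stores (movdqa).  A hedged sketch under a hypothetical name,
assuming rp is already 16-byte aligned and n is even, i.e. the edge handling
done by the surrounding code has already happened:

    #include <gmp.h>
    #include <emmintrin.h>                      /* SSE2 */

    /* Core of the L(top) loop below, without the 7-way unrolling. */
    static void
    com_loop_sketch (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      __m128i ones = _mm_set1_epi32 (-1);       /* like pcmpeqb x,x */
      for (mp_size_t i = 0; i < n; i += 2)
        {
          __m128i x = _mm_loadu_si128 ((const __m128i *) (up + i));
          _mm_store_si128 ((__m128i *) (rp + i), _mm_xor_si128 (x, ones));
        }
    }
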
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_com)
+	FUNC_ENTRY(3)
+
+IFDOS(`	add	$-56, %rsp	')
+IFDOS(`	movdqa	%xmm6, (%rsp)	')
+IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
+
+	pcmpeqb	%xmm7, %xmm7		C set to 111...111
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(ali)			C jump if rp aligned
+	mov	(up), %rax
+	lea	8(up), up
+	not	%rax
+	mov	%rax, (rp)
+	lea	8(rp), rp
+	dec	n
+
+	sub	$14, n
+	jc	L(sma)
+
+	ALIGN(16)
+L(top):	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	movdqu	32(up), %xmm2
+	movdqu	48(up), %xmm3
+	movdqu	64(up), %xmm4
+	movdqu	80(up), %xmm5
+	movdqu	96(up), %xmm6
+	lea	112(up), up
+	pxor	%xmm7, %xmm0
+	pxor	%xmm7, %xmm1
+	pxor	%xmm7, %xmm2
+	pxor	%xmm7, %xmm3
+	pxor	%xmm7, %xmm4
+	pxor	%xmm7, %xmm5
+	pxor	%xmm7, %xmm6
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	movdqa	%xmm4, 64(rp)
+	movdqa	%xmm5, 80(rp)
+	movdqa	%xmm6, 96(rp)
+	lea	112(rp), rp
+L(ali):	sub	$14, n
+	jnc	L(top)
+
+L(sma):	add	$14, n
+	test	$8, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	movdqu	32(up), %xmm2
+	movdqu	48(up), %xmm3
+	lea	64(up), up
+	pxor	%xmm7, %xmm0
+	pxor	%xmm7, %xmm1
+	pxor	%xmm7, %xmm2
+	pxor	%xmm7, %xmm3
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	lea	64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	lea	32(up), up
+	pxor	%xmm7, %xmm0
+	pxor	%xmm7, %xmm1
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	lea	16(up), up
+	pxor	%xmm7, %xmm0
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	(up), %rax
+	not	%rax
+	mov	%rax, (rp)
+1:
+L(don):
+IFDOS(`	movdqa	(%rsp), %xmm6	')
+IFDOS(`	movdqa	16(%rsp), %xmm7	')
+IFDOS(`	add	$56, %rsp	')
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/copyd-palignr.asm b/third_party/gmp/mpn/x86_64/fastsse/copyd-palignr.asm
new file mode 100644
index 0000000..fac6f8a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/copyd-palignr.asm
@@ -0,0 +1,254 @@
+dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl  Copyright 2012, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9	 2.0		 illop		1.0/1.0		N
+C AMD K10	 0.85		 illop				Y/N
+C AMD bull	 0.70		 0.70				Y
+C AMD pile	 0.68		 0.68				Y
+C AMD steam
+C AMD excavator
+C AMD bobcat	 1.97		 8.24		1.5/1.5		N
+C AMD jaguar	 0.77		 0.89		0.65/opt	N/Y
+C Intel P4	 2.26		 illop				Y/N
+C Intel core	 0.52		 0.80		opt/opt		Y
+C Intel NHM	 0.52		 0.64		opt/opt		Y
+C Intel SBR	 0.51		 0.51		opt/opt		Y
+C Intel IBR	 0.50		 0.50		opt/opt		Y
+C Intel HWL	 0.50		 0.51		opt/opt		Y
+C Intel BWL	 0.55		 0.55		opt/opt		Y
+C Intel atom	 1.16		 1.66		opt/opt		Y
+C Intel SLM	 1.02		 1.04		opt/opt		Y
+C VIA nano	 1.08		 1.06		opt/opt		Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
+C
+C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
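For orientation, mpn_copyd copies n limbs from up to rp in decreasing address
order, which is what makes it usable for overlapping operands with rp > up.
A reference loop for the semantics (hypothetical name, sketch only):

    #include <gmp.h>

    /* mpn_copyd semantics: copy high-to-low, so a destination that
       overlaps the source from above is never clobbered too early.
       mp_size_t is signed, so the loop condition is well defined. */
    static void
    ref_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      for (mp_size_t i = n - 1; i >= 0; i--)
        rp[i] = up[i];
    }
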
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity.  We use
+C movaps, since it has the shortest coding.
+define(`movdqa', ``movaps'')
+
+ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_copyd)
+	FUNC_ENTRY(3)
+
+	lea	-8(up,n,8), up
+	lea	-8(rp,n,8), rp
+
+	cmp	$COPYD_SSE_THRESHOLD, n
+	jbe	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jnz	L(rp_aligned)		C jump if rp aligned
+
+	mov	(up), %rax		C copy one limb
+	mov	%rax, (rp)
+	lea	-8(up), up
+	lea	-8(rp), rp
+	dec	n
+
+L(rp_aligned):
+	test	$8, R8(up)
+	jz	L(uent)
+
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+`	sub	$8, n',
+`	jmp	L(am)')
+
+	ALIGN(16)
+L(atop):movdqa	-8(up), %xmm0
+	movdqa	-24(up), %xmm1
+	movdqa	-40(up), %xmm2
+	movdqa	-56(up), %xmm3
+	lea	-64(up), up
+	movdqa	%xmm0, -8(rp)
+	movdqa	%xmm1, -24(rp)
+	movdqa	%xmm2, -40(rp)
+	movdqa	%xmm3, -56(rp)
+	lea	-64(rp), rp
+L(am):	sub	$8, n
+	jnc	L(atop)
+
+	test	$4, R8(n)
+	jz	1f
+	movdqa	-8(up), %xmm0
+	movdqa	-24(up), %xmm1
+	lea	-32(up), up
+	movdqa	%xmm0, -8(rp)
+	movdqa	%xmm1, -24(rp)
+	lea	-32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	-8(up), %xmm0
+	lea	-16(up), up
+	movdqa	%xmm0, -8(rp)
+	lea	-16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+L(uent):sub	$16, n
+	movdqa	(up), %xmm0
+	jc	L(uend)
+
+	ALIGN(16)
+L(utop):sub	$16, n
+	movdqa	-16(up), %xmm1
+	palignr($8, %xmm1, %xmm0)
+	movdqa	%xmm0, -8(rp)
+	movdqa	-32(up), %xmm2
+	palignr($8, %xmm2, %xmm1)
+	movdqa	%xmm1, -24(rp)
+	movdqa	-48(up), %xmm3
+	palignr($8, %xmm3, %xmm2)
+	movdqa	%xmm2, -40(rp)
+	movdqa	-64(up), %xmm0
+	palignr($8, %xmm0, %xmm3)
+	movdqa	%xmm3, -56(rp)
+	movdqa	-80(up), %xmm1
+	palignr($8, %xmm1, %xmm0)
+	movdqa	%xmm0, -72(rp)
+	movdqa	-96(up), %xmm2
+	palignr($8, %xmm2, %xmm1)
+	movdqa	%xmm1, -88(rp)
+	movdqa	-112(up), %xmm3
+	palignr($8, %xmm3, %xmm2)
+	movdqa	%xmm2, -104(rp)
+	movdqa	-128(up), %xmm0
+	palignr($8, %xmm0, %xmm3)
+	movdqa	%xmm3, -120(rp)
+	lea	-128(up), up
+	lea	-128(rp), rp
+	jnc	L(utop)
+
+L(uend):test	$8, R8(n)
+	jz	1f
+	movdqa	-16(up), %xmm1
+	palignr($8, %xmm1, %xmm0)
+	movdqa	%xmm0, -8(rp)
+	movdqa	-32(up), %xmm0
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, -24(rp)
+	movdqa	-48(up), %xmm1
+	palignr($8, %xmm1, %xmm0)
+	movdqa	%xmm0, -40(rp)
+	movdqa	-64(up), %xmm0
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, -56(rp)
+	lea	-64(up), up
+	lea	-64(rp), rp
+
+1:	test	$4, R8(n)
+	jz	1f
+	movdqa	-16(up), %xmm1
+	palignr($8, %xmm1, %xmm0)
+	movdqa	%xmm0, -8(rp)
+	movdqa	-32(up), %xmm0
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, -24(rp)
+	lea	-32(up), up
+	lea	-32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	-16(up), %xmm1
+	palignr($8, %xmm1, %xmm0)
+	movdqa	%xmm0, -8(rp)
+	lea	-16(up), up
+	lea	-16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+C Basecase code.  Needed for good speed on small operands, not for
+C correctness as the above code is currently written.
+
+L(bc):	sub	$4, R32(n)
+	jc	L(end)
+
+	ALIGN(16)
+L(top):	mov	(up), %r8
+	mov	-8(up), %r9
+	lea	-32(rp), rp
+	mov	-16(up), %r10
+	mov	-24(up), %r11
+	lea	-32(up), up
+	mov	%r8, 32(rp)
+	mov	%r9, 24(rp)
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+`	sub	$4, R32(n)')
+	mov	%r10, 16(rp)
+	mov	%r11, 8(rp)
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+`	jnc	L(top)')
+
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+	lea	-8(rp), rp
+	lea	-8(up), up
+1:	test	$2, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	-8(up), %r9
+	mov	%r8, (rp)
+	mov	%r9, -8(rp)
+1:	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/copyd.asm b/third_party/gmp/mpn/x86_64/fastsse/copyd.asm
new file mode 100644
index 0000000..b3c4706
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/copyd.asm
@@ -0,0 +1,166 @@
+dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl  Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9
+C AMD K10	 0.85		 1.64				Y/N
+C AMD bull	 1.4		 1.4				Y
+C AMD pile	 0.68		 0.98				Y/N
+C AMD steam
+C AMD excavator
+C AMD bobcat
+C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
+C Intel P4	 2.3		 2.3				Y
+C Intel core	 1.0		 1.0		0.52/0.80	N
+C Intel NHM	 0.5		 0.67				Y
+C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
+C Intel IBR	 0.50		 0.57		opt/0.50	Y
+C Intel HWL	 0.50		 0.57		opt/0.51	Y
+C Intel BWL	 0.55		 0.62		opt/0.55	Y
+C Intel atom
+C Intel SLM	 1.02		 1.27		opt/1.04	Y/N
+C VIA nano	 1.16		 5.16				Y/N
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU.  This seems to work great except for core2, where performance
+C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`movdqu', lddqu)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_copyd)
+	FUNC_ENTRY(3)
+
+	test	n, n
+	jz	L(don)
+
+	lea	-16(rp,n,8), rp
+	lea	-16(up,n,8), up
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(ali)			C jump if rp aligned
+	mov	8(up), %rax
+	lea	-8(up), up
+	mov	%rax, 8(rp)
+	lea	-8(rp), rp
+	dec	n
+
+L(ali):	sub	$16, n
+	jc	L(sma)
+
+IFDOS(`	add	$-56, %rsp	')
+IFDOS(`	movdqa	%xmm6, (%rsp)	')
+IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
+
+	ALIGN(16)
+L(top):	movdqu	(up), %xmm0
+	movdqu	-16(up), %xmm1
+	movdqu	-32(up), %xmm2
+	movdqu	-48(up), %xmm3
+	movdqu	-64(up), %xmm4
+	movdqu	-80(up), %xmm5
+	movdqu	-96(up), %xmm6
+	movdqu	-112(up), %xmm7
+	lea	-128(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, -16(rp)
+	movdqa	%xmm2, -32(rp)
+	movdqa	%xmm3, -48(rp)
+	movdqa	%xmm4, -64(rp)
+	movdqa	%xmm5, -80(rp)
+	movdqa	%xmm6, -96(rp)
+	movdqa	%xmm7, -112(rp)
+	lea	-128(rp), rp
+	sub	$16, n
+	jnc	L(top)
+
+IFDOS(`	movdqa	(%rsp), %xmm6	')
+IFDOS(`	movdqa	16(%rsp), %xmm7	')
+IFDOS(`	add	$56, %rsp	')
+
+L(sma):	test	$8, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	-16(up), %xmm1
+	movdqu	-32(up), %xmm2
+	movdqu	-48(up), %xmm3
+	lea	-64(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, -16(rp)
+	movdqa	%xmm2, -32(rp)
+	movdqa	%xmm3, -48(rp)
+	lea	-64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	-16(up), %xmm1
+	lea	-32(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, -16(rp)
+	lea	-32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	lea	-16(up), up
+	movdqa	%xmm0, (rp)
+	lea	-16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	8(up), %r8
+	mov	%r8, 8(rp)
+1:
+L(don):	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/copyi-palignr.asm b/third_party/gmp/mpn/x86_64/fastsse/copyi-palignr.asm
new file mode 100644
index 0000000..9876a47
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/copyi-palignr.asm
@@ -0,0 +1,300 @@
+dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9	 2.0		 illop		1.0/1.0		N
+C AMD K10	 0.85		 illop				Y/N
+C AMD bd1	 0.70		 0.66				Y
+C AMD bd2	 0.68		 0.66				Y
+C AMD bd3	 ?		 ?
+C AMD bd4	 ?		 ?
+C AMD bt1	 1.97		 8.16		1.5/1.5		N
+C AMD bt2	 0.77		 0.93		0.65/opt	N/Y
+C AMD zn1	 ?		 ?
+C AMD zn2	 ?		 ?
+C Intel P4	 2.26		 illop				Y/N
+C Intel CNR	 0.52		 0.64		opt/opt		Y
+C Intel NHM	 0.52		 0.71		0.50/0.67	N
+C Intel SBR	 0.51		 0.54		opt/0.51	Y
+C Intel IBR	 0.50		 0.54		opt/opt		Y
+C Intel HWL	 0.50		 0.51		opt/opt		Y
+C Intel BWL	 0.55		 0.55		opt/opt		Y
+C Intel atom	 1.16		 1.61		opt/opt		Y
+C Intel SLM	 1.02		 1.07		opt/opt		Y
+C VIA nano	 1.09		 1.08		opt/opt		Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
+C instruction is better adapted to mpn_copyd's needs; we need to contort the
+C code to use it here.
+C
+C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
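For orientation, mpn_copyi is the increasing-order counterpart of mpn_copyd:
it copies n limbs from up to rp starting at the low end, so it is the variant
to use for overlapping operands with rp < up.  A reference loop (hypothetical
name, sketch only):

    #include <gmp.h>

    /* mpn_copyi semantics: copy low-to-high. */
    static void
    ref_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      for (mp_size_t i = 0; i < n; i++)
        rp[i] = up[i];
    }
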
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity.  We use
+C movaps, since it has the shortest coding.
+dnl define(`movdqa', ``movaps'')
+
+ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_copyi)
+	FUNC_ENTRY(3)
+
+	cmp	$COPYI_SSE_THRESHOLD, n
+	jbe	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(rp_aligned)		C jump if rp aligned
+
+	movsq				C copy one limb
+	dec	n
+
+L(rp_aligned):
+	test	$8, R8(up)
+	jnz	L(uent)
+
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+`	sub	$8, n',
+`	jmp	L(am)')
+
+	ALIGN(16)
+L(atop):movdqa	0(up), %xmm0
+	movdqa	16(up), %xmm1
+	movdqa	32(up), %xmm2
+	movdqa	48(up), %xmm3
+	lea	64(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	lea	64(rp), rp
+L(am):	sub	$8, n
+	jnc	L(atop)
+
+	test	$4, R8(n)
+	jz	1f
+	movdqa	(up), %xmm0
+	movdqa	16(up), %xmm1
+	lea	32(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	lea	32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	(up), %xmm0
+	lea	16(up), up
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+	cmp	$16, n
+	jc	L(ued0)
+
+IFDOS(`	add	$-56, %rsp	')
+IFDOS(`	movdqa	%xmm6, (%rsp)	')
+IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
+IFDOS(`	movdqa	%xmm8, 32(%rsp)	')
+
+	movaps	120(up), %xmm7
+	movaps	104(up), %xmm6
+	movaps	88(up), %xmm5
+	movaps	72(up), %xmm4
+	movaps	56(up), %xmm3
+	movaps	40(up), %xmm2
+	lea	128(up), up
+	sub	$32, n
+	jc	L(ued1)
+
+	ALIGN(16)
+L(utop):movaps	-104(up), %xmm1
+	sub	$16, n
+	movaps	-120(up), %xmm0
+	palignr($8, %xmm6, %xmm7)
+	movaps	-136(up), %xmm8
+	movdqa	%xmm7, 112(rp)
+	palignr($8, %xmm5, %xmm6)
+	movaps	120(up), %xmm7
+	movdqa	%xmm6, 96(rp)
+	palignr($8, %xmm4, %xmm5)
+	movaps	104(up), %xmm6
+	movdqa	%xmm5, 80(rp)
+	palignr($8, %xmm3, %xmm4)
+	movaps	88(up), %xmm5
+	movdqa	%xmm4, 64(rp)
+	palignr($8, %xmm2, %xmm3)
+	movaps	72(up), %xmm4
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movaps	56(up), %xmm3
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movaps	40(up), %xmm2
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm8, %xmm0)
+	lea	128(up), up
+	movdqa	%xmm0, (rp)
+	lea	128(rp), rp
+	jnc	L(utop)
+
+L(ued1):movaps	-104(up), %xmm1
+	movaps	-120(up), %xmm0
+	movaps	-136(up), %xmm8
+	palignr($8, %xmm6, %xmm7)
+	movdqa	%xmm7, 112(rp)
+	palignr($8, %xmm5, %xmm6)
+	movdqa	%xmm6, 96(rp)
+	palignr($8, %xmm4, %xmm5)
+	movdqa	%xmm5, 80(rp)
+	palignr($8, %xmm3, %xmm4)
+	movdqa	%xmm4, 64(rp)
+	palignr($8, %xmm2, %xmm3)
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm8, %xmm0)
+	movdqa	%xmm0, (rp)
+	lea	128(rp), rp
+
+IFDOS(`	movdqa	(%rsp), %xmm6	')
+IFDOS(`	movdqa	16(%rsp), %xmm7	')
+IFDOS(`	movdqa	32(%rsp), %xmm8	')
+IFDOS(`	add	$56, %rsp	')
+
+L(ued0):test	$8, R8(n)
+	jz	1f
+	movaps	56(up), %xmm3
+	movaps	40(up), %xmm2
+	movaps	24(up), %xmm1
+	movaps	8(up), %xmm0
+	movaps	-8(up), %xmm4
+	palignr($8, %xmm2, %xmm3)
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm4, %xmm0)
+	lea	64(up), up
+	movdqa	%xmm0, (rp)
+	lea	64(rp), rp
+
+1:	test	$4, R8(n)
+	jz	1f
+	movaps	24(up), %xmm1
+	movaps	8(up), %xmm0
+	palignr($8, %xmm0, %xmm1)
+	movaps	-8(up), %xmm3
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm3, %xmm0)
+	lea	32(up), up
+	movdqa	%xmm0, (rp)
+	lea	32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	8(up), %xmm0
+	movdqa	-8(up), %xmm3
+	palignr($8, %xmm3, %xmm0)
+	lea	16(up), up
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+C Basecase code.  Needed for good speed on small operands, not for
+C correctness as the above code is currently written.
+
+L(bc):	lea	-8(rp), rp
+	sub	$4, R32(n)
+	jc	L(end)
+
+	ALIGN(16)
+L(top):	mov	(up), %r8
+	mov	8(up), %r9
+	lea	32(rp), rp
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	mov	%r8, -24(rp)
+	mov	%r9, -16(rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+`	sub	$4, R32(n)')
+	mov	%r10, -8(rp)
+	mov	%r11, (rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+`	jnc	L(top)')
+
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, 8(rp)
+	lea	8(rp), rp
+	lea	8(up), up
+1:	test	$2, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	8(up), %r9
+	mov	%r8, 8(rp)
+	mov	%r9, 16(rp)
+1:	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/copyi.asm b/third_party/gmp/mpn/x86_64/fastsse/copyi.asm
new file mode 100644
index 0000000..97f7865
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/copyi.asm
@@ -0,0 +1,185 @@
+dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.
+
+dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl  Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb      good
+C              aligned	      unaligned	      best seen	     for cpu?
+C AMD K8,K9
+C AMD K10	 0.85		 1.64				Y/N
+C AMD bull	 1.4		 1.4				N
+C AMD pile	 0.77		 0.93				N
+C AMD steam	 ?		 ?
+C AMD excavator	 ?		 ?
+C AMD bobcat
+C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
+C Intel P4	 2.3		 2.3				Y
+C Intel core	 1.0		 1.0		0.52/0.64	N
+C Intel NHM	 0.5		 0.67				Y
+C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
+C Intel IBR	 0.50		 0.57		opt/0.54	Y
+C Intel HWL	 0.50		 0.57		opt/0.51	Y
+C Intel BWL	 0.55		 0.62		opt/0.55	Y
+C Intel atom
+C Intel SLM	 1.02		 1.27		opt/1.07	Y/N
+C VIA nano	 1.16		 5.16				Y/N
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU.  This seems to work great except for core2, where performance
+C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`movdqu', lddqu)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_copyi)
+	FUNC_ENTRY(3)
+
+	cmp	$3, n			C NB: bc code below assumes this limit
+	jc	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(ali)			C jump if rp aligned
+	movsq				C copy single limb
+	dec	n
+
+L(ali):	sub	$16, n
+	jc	L(sma)
+
+IFDOS(`	add	$-56, %rsp	')
+IFDOS(`	movdqa	%xmm6, (%rsp)	')
+IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
+
+	ALIGN(16)
+L(top):	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	movdqu	32(up), %xmm2
+	movdqu	48(up), %xmm3
+	movdqu	64(up), %xmm4
+	movdqu	80(up), %xmm5
+	movdqu	96(up), %xmm6
+	movdqu	112(up), %xmm7
+	lea	128(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	movdqa	%xmm4, 64(rp)
+	movdqa	%xmm5, 80(rp)
+	movdqa	%xmm6, 96(rp)
+	movdqa	%xmm7, 112(rp)
+	lea	128(rp), rp
+	sub	$16, n
+	jnc	L(top)
+
+IFDOS(`	movdqa	(%rsp), %xmm6	')
+IFDOS(`	movdqa	16(%rsp), %xmm7	')
+IFDOS(`	add	$56, %rsp	')
+
+L(sma):	test	$8, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	movdqu	32(up), %xmm2
+	movdqu	48(up), %xmm3
+	lea	64(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	lea	64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	movdqu	16(up), %xmm1
+	lea	32(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	movdqu	(up), %xmm0
+	lea	16(up), up
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+	ALIGN(16)
+1:
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+1:
+	FUNC_EXIT()
+	ret
+
+C Basecase code.  Needed for good speed on small operands, not for correctness
+C as the above code is currently written.  The commented-out lines need to be
+C reinstated if this code is to be used for n > 3, and then the post-loop
+C offsets need fixing.
+
+L(bc):	sub	$2, n
+	jc	L(end)
+	ALIGN(16)
+1:	mov	(up), %rax
+	mov	8(up), %rcx
+dnl	lea	16(up), up
+	mov	%rax, (rp)
+	mov	%rcx, 8(rp)
+dnl	lea	16(rp), rp
+dnl	sub	$2, n
+dnl	jnc	1b
+
+	test	$1, R8(n)
+	jz	L(ret)
+	mov	16(up), %rax
+	mov	%rax, 16(rp)
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm b/third_party/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm
new file mode 100644
index 0000000..a05e850
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/lshift-movdqu2.asm
@@ -0,0 +1,182 @@
+dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb     cycles/limb     cycles/limb    good
+C              aligned	      unaligned	      best seen	   for cpu?
+C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
+C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
+C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
+C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
+C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
+C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
+C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
+C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
+C Intel atom	11.7		11.7		 4.5	  no
+C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible.  The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned
+C operands, in order to avoid the need for two separate loops.
+C
+C TODO
+C  * Could 2-limb wind-down code be simplified?
+C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C    for other affected CPUs.
+
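For orientation, mpn_lshift shifts the n-limb operand at ap left by cnt bits
(1 <= cnt < 64) into rp and returns the bits shifted out at the top, in the
least significant bits of the return value; the shr of the top limb by 64-cnt
in the prologue below computes exactly that.  A C reference for the semantics
(hypothetical name, sketch only):

    #include <gmp.h>

    /* mpn_lshift semantics; walks high-to-low, so overlap with
       rp >= ap is safe, as in the assembly in this file. */
    static mp_limb_t
    ref_lshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
    {
      mp_limb_t ret = ap[n - 1] >> (GMP_LIMB_BITS - cnt);
      for (mp_size_t i = n - 1; i > 0; i--)
        rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (GMP_LIMB_BITS - cnt));
      rp[0] = ap[0] << cnt;
      return ret;
    }
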
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshift)
+	FUNC_ENTRY(4)
+	movd	R32(%rcx), %xmm4
+	mov	$64, R32(%rax)
+	sub	R32(%rcx), R32(%rax)
+	movd	R32(%rax), %xmm5
+
+	neg	R32(%rcx)
+	mov	-8(ap,n,8), %rax
+	shr	R8(%rcx), %rax
+
+	cmp	$3, n
+	jle	L(bc)
+
+	lea	(rp,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	jz	L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+	movq	-8(ap,n,8), %xmm0
+	movq	-16(ap,n,8), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movq	%xmm0, -8(rp,n,8)
+	dec	n
+
+L(rp_aligned):
+	lea	1(n), %r8d
+
+	and	$6, R32(%r8)
+	jz	L(ba0)
+	cmp	$4, R32(%r8)
+	jz	L(ba4)
+	jc	L(ba2)
+L(ba6):	add	$-4, n
+	jmp	L(i56)
+L(ba0):	add	$-6, n
+	jmp	L(i70)
+L(ba4):	add	$-2, n
+	jmp	L(i34)
+L(ba2):	add	$-8, n
+	jle	L(end)
+
+	ALIGN(16)
+L(top):	movdqu	40(ap,n,8), %xmm1
+	movdqu	48(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, 48(rp,n,8)
+L(i70):
+	movdqu	24(ap,n,8), %xmm1
+	movdqu	32(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, 32(rp,n,8)
+L(i56):
+	movdqu	8(ap,n,8), %xmm1
+	movdqu	16(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, 16(rp,n,8)
+L(i34):
+	movdqu	-8(ap,n,8), %xmm1
+	movdqu	(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, (rp,n,8)
+	sub	$8, n
+	jg	L(top)
+
+L(end):	test	$1, R8(n)
+	jnz	L(end8)
+
+	movdqu	(ap), %xmm1
+	pxor	%xmm0, %xmm0
+	punpcklqdq  %xmm1, %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+
+C Basecase
+	ALIGN(16)
+L(bc):	dec	R32(n)
+	jz	L(end8)
+
+	movq	(ap,n,8), %xmm1
+	movq	-8(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movq	%xmm0, (rp,n,8)
+	sub	$2, R32(n)
+	jl	L(end8)
+	movq	8(ap), %xmm1
+	movq	(ap), %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movq	%xmm0, 8(rp)
+
+L(end8):movq	(ap), %xmm0
+	psllq	%xmm4, %xmm0
+	movq	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/lshift.asm b/third_party/gmp/mpn/x86_64/fastsse/lshift.asm
new file mode 100644
index 0000000..6a17b93
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/lshift.asm
@@ -0,0 +1,173 @@
+dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE.
+
+dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb	     cycles/limb	      good
+C          16-byte aligned         16-byte unaligned	    for cpu?
+C AMD K8,K9	 ?			 ?
+C AMD K10	 1.68  (1.45)		 1.75  (1.49)		Y
+C AMD bd1	 1.82  (1.75)		 1.82  (1.75)		Y
+C AMD bobcat	 4			 4
+C Intel P4	 3     (2.7)		 3     (2.7)		Y
+C Intel core2	 2.05  (1.67)		 2.55  (1.75)
+C Intel NHM	 2.05  (1.75)		 2.09  (2)
+C Intel SBR	 1.5   (1.3125)		 1.5   (1.4375)		Y
+C Intel atom	 ?			 ?
+C VIA nano	 2.25  (2)		 2.5   (2)		Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.
+
+C There are two inner loops, one for when rp = ap (mod 16) and one for when
+C it is not.  The aligned case reads 16+8 bytes; the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C   (1) The unaligned case makes too many reads.
+C   (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
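The per-iteration combine in both inner loops can be sketched with SSE2
intrinsics: one xmm register holds limbs (i+1, i), another holds limbs
(i, i-1) offset by one limb; the first is shifted left by cnt, the second
right by 64-cnt, and the two are or-ed, mirroring the psllq/psrlq/por triple
with the counts kept in xmm4 and xmm5.  A hedged sketch with a hypothetical
helper name:

    #include <emmintrin.h>                      /* SSE2 */

    /* One combine step: hi = limbs (i+1, i), lo = limbs (i, i-1);
       returns result limbs (i+1, i).  Requires 1 <= cnt < 64. */
    static __m128i
    lshift_combine (__m128i hi, __m128i lo, int cnt)
    {
      __m128i l = _mm_sll_epi64 (hi, _mm_cvtsi32_si128 (cnt));      /* psllq */
      __m128i r = _mm_srl_epi64 (lo, _mm_cvtsi32_si128 (64 - cnt)); /* psrlq */
      return _mm_or_si128 (l, r);                                   /* por   */
    }
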
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshift)
+	FUNC_ENTRY(4)
+	movd	R32(%rcx), %xmm4
+	mov	$64, R32(%rax)
+	sub	R32(%rcx), R32(%rax)
+	movd	R32(%rax), %xmm5
+
+	neg	R32(%rcx)
+	mov	-8(ap,n,8), %rax
+	shr	R8(%rcx), %rax
+
+	cmp	$2, n
+	jle	L(le2)
+
+	lea	(rp,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	je	L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+	movq	-8(ap,n,8), %xmm0
+	movq	-16(ap,n,8), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movq	%xmm0, -8(rp,n,8)
+	dec	n
+
+L(rp_aligned):
+	lea	(ap,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	je	L(aent)
+	jmp	L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+	ALIGN(16)
+L(utop):movdqa	-8(ap,n,8), %xmm0
+	movq	(ap,n,8), %xmm1
+	punpcklqdq  8(ap,n,8), %xmm1
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, (rp,n,8)
+L(uent):sub	$2, n
+	ja	L(utop)
+
+	jne	L(end8)
+
+	movq	(ap), %xmm1
+	pxor	%xmm0, %xmm0
+	punpcklqdq  %xmm1, %xmm0
+	punpcklqdq  8(ap), %xmm1
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+	ALIGN(16)
+L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
+	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
+	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, (rp,n,8)
+L(aent):
+	sub	$2, n
+	ja	L(atop)
+	jne	L(end8)
+
+	movdqa	(ap), %xmm1
+	pxor	%xmm0, %xmm0
+	punpcklqdq  %xmm1, %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+C *****************************************************************************
+
+	ALIGN(16)
+L(le2):	jne	L(end8)
+
+	movq	8(ap), %xmm0
+	movq	(ap), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movq	%xmm0, 8(rp)
+
+L(end8):movq	(ap), %xmm0
+	psllq	%xmm4, %xmm0
+	movq	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/third_party/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
new file mode 100644
index 0000000..8250910
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
@@ -0,0 +1,193 @@
+dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb     cycles/limb     cycles/limb    good
+C              aligned	      unaligned	      best seen	   for cpu?
+C AMD K8,K9	 3		 3		 ?	  no, use shl/shr
+C AMD K10	 1.8-2.0	 1.8-2.0	 ?	  yes
+C AMD bd1	 1.9		 1.9		 ?	  yes
+C AMD bobcat	 3.67		 3.67			  yes, bad for n < 20
+C Intel P4	 4.75		 4.75		 ?	  no, slow movdqu
+C Intel core2	 2.27		 2.27		 ?	  no, use shld/shrd
+C Intel NHM	 2.15		 2.15		 ?	  no, use shld/shrd
+C Intel SBR	 1.45		 1.45		 ?	  yes, bad for n = 4-6
+C Intel atom	12.9		12.9		 ?	  no
+C VIA nano	 6.18		 6.44		 ?	  no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible.  The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned
+C operands, in order to avoid the need for two separate loops.
+C
+C TODO
+C  * Could 2-limb wind-down code be simplified?
+C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C    for other affected CPUs.
+
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+	FUNC_ENTRY(4)
+	movd	R32(%rcx), %xmm4
+	mov	$64, R32(%rax)
+	sub	R32(%rcx), R32(%rax)
+	movd	R32(%rax), %xmm5
+
+	neg	R32(%rcx)
+	mov	-8(ap,n,8), %rax
+	shr	R8(%rcx), %rax
+
+	pcmpeqb	%xmm3, %xmm3		C set to 111...111
+
+	cmp	$3, n
+	jle	L(bc)
+
+	lea	(rp,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	jz	L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+	movq	-8(ap,n,8), %xmm0
+	movq	-16(ap,n,8), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movq	%xmm0, -8(rp,n,8)
+	dec	n
+
+L(rp_aligned):
+	lea	1(n), %r8d
+
+	and	$6, R32(%r8)
+	jz	L(ba0)
+	cmp	$4, R32(%r8)
+	jz	L(ba4)
+	jc	L(ba2)
+L(ba6):	add	$-4, n
+	jmp	L(i56)
+L(ba0):	add	$-6, n
+	jmp	L(i70)
+L(ba4):	add	$-2, n
+	jmp	L(i34)
+L(ba2):	add	$-8, n
+	jle	L(end)
+
+	ALIGN(16)
+L(top):	movdqu	40(ap,n,8), %xmm1
+	movdqu	48(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movdqa	%xmm0, 48(rp,n,8)
+L(i70):
+	movdqu	24(ap,n,8), %xmm1
+	movdqu	32(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movdqa	%xmm0, 32(rp,n,8)
+L(i56):
+	movdqu	8(ap,n,8), %xmm1
+	movdqu	16(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movdqa	%xmm0, 16(rp,n,8)
+L(i34):
+	movdqu	-8(ap,n,8), %xmm1
+	movdqu	(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movdqa	%xmm0, (rp,n,8)
+	sub	$8, n
+	jg	L(top)
+
+L(end):	test	$1, R8(n)
+	jnz	L(end8)
+
+	movdqu	(ap), %xmm1
+	pxor	%xmm0, %xmm0
+	punpcklqdq  %xmm1, %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movdqa	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+
+C Basecase
+	ALIGN(16)
+L(bc):	dec	R32(n)
+	jz	L(end8)
+
+	movq	(ap,n,8), %xmm1
+	movq	-8(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movq	%xmm0, (rp,n,8)
+	sub	$2, R32(n)
+	jl	L(end8)
+	movq	8(ap), %xmm1
+	movq	(ap), %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	pxor	%xmm3, %xmm0
+	movq	%xmm0, 8(rp)
+
+L(end8):movq	(ap), %xmm0
+	psllq	%xmm4, %xmm0
+	pxor	%xmm3, %xmm0
+	movq	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/lshiftc.asm b/third_party/gmp/mpn/x86_64/fastsse/lshiftc.asm
new file mode 100644
index 0000000..a616075
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/lshiftc.asm
@@ -0,0 +1,183 @@
+dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
+
+dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb	     cycles/limb	      good
+C          16-byte aligned         16-byte unaligned	    for cpu?
+C AMD K8,K9	 ?			 ?
+C AMD K10	 1.85  (1.635)		 1.9   (1.67)		Y
+C AMD bd1	 1.82  (1.75)		 1.82  (1.75)		Y
+C AMD bobcat	 4.5			 4.5
+C Intel P4	 3.6   (3.125)		 3.6   (3.125)		Y
+C Intel core2	 2.05  (1.67)		 2.55  (1.75)
+C Intel NHM	 2.05  (1.875)		 2.6   (2.25)
+C Intel SBR	 1.55  (1.44)		 2     (1.57)		Y
+C Intel atom	 ?			 ?
+C VIA nano	 2.5   (2.5)		 2.5   (2.5)		Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  We always write using
+C 16-byte operations; we read with both 8-byte and 16-byte operations.
+
+C There are two inner loops, one for when rp = ap (mod 16) and one for when
+C it is not.  The aligned case reads 16+8 bytes; the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C   (1) The unaligned case makes too many reads.
+C   (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
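mpn_lshiftc stores the one's complement of what mpn_lshift would store; the
return value, taken from the top limb before any complementing (the shr into
%rax in the prologue below), is not complemented.  A C reference for the
semantics (hypothetical name, sketch only):

    #include <gmp.h>

    /* mpn_lshiftc semantics: complemented stores, uncomplemented
       return value.  Requires 1 <= cnt < GMP_LIMB_BITS. */
    static mp_limb_t
    ref_lshiftc (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
    {
      mp_limb_t ret = ap[n - 1] >> (GMP_LIMB_BITS - cnt);
      for (mp_size_t i = n - 1; i > 0; i--)
        rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (GMP_LIMB_BITS - cnt)));
      rp[0] = ~(ap[0] << cnt);
      return ret;
    }
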
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+	FUNC_ENTRY(4)
+	movd	R32(%rcx), %xmm4
+	mov	$64, R32(%rax)
+	sub	R32(%rcx), R32(%rax)
+	movd	R32(%rax), %xmm5
+
+	neg	R32(%rcx)
+	mov	-8(ap,n,8), %rax
+	shr	R8(%rcx), %rax
+
+	pcmpeqb	%xmm2, %xmm2		C set to 111...111
+
+	cmp	$2, n
+	jle	L(le2)
+
+	lea	(rp,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	je	L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+	movq	-8(ap,n,8), %xmm0
+	movq	-16(ap,n,8), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm2, %xmm0
+	movq	%xmm0, -8(rp,n,8)
+	dec	n
+
+L(rp_aligned):
+	lea	(ap,n,8), R32(%rcx)
+	test	$8, R8(%rcx)
+	je	L(aent)
+	jmp	L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+	ALIGN(16)
+L(utop):movq	(ap,n,8), %xmm1
+	punpcklqdq  8(ap,n,8), %xmm1
+	movdqa	-8(ap,n,8), %xmm0
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	pxor	%xmm2, %xmm0
+	movdqa	%xmm0, (rp,n,8)
+L(uent):sub	$2, n
+	ja	L(utop)
+
+	jne	L(end8)
+
+	movq	(ap), %xmm1
+	pxor	%xmm0, %xmm0
+	punpcklqdq  %xmm1, %xmm0
+	punpcklqdq  8(ap), %xmm1
+	psllq	%xmm4, %xmm1
+	psrlq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	pxor	%xmm2, %xmm0
+	movdqa	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+	ALIGN(16)
+L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
+	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
+	punpcklqdq  %xmm0, %xmm1	C xmm1 = B*ap[n-2] + ap[n-3]
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm2, %xmm0
+	movdqa	%xmm0, (rp,n,8)
+L(aent):sub	$2, n
+	ja	L(atop)
+
+	jne	L(end8)
+
+	movdqa	(ap), %xmm0
+	pxor	%xmm1, %xmm1
+	punpcklqdq  %xmm0, %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm2, %xmm0
+	movdqa	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+C *****************************************************************************
+
+	ALIGN(16)
+L(le2):	jne	L(end8)
+
+	movq	8(ap), %xmm0
+	movq	(ap), %xmm1
+	psllq	%xmm4, %xmm0
+	psrlq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	pxor	%xmm2, %xmm0
+	movq	%xmm0, 8(rp)
+
+L(end8):movq	(ap), %xmm0
+	psllq	%xmm4, %xmm0
+	pxor	%xmm2, %xmm0
+	movq	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm b/third_party/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm
new file mode 100644
index 0000000..1e270b1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/rshift-movdqu2.asm
@@ -0,0 +1,201 @@
+dnl  AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb     cycles/limb     cycles/limb    good
+C              aligned	      unaligned	      best seen	   for cpu?
+C AMD K8,K9	 3		 3		 2.35	  no, use shl/shr
+C AMD K10	 1.5-1.8	 1.5-1.8	 1.33	  yes
+C AMD bd1	 1.7-1.9	 1.7-1.9	 1.33	  yes
+C AMD bobcat	 3.17		 3.17			  yes, bad for n < 20
+C Intel P4	 4.67		 4.67		 2.7	  no, slow movdqu
+C Intel core2	 2.15		 2.15		 1.25	  no, use shld/shrd
+C Intel NHM	 1.66		 1.66		 1.25	  no, use shld/shrd
+C Intel SBR	 1.3		 1.3		 1.25	  yes, bad for n = 4-6
+C Intel atom	11.7		11.7		 4.5	  no
+C VIA nano	 5.7		 5.95		 2.0	  no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible.  The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned
+C operands, in order to avoid the need for two separate loops.
+C
+C TODO
+C  * Could 2-limb wind-down code be simplified?
+C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C    for other affected CPUs.
+
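mpn_rshift is the mirror image of mpn_lshift: it shifts right by cnt, walks
low-to-high so overlap with rp <= ap is safe, and returns the out-shifted low
bits left-justified in a limb (the prologue's shl of ap[0] by 64-cnt).  A C
reference for the semantics (hypothetical name, sketch only):

    #include <gmp.h>

    /* mpn_rshift semantics.  Requires 1 <= cnt < GMP_LIMB_BITS. */
    static mp_limb_t
    ref_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
    {
      mp_limb_t ret = ap[0] << (GMP_LIMB_BITS - cnt);
      for (mp_size_t i = 0; i < n - 1; i++)
        rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (GMP_LIMB_BITS - cnt));
      rp[n - 1] = ap[n - 1] >> cnt;
      return ret;
    }
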
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_rshift)
+	FUNC_ENTRY(4)
+	movd	R32(%rcx), %xmm4
+	mov	$64, R32(%rax)
+	sub	R32(%rcx), R32(%rax)
+	movd	R32(%rax), %xmm5
+
+	neg	R32(%rcx)
+	mov	(ap), %rax
+	shl	R8(%rcx), %rax
+
+	cmp	$3, n
+	jle	L(bc)
+
+	test	$8, R8(rp)
+	jz	L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+	movq	(ap), %xmm0
+	movq	8(ap), %xmm1
+	psrlq	%xmm4, %xmm0
+	psllq	%xmm5, %xmm1
+	por	%xmm1, %xmm0
+	movq	%xmm0, (rp)
+	lea	8(ap), ap
+	lea	8(rp), rp
+	dec	n
+
+L(rp_aligned):
+	lea	1(n), %r8d
+	lea	(ap,n,8), ap
+	lea	(rp,n,8), rp
+	neg	n
+
+	and	$6, R32(%r8)
+	jz	L(bu0)
+	cmp	$4, R32(%r8)
+	jz	L(bu4)
+	jc	L(bu2)
+L(bu6):	add	$4, n
+	jmp	L(i56)
+L(bu0):	add	$6, n
+	jmp	L(i70)
+L(bu4):	add	$2, n
+	jmp	L(i34)
+L(bu2):	add	$8, n
+	jge	L(end)
+
+	ALIGN(16)
+L(top):	movdqu	-64(ap,n,8), %xmm1
+	movdqu	-56(ap,n,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -64(rp,n,8)
+L(i70):
+	movdqu	-48(ap,n,8), %xmm1
+	movdqu	-40(ap,n,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -48(rp,n,8)
+L(i56):
+	movdqu	-32(ap,n,8), %xmm1
+	movdqu	-24(ap,n,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -32(rp,n,8)
+L(i34):
+	movdqu	-16(ap,n,8), %xmm1
+	movdqu	-8(ap,n,8), %xmm0
+	psllq	%xmm5, %xmm0
+	psrlq	%xmm4, %xmm1
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -16(rp,n,8)
+	add	$8, n
+	jl	L(top)
+
+L(end):	test	$1, R8(n)
+	jnz	L(e1)
+
+	movdqu	-16(ap), %xmm1
+	movq	-8(ap), %xmm0
+	psrlq	%xmm4, %xmm1
+	psllq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movdqa	%xmm0, -16(rp)
+	FUNC_EXIT()
+	ret
+
+L(e1):	movq	-8(ap), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, -8(rp)
+	FUNC_EXIT()
+	ret
+
+C Basecase
+	ALIGN(16)
+L(bc):	dec	R32(n)
+	jnz	1f
+	movq	(ap), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, (rp)
+	FUNC_EXIT()
+	ret
+
+1:	movq	(ap), %xmm1
+	movq	8(ap), %xmm0
+	psrlq	%xmm4, %xmm1
+	psllq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movq	%xmm0, (rp)
+	dec	R32(n)
+	jnz	1f
+	movq	8(ap), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, 8(rp)
+	FUNC_EXIT()
+	ret
+
+1:	movq	8(ap), %xmm1
+	movq	16(ap), %xmm0
+	psrlq	%xmm4, %xmm1
+	psllq	%xmm5, %xmm0
+	por	%xmm1, %xmm0
+	movq	%xmm0,	8(rp)
+	movq	16(ap), %xmm0
+	psrlq	%xmm4, %xmm0
+	movq	%xmm0, 16(rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
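
For orientation, a portable C sketch of what mpn_rshift computes -- illustrative
only, not GMP's generic mpn/generic/rshift.c; it assumes 64-bit limbs and
1 <= cnt < 64:

    #include <gmp.h>

    mp_limb_t
    ref_rshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
    {
      /* Bits shifted out at the low end are returned left-justified,
         matching the `shl' applied to the first limb in the asm above.  */
      mp_limb_t retval = ap[0] << (64 - cnt);
      mp_size_t i;
      for (i = 0; i < n - 1; i++)
        rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
      rp[n - 1] = ap[n - 1] >> cnt;
      return retval;
    }
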
diff --git a/third_party/gmp/mpn/x86_64/fastsse/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/fastsse/sec_tabselect.asm
new file mode 100644
index 0000000..e7b7feb
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fastsse/sec_tabselect.asm
@@ -0,0 +1,204 @@
+dnl  AMD64 SSE mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb     cycles/limb     cycles/limb
+C	    aligned,even n  unaligned,even n   other cases
+C AMD K8,K9	 1.65		1.65		 1.8
+C AMD K10	 0.78		0.78		 0.85
+C AMD bd1	 0.80		0.91		 1.25
+C AMD bobcat	 2.15		2.15		 2.37
+C Intel P4	 2.5		2.5		 2.95
+C Intel core2	 1.17		1.25		 1.25
+C Intel NHM	 0.87		0.90		 0.90
+C Intel SBR	 0.63		0.79		 0.77
+C Intel atom	 4.3		 4.3		 4.3	slower than plain code
+C VIA nano	 1.4		 5.1		 3.14	too alignment dependent
+
+C NOTES
+C  * We only honour the least significant 32 bits of the `which' and `nents'
+C    arguments, to allow efficient code using just SSE2.  Honouring all 64
+C    bits would need either the SSE4_1 pcmpeqq or some other SSE2 sequence.
+C  * We use movd for copying between xmm and plain registers, since old gas
+C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.
+
+define(`rp',     `%rdi')
+define(`tp',     `%rsi')
+define(`n',      `%rdx')
+define(`nents',  `%rcx')
+define(`which',  `%r8')
+
+define(`i',      `%r10')
+define(`j',      `%r9')
+
+C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
+C          nents  n   rp   tab       which j    i   temp  *    *    *    *
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+IFDOS(`	add	$-88, %rsp	')
+IFDOS(`	movdqu	%xmm6, (%rsp)	')
+IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
+IFDOS(`	movdqu	%xmm8, 32(%rsp)	')
+IFDOS(`	movdqu	%xmm9, 48(%rsp)	')
+
+	movd	which, %xmm8
+	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
+	mov	$1, R32(%rax)
+	movd	%rax, %xmm9
+	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1
+
+	mov	n, j
+	add	$-8, j
+	js	L(outer_end)
+
+L(outer_top):
+	mov	nents, i
+	mov	tp, %r11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5
+	pxor	%xmm6, %xmm6
+	pxor	%xmm7, %xmm7
+	ALIGN(16)
+L(top):	movdqa	%xmm8, %xmm0
+	pcmpeqd	%xmm1, %xmm0
+	paddd	%xmm9, %xmm1
+	movdqu	0(tp), %xmm2
+	movdqu	16(tp), %xmm3
+	pand	%xmm0, %xmm2
+	pand	%xmm0, %xmm3
+	por	%xmm2, %xmm4
+	por	%xmm3, %xmm5
+	movdqu	32(tp), %xmm2
+	movdqu	48(tp), %xmm3
+	pand	%xmm0, %xmm2
+	pand	%xmm0, %xmm3
+	por	%xmm2, %xmm6
+	por	%xmm3, %xmm7
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(top)
+
+	movdqu	%xmm4, 0(rp)
+	movdqu	%xmm5, 16(rp)
+	movdqu	%xmm6, 32(rp)
+	movdqu	%xmm7, 48(rp)
+
+	lea	64(%r11), tp
+	lea	64(rp), rp
+	add	$-8, j
+	jns	L(outer_top)
+L(outer_end):
+
+	test	$4, R8(n)
+	je	L(b0xx)
+L(b1xx):mov	nents, i
+	mov	tp, %r11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm4, %xmm4
+	pxor	%xmm5, %xmm5
+	ALIGN(16)
+L(tp4):	movdqa	%xmm8, %xmm0
+	pcmpeqd	%xmm1, %xmm0
+	paddd	%xmm9, %xmm1
+	movdqu	0(tp), %xmm2
+	movdqu	16(tp), %xmm3
+	pand	%xmm0, %xmm2
+	pand	%xmm0, %xmm3
+	por	%xmm2, %xmm4
+	por	%xmm3, %xmm5
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(tp4)
+	movdqu	%xmm4, 0(rp)
+	movdqu	%xmm5, 16(rp)
+	lea	32(%r11), tp
+	lea	32(rp), rp
+
+L(b0xx):test	$2, R8(n)
+	je	L(b00x)
+L(b01x):mov	nents, i
+	mov	tp, %r11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm4, %xmm4
+	ALIGN(16)
+L(tp2):	movdqa	%xmm8, %xmm0
+	pcmpeqd	%xmm1, %xmm0
+	paddd	%xmm9, %xmm1
+	movdqu	0(tp), %xmm2
+	pand	%xmm0, %xmm2
+	por	%xmm2, %xmm4
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(tp2)
+	movdqu	%xmm4, 0(rp)
+	lea	16(%r11), tp
+	lea	16(rp), rp
+
+L(b00x):test	$1, R8(n)
+	je	L(b000)
+L(b001):mov	nents, i
+	mov	tp, %r11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm4, %xmm4
+	ALIGN(16)
+L(tp1):	movdqa	%xmm8, %xmm0
+	pcmpeqd	%xmm1, %xmm0
+	paddd	%xmm9, %xmm1
+	movq	0(tp), %xmm2
+	pand	%xmm0, %xmm2
+	por	%xmm2, %xmm4
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(tp1)
+	movq	%xmm4, 0(rp)
+
+L(b000):
+IFDOS(`	movdqu	(%rsp), %xmm6	')
+IFDOS(`	movdqu	16(%rsp), %xmm7	')
+IFDOS(`	movdqu	32(%rsp), %xmm8	')
+IFDOS(`	movdqu	48(%rsp), %xmm9	')
+IFDOS(`	add	$88, %rsp	')
+	FUNC_EXIT()
+	ret
+EPILOGUE()
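
The code above amounts to a constant-time table read: all nents entries are
loaded and masked, so the memory access pattern is independent of the secret
index `which'.  A scalar C sketch of the same idea (a hypothetical reference,
not GMP's mpn/generic/sec_tabselect.c; a real compiler may need extra care to
keep the mask computation branch-free):

    #include <gmp.h>

    void
    ref_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
                       mp_size_t n, mp_size_t nents, mp_size_t which)
    {
      mp_size_t i, k;
      for (k = 0; k < n; k++)
        rp[k] = 0;
      for (i = 0; i < nents; i++)
        {
          /* All-ones when i == which, else zero; every entry is touched.  */
          mp_limb_t mask = -(mp_limb_t) (i == which);
          for (k = 0; k < n; k++)
            rp[k] |= tab[i * n + k] & mask;
        }
    }
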
diff --git a/third_party/gmp/mpn/x86_64/fat/addmul_2.c b/third_party/gmp/mpn/x86_64/fat/addmul_2.c
new file mode 100644
index 0000000..e0d7358
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/addmul_2.c
@@ -0,0 +1,38 @@
+/* Fat binary fallback mpn_addmul_2.
+
+Copyright 2016 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+mp_limb_t
+mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, const mp_limb_t vp[2])
+{
+  rp[n] = mpn_addmul_1 (rp,     up, n, vp[0]);
+  return  mpn_addmul_1 (rp + 1, up, n, vp[1]);
+}
diff --git a/third_party/gmp/mpn/x86_64/fat/fat.c b/third_party/gmp/mpn/x86_64/fat/fat.c
new file mode 100644
index 0000000..b7446a3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/fat.c
@@ -0,0 +1,472 @@
+/* x86_64 fat binary initializers.
+
+   Contributed to the GNU project by Kevin Ryde (original x86_32 code) and
+   Torbjorn Granlund (port to x86_64)
+
+   THE FUNCTIONS AND VARIABLES IN THIS FILE ARE FOR INTERNAL USE ONLY.
+   THEY'RE ALMOST CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR
+   COMPLETELY IN FUTURE GNU MP RELEASES.
+
+Copyright 2003, 2004, 2009, 2011-2015, 2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdio.h>    /* for printf */
+#include <stdlib.h>   /* for getenv */
+#include <string.h>
+
+#include "gmp-impl.h"
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* fat_entry.asm */
+long __gmpn_cpuid (char [12], int);
+
+
+#if WANT_FAKE_CPUID
+/* The "name"s in the table are values for the GMP_CPU_TYPE environment
+   variable.  Anything can be used, but for now it's the canonical cpu types
+   as per config.guess/config.sub.  */
+
+#define __gmpn_cpuid            fake_cpuid
+
+#define MAKE_FMS(family, model)						\
+  ((((family) & 0xf) << 8) + (((family) & 0xff0) << 20)			\
+   + (((model) & 0xf) << 4) + (((model)  &  0xf0) << 12))
+
+static struct {
+  const char  *name;
+  const char  *vendor;
+  unsigned    fms;
+} fake_cpuid_table[] = {
+  { "core2",      "GenuineIntel", MAKE_FMS (6, 0xf) },
+  { "nehalem",    "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "nhm",        "GenuineIntel", MAKE_FMS (6, 0x1a) },
+  { "atom",       "GenuineIntel", MAKE_FMS (6, 0x1c) },
+  { "westmere",   "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "wsm",        "GenuineIntel", MAKE_FMS (6, 0x25) },
+  { "sandybridge","GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "sbr",        "GenuineIntel", MAKE_FMS (6, 0x2a) },
+  { "silvermont", "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "slm",        "GenuineIntel", MAKE_FMS (6, 0x37) },
+  { "haswell",    "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "hwl",        "GenuineIntel", MAKE_FMS (6, 0x3c) },
+  { "broadwell",  "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "bwl",        "GenuineIntel", MAKE_FMS (6, 0x3d) },
+  { "skylake",    "GenuineIntel", MAKE_FMS (6, 0x5e) },
+  { "sky",        "GenuineIntel", MAKE_FMS (6, 0x5e) },
+  { "pentium4",   "GenuineIntel", MAKE_FMS (15, 3) },
+
+  { "k8",         "AuthenticAMD", MAKE_FMS (15, 0) },
+  { "k10",        "AuthenticAMD", MAKE_FMS (16, 0) },
+  { "bobcat",     "AuthenticAMD", MAKE_FMS (20, 1) },
+  { "bulldozer",  "AuthenticAMD", MAKE_FMS (21, 1) },
+  { "piledriver", "AuthenticAMD", MAKE_FMS (21, 2) },
+  { "steamroller","AuthenticAMD", MAKE_FMS (21, 0x30) },
+  { "excavator",  "AuthenticAMD", MAKE_FMS (21, 0x60) },
+  { "jaguar",     "AuthenticAMD", MAKE_FMS (22, 1) },
+  { "zen",        "AuthenticAMD", MAKE_FMS (23, 1) },
+
+  { "nano",       "CentaurHauls", MAKE_FMS (6, 15) },
+};
+
+static int
+fake_cpuid_lookup (void)
+{
+  char  *s;
+  int   i;
+
+  s = getenv ("GMP_CPU_TYPE");
+  if (s == NULL)
+    {
+      printf ("Need GMP_CPU_TYPE environment variable for fake cpuid\n");
+      abort ();
+    }
+
+  for (i = 0; i < numberof (fake_cpuid_table); i++)
+    if (strcmp (s, fake_cpuid_table[i].name) == 0)
+      return i;
+
+  printf ("GMP_CPU_TYPE=%s unknown\n", s);
+  abort ();
+}
+
+static long
+fake_cpuid (char dst[12], unsigned int id)
+{
+  int  i = fake_cpuid_lookup();
+
+  switch (id) {
+  case 0:
+    memcpy (dst, fake_cpuid_table[i].vendor, 12);
+    return 0;
+  case 1:
+    return fake_cpuid_table[i].fms;
+  case 7:
+    dst[0] = 0xff;				/* BMI1, AVX2, etc */
+    dst[1] = 0xff;				/* BMI2, etc */
+    return 0;
+  case 0x80000001:
+    dst[4 + 29 / 8] = (1 << (29 % 8));		/* "long" mode */
+    return 0;
+  default:
+    printf ("fake_cpuid(): oops, unknown id %d\n", id);
+    abort ();
+  }
+}
+#endif
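+
+/* Usage sketch (hypothetical test driver, not part of GMP): with a fat
+   build compiled with WANT_FAKE_CPUID, setting GMP_CPU_TYPE before the
+   first GMP call makes the dispatcher below act as the named chip:
+
+       #include <stdlib.h>
+       #include <gmp.h>
+
+       int
+       main (void)
+       {
+         mpz_t x;
+         setenv ("GMP_CPU_TYPE", "skylake", 1);  (a fake_cpuid_table name)
+         mpz_init_set_ui (x, 12345);
+         mpz_mul (x, x, x);                      (cpuvec-dispatched)
+         mpz_clear (x);
+         return 0;
+       }
+*/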
+
+
+typedef DECL_preinv_divrem_1 ((*preinv_divrem_1_t));
+typedef DECL_preinv_mod_1    ((*preinv_mod_1_t));
+
+struct cpuvec_t __gmpn_cpuvec = {
+  __MPN(add_n_init),
+  __MPN(addlsh1_n_init),
+  __MPN(addlsh2_n_init),
+  __MPN(addmul_1_init),
+  __MPN(addmul_2_init),
+  __MPN(bdiv_dbm1c_init),
+  __MPN(cnd_add_n_init),
+  __MPN(cnd_sub_n_init),
+  __MPN(com_init),
+  __MPN(copyd_init),
+  __MPN(copyi_init),
+  __MPN(divexact_1_init),
+  __MPN(divrem_1_init),
+  __MPN(gcd_11_init),
+  __MPN(lshift_init),
+  __MPN(lshiftc_init),
+  __MPN(mod_1_init),
+  __MPN(mod_1_1p_init),
+  __MPN(mod_1_1p_cps_init),
+  __MPN(mod_1s_2p_init),
+  __MPN(mod_1s_2p_cps_init),
+  __MPN(mod_1s_4p_init),
+  __MPN(mod_1s_4p_cps_init),
+  __MPN(mod_34lsub1_init),
+  __MPN(modexact_1c_odd_init),
+  __MPN(mul_1_init),
+  __MPN(mul_basecase_init),
+  __MPN(mullo_basecase_init),
+  __MPN(preinv_divrem_1_init),
+  __MPN(preinv_mod_1_init),
+  __MPN(redc_1_init),
+  __MPN(redc_2_init),
+  __MPN(rshift_init),
+  __MPN(sqr_basecase_init),
+  __MPN(sub_n_init),
+  __MPN(sublsh1_n_init),
+  __MPN(submul_1_init),
+  0
+};
+
+int __gmpn_cpuvec_initialized = 0;
+
+/* The following setups start with generic x86, then overwrite with
+   specifics for a chip, and higher versions of that chip.
+
+   The arrangement of the setups here will normally be the same as the $path
+   selections in configure.in for the respective chips.
+
+   This code is reentrant and thread safe.  We always calculate the same
+   decided_cpuvec, so if two copies of the code are running it doesn't
+   matter which completes first, both write the same to __gmpn_cpuvec.
+
+   We need to go via decided_cpuvec because if one thread has completed
+   __gmpn_cpuvec then it may be making use of the threshold values in that
+   vector.  If another thread is still running __gmpn_cpuvec_init then we
+   don't want it to write different values to those fields since some of the
+   asm routines only operate correctly up to their own defined threshold,
+   not an arbitrary value.  */
+
+static int
+gmp_workaround_skylake_cpuid_bug ()
+{
+  char feature_string[49];
+  char processor_name_string[49];
+  static const char *bad_cpus[] = {" G44", " G45", " G39" /* , "6600" */ };
+  int i;
+
+  /* Example strings:                                   */
+  /* "Intel(R) Pentium(R) CPU G4400 @ 3.30GHz"          */
+  /* "Intel(R) Core(TM) i5-6600K CPU @ 3.50GHz"         */
+  /*                  ^               ^               ^ */
+  /*     0x80000002       0x80000003      0x80000004    */
+  /* We examine just the 0x80000003 part here. */
+
+  /* In their infinite wisdom, Intel decided to use one register order for
+     the vendor string, and another for the processor name string.  We shuffle
+     things about here, rather than write a new variant of our assembly cpuid.
+  */
+
+  unsigned int eax, ebx, ecx, edx;
+  eax = __gmpn_cpuid (feature_string, 0x80000003);
+  ebx = ((unsigned int *)feature_string)[0];
+  edx = ((unsigned int *)feature_string)[1];
+  ecx = ((unsigned int *)feature_string)[2];
+
+  ((unsigned int *) (processor_name_string))[0] = eax;
+  ((unsigned int *) (processor_name_string))[1] = ebx;
+  ((unsigned int *) (processor_name_string))[2] = ecx;
+  ((unsigned int *) (processor_name_string))[3] = edx;
+
+  processor_name_string[16] = 0;
+
+  for (i = 0; i < sizeof (bad_cpus) / sizeof (char *); i++)
+    {
+      if (strstr (processor_name_string, bad_cpus[i]) != 0)
+	return 1;
+    }
+  return 0;
+}
+
+enum {BMI2_BIT = 8};
+
+void
+__gmpn_cpuvec_init (void)
+{
+  struct cpuvec_t  decided_cpuvec;
+  char vendor_string[13];
+  char dummy_string[12];
+  long fms;
+  int family, model;
+
+  TRACE (printf ("__gmpn_cpuvec_init:\n"));
+
+  memset (&decided_cpuvec, '\0', sizeof (decided_cpuvec));
+
+  CPUVEC_SETUP_x86_64;
+  CPUVEC_SETUP_fat;
+
+  __gmpn_cpuid (vendor_string, 0);
+  vendor_string[12] = 0;
+
+  fms = __gmpn_cpuid (dummy_string, 1);
+  family = ((fms >> 8) & 0xf) + ((fms >> 20) & 0xff);
+  model = ((fms >> 4) & 0xf) + ((fms >> 12) & 0xf0);
+
+  /* Check extended feature flags */
+  __gmpn_cpuid (dummy_string, 0x80000001);
+  if ((dummy_string[4 + 29 / 8] & (1 << (29 % 8))) == 0)
+    abort (); /* longmode-capable-bit turned off! */
+
+  /*********************************************************/
+  /*** WARNING: keep this list in sync with config.guess ***/
+  /*********************************************************/
+  if (strcmp (vendor_string, "GenuineIntel") == 0)
+    {
+      switch (family)
+	{
+	case 6:
+	  switch (model)
+	    {
+	    case 0x0f:		/* Conroe Merom Kentsfield Allendale */
+	    case 0x10:
+	    case 0x11:
+	    case 0x12:
+	    case 0x13:
+	    case 0x14:
+	    case 0x15:
+	    case 0x16:
+	    case 0x17:		/* PNR Wolfdale Yorkfield */
+	    case 0x18:
+	    case 0x19:
+	    case 0x1d:		/* PNR Dunnington */
+	      CPUVEC_SETUP_core2;
+	      break;
+
+	    case 0x1c:		/* Atom Silverthorne */
+	    case 0x26:		/* Atom Lincroft */
+	    case 0x27:		/* Atom Saltwell? */
+	    case 0x36:		/* Atom Cedarview/Saltwell */
+	      CPUVEC_SETUP_atom;
+	      break;
+
+	    case 0x1a:		/* NHM Gainestown */
+	    case 0x1b:
+	    case 0x1e:		/* NHM Lynnfield/Jasper */
+	    case 0x1f:
+	    case 0x20:
+	    case 0x21:
+	    case 0x22:
+	    case 0x23:
+	    case 0x24:
+	    case 0x25:		/* WSM Clarkdale/Arrandale */
+	    case 0x28:
+	    case 0x29:
+	    case 0x2b:
+	    case 0x2c:		/* WSM Gulftown */
+	    case 0x2e:		/* NHM Beckton */
+	    case 0x2f:		/* WSM Eagleton */
+	      CPUVEC_SETUP_core2;
+	      CPUVEC_SETUP_coreinhm;
+	      break;
+
+	    case 0x37:		/* Silvermont */
+	    case 0x4a:		/* Silvermont */
+	    case 0x4c:		/* Airmont */
+	    case 0x4d:		/* Silvermont/Avoton */
+	    case 0x5a:		/* Silvermont */
+	      CPUVEC_SETUP_atom;
+	      CPUVEC_SETUP_silvermont;
+	      break;
+
+	    case 0x5c:		/* Goldmont */
+	    case 0x5f:		/* Goldmont */
+	    case 0x7a:		/* Goldmont Plus */
+	      CPUVEC_SETUP_atom;
+	      CPUVEC_SETUP_silvermont;
+	      CPUVEC_SETUP_goldmont;
+	      break;
+
+	    case 0x2a:		/* SB */
+	    case 0x2d:		/* SBC-EP */
+	    case 0x3a:		/* IBR */
+	    case 0x3e:		/* IBR Ivytown */
+	      CPUVEC_SETUP_core2;
+	      CPUVEC_SETUP_coreinhm;
+	      CPUVEC_SETUP_coreisbr;
+	      break;
+	    case 0x3c:		/* Haswell client */
+	    case 0x3f:		/* Haswell server */
+	    case 0x45:		/* Haswell ULT */
+	    case 0x46:		/* Crystal Well */
+	      CPUVEC_SETUP_core2;
+	      CPUVEC_SETUP_coreinhm;
+	      CPUVEC_SETUP_coreisbr;
+	      /* Some Haswells lack BMI2.  Let them appear as Sandybridges for
+		 now.  */
+	      __gmpn_cpuid (dummy_string, 7);
+	      if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0)
+		break;
+	      CPUVEC_SETUP_coreihwl;
+	      break;
+	    case 0x3d:		/* Broadwell */
+	    case 0x47:		/* Broadwell */
+	    case 0x4f:		/* Broadwell server */
+	    case 0x56:		/* Broadwell microserver */
+	      CPUVEC_SETUP_core2;
+	      CPUVEC_SETUP_coreinhm;
+	      CPUVEC_SETUP_coreisbr;
+	      __gmpn_cpuid (dummy_string, 7);
+	      if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0)
+		break;
+	      CPUVEC_SETUP_coreihwl;
+	      CPUVEC_SETUP_coreibwl;
+	      break;
+	    case 0x4e:		/* Skylake client */
+	    case 0x55:		/* Skylake server */
+	    case 0x5e:		/* Skylake */
+	    case 0x8e:		/* Kabylake */
+	    case 0x9e:		/* Kabylake */
+	      CPUVEC_SETUP_core2;
+	      CPUVEC_SETUP_coreinhm;
+	      CPUVEC_SETUP_coreisbr;
+	      __gmpn_cpuid (dummy_string, 7);
+	      if ((dummy_string[0 + BMI2_BIT / 8] & (1 << (BMI2_BIT % 8))) == 0)
+		break;
+	      if (gmp_workaround_skylake_cpuid_bug ())
+		break;
+	      CPUVEC_SETUP_coreihwl;
+	      CPUVEC_SETUP_coreibwl;
+	      CPUVEC_SETUP_skylake;
+	      break;
+	    }
+	  break;
+
+	case 15:
+	  CPUVEC_SETUP_pentium4;
+	  break;
+	}
+    }
+  else if (strcmp (vendor_string, "AuthenticAMD") == 0)
+    {
+      switch (family)
+	{
+	case 0x0f:		/* k8 */
+	case 0x11:		/* "fam 11h", mix of k8 and k10 */
+	case 0x13:
+	  CPUVEC_SETUP_k8;
+	  break;
+
+	case 0x10:		/* k10 */
+	case 0x12:		/* k10 (llano) */
+	  CPUVEC_SETUP_k8;
+	  CPUVEC_SETUP_k10;
+	  break;
+
+	case 0x14:		/* bobcat */
+	  CPUVEC_SETUP_k8;
+	  CPUVEC_SETUP_k10;
+	  CPUVEC_SETUP_bt1;
+	  break;
+
+	case 0x16:		/* jaguar */
+	  CPUVEC_SETUP_k8;
+	  CPUVEC_SETUP_k10;
+	  CPUVEC_SETUP_bt1;
+	  CPUVEC_SETUP_bt2;
+	  break;
+
+	case 0x15:	    /* bulldozer, piledriver, steamroller, excavator */
+	  CPUVEC_SETUP_k8;
+	  CPUVEC_SETUP_k10;
+	  CPUVEC_SETUP_bd1;
+	  break;
+
+	case 0x17:	    /* zen */
+	  CPUVEC_SETUP_zen;
+	  break;
+	}
+    }
+  else if (strcmp (vendor_string, "CentaurHauls") == 0)
+    {
+      switch (family)
+	{
+	case 6:
+	  if (model >= 15)
+	    CPUVEC_SETUP_nano;
+	  break;
+	}
+    }
+
+  /* There's no x86 generic mpn_preinv_divrem_1 or mpn_preinv_mod_1.
+     Instead default to the plain versions from whichever CPU we detected.
+     The function arguments are compatible, so no glue code is needed.  */
+  if (decided_cpuvec.preinv_divrem_1 == NULL)
+    decided_cpuvec.preinv_divrem_1 =(preinv_divrem_1_t)decided_cpuvec.divrem_1;
+  if (decided_cpuvec.preinv_mod_1 == NULL)
+    decided_cpuvec.preinv_mod_1    =(preinv_mod_1_t)   decided_cpuvec.mod_1;
+
+  ASSERT_CPUVEC (decided_cpuvec);
+  CPUVEC_INSTALL (decided_cpuvec);
+
+  /* Set this once the threshold fields are ready.
+     Use volatile to prevent it getting moved.  */
+  *((volatile int *) &__gmpn_cpuvec_initialized) = 1;
+}
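
Together with fat_entry.asm below, this implements a lazily initialized
function-pointer table: every public mpn entry starts out pointing at a tiny
init stub; the first call runs the CPU detection above, overwrites the whole
vector, and jumps on through it.  A minimal self-contained C sketch of the
pattern (all names here are illustrative, not GMP's):

    typedef long (*fn_t) (long);

    static long real_impl (long x) { return x + 1; }  /* CPU-specific code */

    static long init_stub (long x);
    static fn_t entry = init_stub;     /* like one slot of __gmpn_cpuvec */

    static long
    init_stub (long x)
    {
      /* First call only: select the implementation, store the pointer,
         then continue through it, as FAT_INIT/fat_init do below.  */
      entry = real_impl;
      return entry (x);
    }

    long
    public_entry (long x)              /* the FAT_ENTRY jump, in C clothing */
    {
      return entry (x);
    }
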
diff --git a/third_party/gmp/mpn/x86_64/fat/fat_entry.asm b/third_party/gmp/mpn/x86_64/fat/fat_entry.asm
new file mode 100644
index 0000000..5f244ac
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/fat_entry.asm
@@ -0,0 +1,209 @@
+dnl  x86 fat binary entrypoints.
+
+dnl  Contributed to the GNU project by Kevin Ryde (original x86_32 code) and
+dnl  Torbjorn Granlund (port to x86_64)
+
+dnl  Copyright 2003, 2009, 2011-2014, 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl  Forcibly disable profiling.
+dnl
+dnl  The entrypoints and inits are small enough not to worry about; the real
+dnl  routines arrived at will have any profiling.  Also, the way the code
+dnl  here ends with a jump means we won't work properly with the
+dnl  "instrument" profiling scheme anyway.
+
+define(`WANT_PROFILING',no)
+
+
+dnl  We define PRETEND_PIC as a helper symbol, then use it to suppress the
+dnl  normal, fast call code, since that triggers problems on Darwin, OpenBSD
+dnl  and some versions of GNU/Linux.  This will go away when symbol hiding is
+dnl  finished.
+
+ifdef(`DARWIN',
+`define(`PRETEND_PIC')')
+ifdef(`OPENBSD',
+`define(`PRETEND_PIC')')
+ifdef(`LINUX',
+`define(`PRETEND_PIC')')
+ifdef(`PIC',
+`define(`PRETEND_PIC')')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+	TEXT
+
+dnl  Usage: FAT_ENTRY(name, offset)
+dnl
+dnl  Emit a fat binary entrypoint function of the given name.  This is the
+dnl  normal entry for applications, eg. __gmpn_add_n.
+dnl
+dnl  The code simply jumps through the function pointer in __gmpn_cpuvec at
+dnl  the given "offset" (in bytes).
+dnl
+dnl  For non-PIC, the jumps are 5 bytes each; aligning them to 8 should be
+dnl  fine for all x86s.
+dnl
+dnl  For ELF/DARWIN PIC, the jumps are 20 bytes each, and are best aligned to
+dnl  16 to ensure at least the first two instructions don't cross a cache line
+dnl  boundary.
+dnl
+dnl  For DOS64, the jumps are 6 bytes.  The same form works also for GNU/Linux
+dnl  (at least with certain assemblers/linkers) but FreeBSD 8.2 crashes.  Not
+dnl  tested on Darwin, Slowaris, NetBSD, etc.
+dnl
+dnl  Note the extra `' ahead of PROLOGUE obscures it from the HAVE_NATIVE
+dnl  grepping in configure, stopping that code trying to eval something with
+dnl  $1 in it.
+
+define(FAT_ENTRY,
+m4_assert_numargs(2)
+`ifdef(`HOST_DOS64',
+`	ALIGN(8)
+`'PROLOGUE($1)
+	jmp	*$2+GSYM_PREFIX`'__gmpn_cpuvec(%rip)
+EPILOGUE()
+',
+`	ALIGN(ifdef(`PIC',16,8))
+`'PROLOGUE($1)
+ifdef(`PRETEND_PIC',
+`	LEA(	GSYM_PREFIX`'__gmpn_cpuvec, %rax)
+	jmp	*$2(%rax)
+',`dnl non-PIC
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec+$2
+')
+EPILOGUE()
+')')
+
+
+dnl  FAT_ENTRY for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_ENTRY(MPN(i),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 8))',
+CPUVEC_FUNCS_LIST)
+
+
+dnl  Usage: FAT_INIT(name, offset)
+dnl
+dnl  Emit a fat binary initializer function of the given name.  These
+dnl  functions are the initial values for the pointers in __gmpn_cpuvec.
+dnl
+dnl  The code simply calls __gmpn_cpuvec_init, and then jumps back through
+dnl  the __gmpn_cpuvec pointer, at the given "offset" (in bytes).
+dnl  __gmpn_cpuvec_init will have stored the address of the selected
+dnl  implementation there.
+dnl
+dnl  Only one of these routines will be executed, and only once, since after
+dnl  that all the __gmpn_cpuvec pointers go to real routines.  So there's no
+dnl  need for anything special here, just something small and simple.  To
+dnl  keep code size down, "fat_init" is a shared bit of code, arrived at
+dnl  with the offset in %al.  %al is used since the movb instruction is 2
+dnl  bytes where %eax would be 4.
+dnl
+dnl  Note having `PROLOGUE in FAT_INIT obscures that PROLOGUE from the
+dnl  HAVE_NATIVE grepping in configure, preventing that code trying to eval
+dnl  something with $1 in it.
+dnl
+dnl  We need to preserve parameter registers over the __gmpn_cpuvec_init call
+
+define(FAT_INIT,
+m4_assert_numargs(2)
+`PROLOGUE($1)
+	mov	$`'$2, %al
+	jmp	L(fat_init)
+EPILOGUE()
+')
+
+dnl  FAT_INIT for each CPUVEC_FUNCS_LIST
+dnl
+
+define(`CPUVEC_offset',0)
+foreach(i,
+`FAT_INIT(MPN(i`'_init),CPUVEC_offset)
+define(`CPUVEC_offset',eval(CPUVEC_offset + 1))',
+CPUVEC_FUNCS_LIST)
+
+L(fat_init):
+	C al	__gmpn_cpuvec byte offset
+
+	movzbl	%al, %eax
+IFSTD(`	push	%rdi	')
+IFSTD(`	push	%rsi	')
+	push	%rdx
+	push	%rcx
+	push	%r8
+	push	%r9
+	push	%rax
+IFDOS(`	sub	$32, %rsp	')
+	CALL(	__gmpn_cpuvec_init)
+IFDOS(`	add	$32, %rsp	')
+	pop	%rax
+	pop	%r9
+	pop	%r8
+	pop	%rcx
+	pop	%rdx
+IFSTD(`	pop	%rsi	')
+IFSTD(`	pop	%rdi	')
+ifdef(`PRETEND_PIC',`
+	LEA(	GSYM_PREFIX`'__gmpn_cpuvec, %r10)
+	jmp	*(%r10,%rax,8)
+',`dnl non-PIC
+	jmp	*GSYM_PREFIX`'__gmpn_cpuvec(,%rax,8)
+')
+
+
+C long __gmpn_cpuid (char dst[12], int id);
+C
+C This is called only 3 times, so just something simple and compact is fine.
+C
+C The rcx/ecx zeroing here is needed for the BMI2 check.
+
+define(`rp',  `%rdi')
+define(`idx', `%rsi')
+
+PROLOGUE(__gmpn_cpuid)
+	FUNC_ENTRY(2)
+	mov	%rbx, %r8
+	mov	R32(idx), R32(%rax)
+	xor	%ecx, %ecx
+	cpuid
+	mov	%ebx, (rp)
+	mov	%edx, 4(rp)
+	mov	%ecx, 8(rp)
+	mov	%r8, %rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
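
A rough C rendering of __gmpn_cpuid above, assuming GCC/Clang's <cpuid.h> is
available (a sketch for illustration, not a drop-in replacement for the asm):

    #include <string.h>
    #include <cpuid.h>

    long
    ref_gmpn_cpuid (char dst[12], int id)
    {
      unsigned int a, b, c, d;
      __cpuid_count (id, 0, a, b, c, d);  /* ecx zeroed, as in the asm */
      memcpy (dst + 0, &b, 4);            /* same store order: ebx, edx, ecx */
      memcpy (dst + 4, &d, 4);
      memcpy (dst + 8, &c, 4);
      return a;
    }
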
diff --git a/third_party/gmp/mpn/x86_64/fat/gmp-mparam.h b/third_party/gmp/mpn/x86_64/fat/gmp-mparam.h
new file mode 100644
index 0000000..005c893
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/gmp-mparam.h
@@ -0,0 +1,72 @@
+/* Fat binary x86_64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2003, 2009, 2011 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+/* mpn_divexact_1 is faster than mpn_divrem_1 at all sizes.  The only time
+   this might not be true currently is for actual 80386 and 80486 chips,
+   where mpn/x86/dive_1.asm might be slower than mpn/x86/divrem_1.asm, but
+   that's not worth worrying about.  */
+#define DIVEXACT_1_THRESHOLD  0
+
+/* Only some of the x86s have an mpn_preinv_divrem_1, but we set
+   USE_PREINV_DIVREM_1 so that all callers use it, and then let the
+   __gmpn_cpuvec pointer go to plain mpn_divrem_1 if there's not an actual
+   preinv.  */
+#define USE_PREINV_DIVREM_1   1
+
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+/* mpn_sqr_basecase is faster than mpn_mul_basecase at all sizes, no need
+   for mpn_sqr to call the latter.  */
+#define SQR_BASECASE_THRESHOLD 0
+
+/* Sensible fallbacks for these, when not taken from a cpu-specific
+   gmp-mparam.h.  */
+#define MUL_TOOM22_THRESHOLD      20
+#define MUL_TOOM33_THRESHOLD     130
+#define SQR_TOOM2_THRESHOLD       30
+#define SQR_TOOM3_THRESHOLD      200
+
+/* These are values more or less in the middle of what the typical x86 chips
+   come out as.  For a fat binary it's necessary to have values for these,
+   since the defaults for MUL_FFT_TABLE and SQR_FFT_TABLE otherwise come out
+   as non-constant array initializers.  FIXME: Perhaps these should be done
+   in the cpuvec structure like other thresholds.  */
+#define MUL_FFT_TABLE  { 464, 928, 1920, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          400
+#define MUL_FFT_THRESHOLD              2000
+
+#define SQR_FFT_TABLE  { 528, 1184, 1920, 4608, 14336, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          500
+#define SQR_FFT_THRESHOLD              3000
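
A sketch of how thresholds of this kind are consumed -- purely illustrative;
the real selection logic lives in mpn_mul and friends and is more involved:

    static const char *
    algorithm_for_size (long n)        /* operand size in limbs */
    {
      if (n < MUL_TOOM22_THRESHOLD)
        return "schoolbook basecase";
      if (n < MUL_TOOM33_THRESHOLD)
        return "Toom-22 (Karatsuba)";
      if (n < MUL_FFT_THRESHOLD)
        return "higher Toom variants";
      return "FFT multiplication";
    }
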
diff --git a/third_party/gmp/mpn/x86_64/fat/mod_1.c b/third_party/gmp/mpn/x86_64/fat/mod_1.c
new file mode 100644
index 0000000..4f149cc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/mod_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mod_1.
+
+Copyright 2003, 2009 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mod_1.c"
diff --git a/third_party/gmp/mpn/x86_64/fat/mul_basecase.c b/third_party/gmp/mpn/x86_64/fat/mul_basecase.c
new file mode 100644
index 0000000..d9eb471
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/mul_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mul_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mul_basecase.c"
diff --git a/third_party/gmp/mpn/x86_64/fat/mullo_basecase.c b/third_party/gmp/mpn/x86_64/fat/mullo_basecase.c
new file mode 100644
index 0000000..7f86be6
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/mullo_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_mullo_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/mullo_basecase.c"
diff --git a/third_party/gmp/mpn/x86_64/fat/redc_1.c b/third_party/gmp/mpn/x86_64/fat/redc_1.c
new file mode 100644
index 0000000..0025403
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/redc_1.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_1.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/redc_1.c"
diff --git a/third_party/gmp/mpn/x86_64/fat/redc_2.c b/third_party/gmp/mpn/x86_64/fat/redc_2.c
new file mode 100644
index 0000000..1932d58
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/redc_2.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_redc_2.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/redc_2.c"
diff --git a/third_party/gmp/mpn/x86_64/fat/sqr_basecase.c b/third_party/gmp/mpn/x86_64/fat/sqr_basecase.c
new file mode 100644
index 0000000..d1c5dcd
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/fat/sqr_basecase.c
@@ -0,0 +1,32 @@
+/* Fat binary fallback mpn_sqr_basecase.
+
+Copyright 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#include "mpn/generic/sqr_basecase.c"
diff --git a/third_party/gmp/mpn/x86_64/gcd_11.asm b/third_party/gmp/mpn/x86_64/gcd_11.asm
new file mode 100644
index 0000000..f9b3bcc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/gcd_11.asm
@@ -0,0 +1,114 @@
+dnl  AMD64 mpn_gcd_11 -- 1 x 1 gcd.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	 5.5
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 7.1
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4	 ?
+C Intel CNR	 ?
+C Intel PNR	 ?
+C Intel NHM	 ?
+C Intel WSM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 9.1
+C Intel SLM	 6.9
+C Intel GLM	 6.0
+C Intel GLM+	 5.8
+C VIA nano	 ?
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 7)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0',    `%rdi')
+define(`v0',    `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)
+	FUNC_ENTRY(2)
+	LEA(	ctz_table, %r8)
+	jmp	L(ent)
+
+	ALIGN(16)
+L(top):	cmovc	%rdx, u0		C u = |u - v|
+	cmovc	%rax, v0		C v = min(u,v)
+L(mid):	and	$MASK, R32(%rdx)
+	movzbl	(%r8,%rdx), R32(%rcx)
+	jz	L(shift_alot)
+	shr	R8(%rcx), u0
+L(ent):	mov	u0, %rax
+	mov	v0, %rdx
+	sub	u0, %rdx
+	sub	v0, u0
+	jnz	L(top)
+
+L(end):	C rax = result
+	C rdx = 0 for the benefit of internal gcd_22 call
+	FUNC_EXIT()
+	ret
+
+L(shift_alot):
+	shr	$MAXSHIFT, u0
+	mov	u0, %rdx
+	jmp	L(mid)
+EPILOGUE()
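
The loop above is the classic binary GCD for two odd operands; the table
strips up to MAXSHIFT trailing zeros per iteration (e.g. ctz_table[12] = 2).
A plain C sketch of the same computation, assuming both inputs odd and
nonzero:

    unsigned long
    ref_gcd_11 (unsigned long u, unsigned long v)
    {
      while (u != v)
        {
          if (u < v)
            {
              unsigned long t = u;
              u = v;                 /* keep u >= v */
              v = t;
            }
          u -= v;                    /* difference of odd numbers is even */
          do
            u >>= 1;                 /* strip the trailing zeros */
          while ((u & 1) == 0);
        }
      return u;
    }
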
diff --git a/third_party/gmp/mpn/x86_64/gcd_22.asm b/third_party/gmp/mpn/x86_64/gcd_22.asm
new file mode 100644
index 0000000..78f985f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/gcd_22.asm
@@ -0,0 +1,163 @@
+dnl  AMD64 mpn_gcd_22.  Assumes useless bsf, useless shrd, no tzcnt, no shlx.
+dnl  We actually use tzcnt here when the table cannot count the bits, as
+dnl  tzcnt always works for our use, and helps a lot on certain CPUs.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	 8.9
+C AMD K10	 8.8
+C AMD bd1	 9.7
+C AMD bd2	 7.8
+C AMD bd3	 ?
+C AMD bd4	 7.4
+C AMD bt1	 9.2
+C AMD bt2	 9.1
+C AMD zn1	 7.5
+C AMD zn2	 7.5
+C Intel P4	 ?
+C Intel CNR	10.5
+C Intel PNR	10.5
+C Intel NHM	 9.7
+C Intel WSM	 9.7
+C Intel SBR	10.7
+C Intel IBR	 ?
+C Intel HWL	 9.5
+C Intel BWL	 8.7
+C Intel SKL	 8.6
+C Intel atom	18.9
+C Intel SLM	14.0
+C Intel GLM	 9.8
+C Intel GLM+	 8.8
+C VIA nano	 ?
+
+
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 8)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u1',    `%rdi')
+define(`u0',    `%rsi')
+define(`v1',    `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0',    `%rax')
+define(`cnt',   `%rcx')
+
+define(`s0',    `%r8')
+define(`s1',    `%r9')
+define(`t0',    `%rcx')
+define(`t1',    `%r11')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+	FUNC_ENTRY(4)
+	mov	v0_param, v0
+
+	LEA(	ctz_table, %r10)
+
+	ALIGN(16)
+L(top):	mov	v0, t0
+	sub	u0, t0
+	jz	L(lowz)		C	jump when low limb result = 0
+	mov	v1, t1
+	sbb	u1, t1
+
+	mov	u0, s0
+	mov	u1, s1
+
+	sub	v0, u0
+	sbb	v1, u1
+
+L(bck):	cmovc	t0, u0		C u = |u - v|
+	cmovc	t1, u1		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	cmovc	s1, v1		C v = min(u,v)
+
+	and	$MASK, R32(t0)
+	movzbl	(%r10,t0), R32(cnt)
+	jz	L(count_better)
+C Rightshift (u1,,u0) into (u1,,u0)
+L(shr):	shr	R8(cnt), u0
+	mov	u1, t1
+	shr	R8(cnt), u1
+	neg	cnt
+	shl	R8(cnt), t1
+	or	t1, u0
+
+	test	v1, v1
+	jnz	L(top)
+	test	u1, u1
+	jnz	L(top)
+
+L(gcd_11):
+	mov	v0, %rdi
+C	mov	u0, %rsi
+	TCALL(	mpn_gcd_11)
+
+L(count_better):
+	rep;bsf	u0, cnt		C tzcnt!
+	jmp	L(shr)
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	mov	v1, t0
+	sub	u1, t0
+	je	L(end)
+
+	xor	t1, t1
+	mov	u0, s0
+	mov	u1, s1
+	mov	u1, u0
+	xor	u1, u1
+	sub	v1, u0
+	jmp	L(bck)
+
+L(end):	C mov	v0, %rax
+	C mov	v1, %rdx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
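
The same binary-GCD step, carried out on two-limb (128-bit) quantities; a
sketch using the GCC/Clang unsigned __int128 extension, again assuming odd
nonzero inputs, and ignoring the tail call into mpn_gcd_11 that the asm makes
once the high limbs vanish:

    unsigned __int128
    ref_gcd_22 (unsigned __int128 u, unsigned __int128 v)
    {
      while (u != v)
        {
          if (u < v)
            {
              unsigned __int128 t = u;
              u = v;
              v = t;
            }
          u -= v;
          while ((u & 1) == 0)
            u >>= 1;
        }
      return u;
    }
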
diff --git a/third_party/gmp/mpn/x86_64/gmp-mparam.h b/third_party/gmp/mpn/x86_64/gmp-mparam.h
new file mode 100644
index 0000000..db94fb7
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/gmp-mparam.h
@@ -0,0 +1,217 @@
+/* AMD K8-K10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2010, 2012 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      7
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           15
+
+#define MUL_TOOM22_THRESHOLD                27
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               234
+#define MUL_TOOM6H_THRESHOLD               418
+#define MUL_TOOM8H_THRESHOLD               466
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     160
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     175
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 36
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                327
+#define SQR_TOOM6_THRESHOLD                446
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define POWM_SEC_TABLE  2,67,322,991
+
+#define MUL_FFT_MODF_THRESHOLD             570  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    570, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     25, 8}, {     13, 7}, {     29, 8}, \
+    {     15, 7}, {     31, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     21, 7}, {     43, 8}, \
+    {     23, 7}, {     47, 8}, {     25, 7}, {     51, 8}, \
+    {     29, 9}, {     15, 8}, {     37, 9}, {     19, 8}, \
+    {     43, 9}, {     23, 8}, {     51, 9}, {     27, 8}, \
+    {     55,10}, {     15, 9}, {     43,10}, {     23, 9}, \
+    {     55,10}, {     31, 9}, {     63, 5}, {   1023, 4}, \
+    {   2431, 5}, {   1279, 6}, {    671, 7}, {    367, 8}, \
+    {    189, 9}, {     95, 8}, {    195, 9}, {    111,11}, \
+    {     31, 9}, {    131,10}, {     71, 9}, {    155,10}, \
+    {     79, 9}, {    159,10}, {     87,11}, {     47,10}, \
+    {    111,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    167,11}, {     95,10}, {    191,11}, {    111,12}, \
+    {     63,11}, {    143,10}, {    287,11}, {    159,10}, \
+    {    319,11}, {    175,12}, {     95,11}, {    207,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    335,10}, {    671,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,12}, \
+    {    223,13}, {    127,12}, {    255,11}, {    543,12}, \
+    {    287,11}, {    575,10}, {   1151,11}, {    607,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,12}, \
+    {    351,11}, {    703,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,12}, {    447,14}, \
+    {    127,13}, {    255,12}, {    543,11}, {   1087,12}, \
+    {    607,11}, {   1215,13}, {    319,12}, {    671,11}, \
+    {   1343,12}, {    735,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    799,11}, {   1599,12}, {    831,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1343,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1599,13}, \
+    {    831,12}, {   1663,13}, {    895,12}, {   1791,13}, \
+    {    959,15}, {    255,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1215,14}, {    639,13}, {   1471,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1855,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,14}, {   1279,13}, {   2687,14}, {   1407,15}, \
+    {    767,14}, {   1535,13}, {   3071,14}, {   1791,16}, \
+    {    511,15}, {   1023,14}, {   2431,15}, {   1279,14}, \
+    {   2815,15}, {   1535,14}, {   3199,15}, {   1791,14}, \
+    {   3583,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 185
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             460  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    460, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     27, 7}, {     14, 6}, \
+    {     29, 7}, {     15, 6}, {     31, 7}, {     29, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     21, 7}, {     43, 8}, \
+    {     25, 7}, {     51, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     27, 8}, {     55,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     43,10}, {     23, 9}, \
+    {     55,11}, {     15,10}, {     31, 9}, {     71,10}, \
+    {     39, 9}, {     83,10}, {     47, 6}, {    767, 4}, \
+    {   3263, 5}, {   1727, 4}, {   3455, 5}, {   1791, 6}, \
+    {    927, 7}, {    479, 6}, {    959, 7}, {    511, 8}, \
+    {    271, 9}, {    147,10}, {     87,11}, {     47,10}, \
+    {     95,12}, {     31,11}, {     63,10}, {    135,11}, \
+    {     79,10}, {    167,11}, {     95,10}, {    191,11}, \
+    {    111,12}, {     63,11}, {    127,10}, {    255,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,12}, {     95,11}, {    191,10}, {    383, 9}, \
+    {    767,10}, {    399,11}, {    207,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,10}, {    575,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    335,10}, {    671,11}, \
+    {    351,10}, {    703,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    511,10}, {   1023,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    639,10}, \
+    {   1279,11}, {    671,12}, {    351,11}, {    703,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    447,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    671,11}, {   1343,12}, \
+    {    703,11}, {   1407,12}, {    735,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    799,11}, {   1599,12}, \
+    {    831,13}, {    447,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1087,13}, {    575,12}, {   1215,13}, \
+    {    639,12}, {   1343,13}, {    703,12}, {   1407,14}, \
+    {    383,13}, {    767,12}, {   1599,13}, {    831,12}, \
+    {   1663,13}, {    895,12}, {   1791,13}, {    959,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1215,14}, {    639,13}, {   1471,14}, {    767,13}, \
+    {   1663,14}, {    895,13}, {   1855,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2303,14}, \
+    {   1279,13}, {   2559,14}, {   1407,15}, {    767,14}, \
+    {   1535,13}, {   3071,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2303,15}, {   1279,14}, {   2687,15}, \
+    {   1535,14}, {   3199,15}, {   1791,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 203
+#define SQR_FFT_THRESHOLD                 5248
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  35
+#define MULLO_MUL_N_THRESHOLD            15604
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             220
+#define DC_BDIV_QR_THRESHOLD                52
+#define DC_BDIV_Q_THRESHOLD                152
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               226
+#define INV_APPR_THRESHOLD                 214
+
+#define BINV_NEWTON_THRESHOLD              327
+#define REDC_1_TO_REDC_2_THRESHOLD           4
+#define REDC_2_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               1895
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD              106
+#define MU_BDIV_QR_THRESHOLD              1589
+#define MU_BDIV_Q_THRESHOLD               1718
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD_THRESHOLD                     125
+#define HGCD_APPR_THRESHOLD                173
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   555
+#define GCDEXT_DC_THRESHOLD                478
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        28
+#define SET_STR_DC_THRESHOLD               248
+#define SET_STR_PRECOMPUTE_THRESHOLD      1648
+
+#define FAC_DSC_THRESHOLD                 1075
+#define FAC_ODD_THRESHOLD                    0  /* always */
diff --git a/third_party/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm
new file mode 100644
index 0000000..06c5d5d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/goldmont/aorrlsh_n.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_addlsh_n, mpn_rsblsh_n, optimised for Intel Goldmont.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+include_mpn(`x86_64/k8/aorrlsh_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/goldmont/aors_n.asm b/third_party/gmp/mpn/x86_64/goldmont/aors_n.asm
new file mode 100644
index 0000000..1818f9f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/goldmont/aors_n.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Goldmont.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreihwl/aors_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/goldmont/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/goldmont/aorsmul_1.asm
new file mode 100644
index 0000000..9c5f631
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/goldmont/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Goldmont.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+include_mpn(`x86_64/bd1/aorsmul_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/goldmont/gmp-mparam.h b/third_party/gmp/mpn/x86_64/goldmont/gmp-mparam.h
new file mode 100644
index 0000000..531521d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/goldmont/gmp-mparam.h
@@ -0,0 +1,264 @@
+/* Intel Goldmont gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* 2200 MHz Intel Atom C3758 Goldmont/Denverton */
+/* FFT tuning limit = 468,030,122 */
+/* Generated by tuneup.c, 2019-10-12, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 3
+#define MOD_1_UNNORM_THRESHOLD               5
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        38
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              3
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              17
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           19
+
+#define DIV_1_VS_MUL_1_PERCENT             301
+
+#define MUL_TOOM22_THRESHOLD                23
+#define MUL_TOOM33_THRESHOLD                65
+#define MUL_TOOM44_THRESHOLD               178
+#define MUL_TOOM6H_THRESHOLD               258
+#define MUL_TOOM8H_THRESHOLD               357
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     131
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     121
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     129
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     178
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                113
+#define SQR_TOOM4_THRESHOLD                290
+#define SQR_TOOM6_THRESHOLD                351
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               16
+
+#define MUL_FFT_MODF_THRESHOLD             440  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    440, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     21, 7}, {     11, 6}, \
+    {     24, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     24, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     31, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {    103,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    159,11}, {     95,10}, \
+    {    191,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    255,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,10}, {    319,12}, {     95,11}, {    191,10}, \
+    {    383, 9}, {    767,11}, {    207,10}, {    415,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,12}, {    159,11}, {    319,10}, {    639,11}, \
+    {    367,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    607,12}, \
+    {    319,11}, {    639,12}, {    351,11}, {    703,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    479,14}, {    127,13}, {    255,12}, \
+    {    543,11}, {   1087,12}, {    607,13}, {    319,12}, \
+    {    671,11}, {   1343,12}, {    703,11}, {   1407,12}, \
+    {    735,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    831,13}, {    447,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1023,11}, {   2047,12}, {   1087,13}, \
+    {    575,12}, {   1215,11}, {   2431,10}, {   4863,13}, \
+    {    639,12}, {   1279,11}, {   2559,12}, {   1343,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1727,13}, {    959,15}, \
+    {    255,14}, {    511,13}, {   1023,12}, {   2047,13}, \
+    {   1087,12}, {   2175,13}, {   1151,12}, {   2303,13}, \
+    {   1215,12}, {   2431,11}, {   4863,14}, {    639,13}, \
+    {   1279,12}, {   2559,13}, {   1343,12}, {   2687,13}, \
+    {   1407,12}, {   2815,13}, {   1471,12}, {   2943,11}, \
+    {   5887,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1727,14}, {    895,13}, {   1791,12}, {   3583,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2303,12}, {   4607,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2943,12}, {   5887,15}, {    767,14}, {   1535,13}, \
+    {   3071,14}, {   1663,13}, {   3455,12}, {   6911,14}, \
+    {   1791,13}, {   3583,14}, {   1919,13}, {   3839,16}, \
+    {    511,15}, {   1023,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,12}, {  11775,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,12}, {  15359,14}, {   3967,16}, \
+    {   1023,15}, {   2047,14}, {   4351,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,13}, {  11775,16}, \
+    {   1535,15}, {   3071,14}, {   6143,15}, {   3327,14}, \
+    {   6911,15}, {   3839,14}, {   7679,13}, {  15359,17}, \
+    {   1023,16}, {   2047,15}, {   4351,14}, {   8703,15}, \
+    {   4863,16}, {   2559,15}, {   5887,14}, {  11775,16}, \
+    {   3071,15}, {   6911,16}, {   3583,15}, {   7679,14}, \
+    {  15359,15}, {   7935,17}, {   2047,16}, {   4095,15}, \
+    {   8703,16}, {   4607,15}, {   9983,14}, {  19967,16}, \
+    {   5119,15}, {  10239,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 261
+#define MUL_FFT_THRESHOLD                 4544
+
+#define SQR_FFT_MODF_THRESHOLD             380  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    380, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     11, 5}, {     23, 6}, {     12, 5}, {     25, 6}, \
+    {     25, 7}, {     13, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     25, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271, 9}, {    543,10}, {    287, 9}, \
+    {    575,10}, {    303, 9}, {    607,10}, {    319, 9}, \
+    {    639,12}, {     95,11}, {    191,10}, {    383,11}, \
+    {    207,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,10}, {    607,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    415,12}, {    223,11}, {    479,13}, \
+    {    127,12}, {    255,11}, {    543,12}, {    287,11}, \
+    {    607,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,10}, {   1407,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    479,14}, {    127,13}, {    255,12}, \
+    {    607,13}, {    319,12}, {    703,11}, {   1407,12}, \
+    {    735,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    799,13}, {    447,12}, {    895,11}, {   1791,14}, \
+    {    255,13}, {    511,12}, {   1023,13}, {    575,12}, \
+    {   1151,11}, {   2303,12}, {   1215,13}, {    639,12}, \
+    {   1279,13}, {    703,12}, {   1407,14}, {    383,13}, \
+    {    767,12}, {   1535,13}, {    831,12}, {   1663,13}, \
+    {    895,12}, {   1791,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1023,12}, {   2047,13}, {   1087,12}, \
+    {   2175,13}, {   1151,12}, {   2303,13}, {   1215,12}, \
+    {   2431,14}, {    639,13}, {   1279,12}, {   2559,13}, \
+    {   1343,12}, {   2687,13}, {   1407,12}, {   2815,13}, \
+    {   1471,12}, {   2943,11}, {   5887,14}, {    767,13}, \
+    {   1535,12}, {   3071,13}, {   1599,12}, {   3199,13}, \
+    {   1663,14}, {    895,13}, {   1791,12}, {   3583,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2303,12}, {   4607,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2943,12}, \
+    {   5887,15}, {    767,14}, {   1535,13}, {   3199,14}, \
+    {   1663,13}, {   3455,12}, {   6911,14}, {   1791,13}, \
+    {   3583,14}, {   1919,16}, {    511,15}, {   1023,14}, \
+    {   2303,13}, {   4607,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,12}, {  11775,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3583,13}, {   7167,14}, {   3839,13}, {   7679,12}, \
+    {  15359,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2559,14}, {   5119,15}, \
+    {   2815,14}, {   5887,13}, {  11775,16}, {   1535,15}, \
+    {   3071,14}, {   6143,15}, {   3327,14}, {   6911,15}, \
+    {   3583,14}, {   7167,15}, {   3839,14}, {   7679,13}, \
+    {  15359,17}, {   1023,16}, {   2047,15}, {   4095,14}, \
+    {   8191,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,14}, {  15359,15}, {   7935,14}, {  15871,17}, \
+    {   2047,16}, {   4095,15}, {   8447,16}, {   4607,15}, \
+    {   9983,14}, {  19967,16}, {   5119,15}, {  10239,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 259
+#define SQR_FFT_THRESHOLD                 3520
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  62
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  13
+#define SQRLO_SQR_THRESHOLD               7035
+
+#define DC_DIV_QR_THRESHOLD                 51
+#define DC_DIVAPPR_Q_THRESHOLD             183
+#define DC_BDIV_QR_THRESHOLD                47
+#define DC_BDIV_Q_THRESHOLD                 88
+
+#define INV_MULMOD_BNM1_THRESHOLD           46
+#define INV_NEWTON_THRESHOLD               226
+#define INV_APPR_THRESHOLD                 204
+
+#define BINV_NEWTON_THRESHOLD              264
+#define REDC_1_TO_REDC_2_THRESHOLD          28
+#define REDC_2_TO_REDC_N_THRESHOLD          54
+
+#define MU_DIV_QR_THRESHOLD               1589
+#define MU_DIVAPPR_Q_THRESHOLD            1620
+#define MUPI_DIV_QR_THRESHOLD               83
+#define MU_BDIV_QR_THRESHOLD              1334
+#define MU_BDIV_Q_THRESHOLD               1470
+
+#define POWM_SEC_TABLE  1,16,194,642
+
+#define GET_STR_DC_THRESHOLD                10
+#define GET_STR_PRECOMPUTE_THRESHOLD        17
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1042
+
+#define FAC_DSC_THRESHOLD                  218
+#define FAC_ODD_THRESHOLD                   25
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    1  /* 6.58% faster than 3 */
+#define HGCD_THRESHOLD                     136
+#define HGCD_APPR_THRESHOLD                168
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   416
+#define GCDEXT_DC_THRESHOLD                393
+#define JACOBI_BASE_METHOD                   4  /* 1.17% faster than 3 */
+
+/* Tuneup completed successfully, took 800192 seconds */
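Note on the thresholds above: each *_THRESHOLD is an operand size, in 64-bit limbs, at which tuneup measured the next algorithm overtaking the previous one on this CPU. The sketch below is illustrative only — the real dispatch lives in mpn/generic/mul.c behind BELOW_THRESHOLD-style macros — but it shows how the Goldmont values band multiplication sizes:

  #include <stdio.h>

  /* Map a balanced n-limb by n-limb multiply to the algorithm these
     Goldmont thresholds select (constants copied from the header above).  */
  static const char *
  mul_algorithm (long n)
  {
    if (n < 23)   return "basecase";   /* MUL_TOOM22_THRESHOLD */
    if (n < 65)   return "toom22";     /* MUL_TOOM33_THRESHOLD */
    if (n < 178)  return "toom33";     /* MUL_TOOM44_THRESHOLD */
    if (n < 258)  return "toom44";     /* MUL_TOOM6H_THRESHOLD */
    if (n < 357)  return "toom6h";     /* MUL_TOOM8H_THRESHOLD */
    if (n < 4544) return "toom8h";     /* MUL_FFT_THRESHOLD */
    return "fft";
  }

  int
  main (void)
  {
    long sizes[] = { 10, 100, 1000, 10000 };
    for (int i = 0; i < 4; i++)
      printf ("%5ld limbs -> %s\n", sizes[i], mul_algorithm (sizes[i]));
    return 0;
  }
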
diff --git a/third_party/gmp/mpn/x86_64/goldmont/mul_1.asm b/third_party/gmp/mpn/x86_64/goldmont/mul_1.asm
new file mode 100644
index 0000000..ed1ec54
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/goldmont/mul_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mul_1 optimised for Intel Goldmont.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c)
+include_mpn(`x86_64/coreisbr/mul_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/goldmont/redc_1.asm b/third_party/gmp/mpn/x86_64/goldmont/redc_1.asm
new file mode 100644
index 0000000..1192635
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/goldmont/redc_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_redc_1 optimised for Intel Goldmont.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_redc_1)
+include_mpn(`x86_64/k8/redc_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/invert_limb.asm b/third_party/gmp/mpn/x86_64/invert_limb.asm
new file mode 100644
index 0000000..b375ad3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/invert_limb.asm
@@ -0,0 +1,112 @@
+dnl  AMD64 mpn_invert_limb -- Invert a normalized limb.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
+
+dnl  Copyright 2004, 2007-2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb (approx)	div
+C AMD K8,K9	 48			 71
+C AMD K10	 48			 77
+C Intel P4	135			161
+C Intel core2	 69			116
+C Intel corei	 55			 89
+C Intel atom	129			191
+C VIA nano	 79			157
+
+C rax rcx rdx rdi rsi r8
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+PROTECT(`mpn_invert_limb_table')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_invert_limb)		C			Kn	C2	Ci
+	FUNC_ENTRY(1)
+	mov	%rdi, %rax		C			 0	 0	 0
+	shr	$55, %rax		C			 1	 1	 1
+ifdef(`DARWIN',`
+	lea	mpn_invert_limb_table(%rip), %r8
+	add	$-512, %r8
+',`
+	lea	-512+mpn_invert_limb_table(%rip), %r8
+')
+	movzwl	(%r8,%rax,2), R32(%rcx)	C	%rcx = v0
+
+	C v1 = (v0 << 11) - (v0*v0*d40 >> 40) - 1
+	mov	%rdi, %rsi		C			 0	 0	 0
+	mov	R32(%rcx), R32(%rax)	C			 4	 5	 5
+	imul	R32(%rcx), R32(%rcx)	C			 4	 5	 5
+	shr	$24, %rsi		C			 1	 1	 1
+	inc	%rsi			C	%rsi = d40
+	imul	%rsi, %rcx		C			 8	10	 8
+	shr	$40, %rcx		C			12	15	11
+	sal	$11, R32(%rax)		C			 5	 6	 6
+	dec	R32(%rax)
+	sub	R32(%rcx), R32(%rax)	C	%rax = v1
+
+	C v2 = (v1 << 13) + (v1 * (2^60 - v1*d40) >> 47)
+	mov	$0x1000000000000000, %rcx
+	imul	%rax, %rsi		C			14	17	13
+	sub	%rsi, %rcx
+	imul	%rax, %rcx
+	sal	$13, %rax
+	shr	$47, %rcx
+	add	%rax, %rcx		C	%rcx = v2
+
+	C v3 = (v2 << 31) + (v2 * (2^96 - v2 * d63 + ((v2 >> 1) & mask)) >> 65)
+	mov	%rdi, %rsi		C			 0	 0	 0
+	shr	%rsi			C d/2
+	sbb	%rax, %rax		C -d0 = -(d mod 2)
+	sub	%rax, %rsi		C d63 = ceil(d/2)
+	imul	%rcx, %rsi		C v2 * d63
+	and	%rcx, %rax		C v2 * d0
+	shr	%rax			C (v2>>1) * d0
+	sub	%rsi, %rax		C (v2>>1) * d0 - v2 * d63
+	mul	%rcx
+	sal	$31, %rcx
+	shr	%rdx
+	add	%rdx, %rcx		C	%rcx = v3
+
+	mov	%rdi, %rax
+	mul	%rcx
+	add	%rdi, %rax
+	mov	%rcx, %rax
+	adc	%rdi, %rdx
+	sub	%rdx, %rax
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
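Note: the v0..v3 comments above are the whole algorithm — a table lookup followed by three Newton steps, in the style of Granlund and Möller's division-by-invariant-integers work. Below is a plain-C transcription of the instruction sequence; it is illustrative, not part of GMP. The table lookup is replaced by the equivalent division (see invert_limb_table.asm in the next hunk), and the result is checked against floor((2^128 - 1)/d) - 2^64, the value mpn_invert_limb returns for a normalized d.

  #include <assert.h>
  #include <stdint.h>

  typedef unsigned __int128 u128;

  static uint64_t
  invert_limb_ref (uint64_t d)            /* requires bit 63 of d set */
  {
    uint64_t v0 = 0x7fd00 / (d >> 55);    /* table seed, 11 bits */
    uint64_t d40 = (d >> 24) + 1;         /* ceil(d / 2^24), 40 bits */
    uint64_t v1 = (v0 << 11) - (v0 * v0 * d40 >> 40) - 1;
    uint64_t v2 = (v1 << 13) + (v1 * ((1ull << 60) - v1 * d40) >> 47);
    uint64_t d0 = d & 1;
    uint64_t d63 = (d >> 1) + d0;         /* ceil(d / 2) */
    /* e == 2^96 - v2*d63 + (v2>>1)*d0; the 2^96 term vanishes mod 2^64
       and the true value fits in 64 bits, so the wrap-around is exact.  */
    uint64_t e = ((v2 >> 1) & (0 - d0)) - v2 * d63;
    uint64_t v3 = (v2 << 31) + (uint64_t) ((u128) v2 * e >> 65);
    return v3 - (uint64_t) (((u128) d * v3 + d) >> 64) - d;
  }

  int
  main (void)
  {
    uint64_t ds[] = { 1ull << 63, (1ull << 63) + 1,
                      0x8765432187654321ull, ~0ull };
    for (int i = 0; i < 4; i++)
      {
        uint64_t want = (uint64_t) (~(u128) 0 / ds[i] - ((u128) 1 << 64));
        assert (invert_limb_ref (ds[i]) == want);
      }
    return 0;
  }
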
diff --git a/third_party/gmp/mpn/x86_64/invert_limb_table.asm b/third_party/gmp/mpn/x86_64/invert_limb_table.asm
new file mode 100644
index 0000000..739d59e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/invert_limb_table.asm
@@ -0,0 +1,50 @@
+dnl  Table used for mpn_invert_limb
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
+
+dnl  Copyright 2004, 2007-2009, 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+PROTECT(`mpn_invert_limb_table')
+
+ASM_START()
+C Table entry X contains floor (0x7fd00 / (0x100 + X))
+
+	RODATA
+	ALIGN(2)
+	GLOBL mpn_invert_limb_table
+mpn_invert_limb_table:
+forloop(i,256,512-1,dnl
+`	.value	eval(0x7fd00/i)
+')dnl
+ASM_END()
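Note: the forloop above expands at m4 time into 256 .value (16-bit) entries; an equivalent C generator, for illustration:

  #include <stdint.h>

  /* Entry i-256 holds floor(0x7fd00 / i) for i in [256, 512); that is
     the 11-bit seed v0 which invert_limb.asm (previous hunk) fetches
     using the top nine bits of the normalized divisor (d >> 55) and the
     -512 byte bias it applies to the table address.  */
  static uint16_t invert_limb_table[256];

  static void
  fill_invert_limb_table (void)
  {
    for (int i = 256; i < 512; i++)
      invert_limb_table[i - 256] = (uint16_t) (0x7fd00 / i);
  }
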
diff --git a/third_party/gmp/mpn/x86_64/k10/gcd_11.asm b/third_party/gmp/mpn/x86_64/k10/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_11.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/third_party/gmp/mpn/x86_64/k10/gcd_22.asm b/third_party/gmp/mpn/x86_64/k10/gcd_22.asm
new file mode 100644
index 0000000..f58b4cc
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/gcd_22.asm
@@ -0,0 +1,142 @@
+dnl  AMD64 mpn_gcd_22.  Assumes useful bsf, useless shrd, no tzcnt, no shlx.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/bit
+C AMD K8,K9	 ?
+C AMD K10	 7.4
+C AMD bd1	 9.9
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 ?
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4	 ?
+C Intel CNR	 ?
+C Intel PNR	 ?
+C Intel NHM	 9.2
+C Intel WSM	 9.0
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+
+
+define(`u1',    `%rdi')
+define(`u0',    `%rsi')
+define(`v1',    `%rdx')
+define(`v0_param', `%rcx')
+
+define(`v0',    `%rax')
+define(`cnt',   `%rcx')
+
+define(`s0',    `%r8')
+define(`s1',    `%r9')
+define(`t0',    `%r10')
+define(`t1',    `%r11')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_22)
+	FUNC_ENTRY(4)
+	mov	v0_param, v0
+
+	ALIGN(16)
+L(top):	mov	v0, t0
+	sub	u0, t0
+	jz	L(lowz)		C	jump when low limb result = 0
+	mov	v1, t1
+	sbb	u1, t1
+
+	mov	u0, s0
+	mov	u1, s1
+
+	bsf	t0, cnt
+
+	sub	v0, u0
+	sbb	v1, u1
+
+L(bck):	cmovc	t0, u0		C u = |u - v|
+	cmovnc	u1, t1		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	cmovc	s1, v1		C v = min(u,v)
+
+	shr	R8(cnt), u0
+	mov	t1, u1
+	shr	R8(cnt), u1
+	neg	cnt
+	shl	R8(cnt), t1
+	or	t1, u0
+
+	test	u1, u1
+	jnz	L(top)
+	test	v1, v1
+	jnz	L(top)
+
+L(gcd_11):
+	mov	v0, %rdi
+C	mov	u0, %rsi
+	TCALL(	mpn_gcd_11)
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	mov	v1, t0
+	sub	u1, t0
+	je	L(end)
+
+	xor	t1, t1
+	mov	u0, s0
+	mov	u1, s1
+	bsf	t0, cnt
+	mov	u1, u0
+	xor	u1, u1
+	sub	v1, u0
+	jmp	L(bck)
+
+L(end):	C mov	v0, %rax
+	C mov	v1, %rdx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
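Note: the bsf/cmov cluster above is one step of binary GCD on a two-limb pair — u = |u - v| with trailing zeros stripped, v = min(u, v) — falling back to mpn_gcd_11 once both high limbs vanish. An illustrative C model using unsigned __int128, assuming (as the shift counts require) that both inputs are odd:

  /* Subtract-and-shift binary GCD on 128-bit values; ctz128 also covers
     the L(lowz) case above, where the low limb of the difference is 0.  */
  typedef unsigned __int128 u128;

  static int
  ctz128 (u128 x)
  {
    unsigned long long lo = (unsigned long long) x;
    return lo ? __builtin_ctzll (lo)
              : 64 + __builtin_ctzll ((unsigned long long) (x >> 64));
  }

  static u128
  gcd_22_ref (u128 u, u128 v)              /* u, v odd */
  {
    while (u != v)
      {
        u128 hi = u > v ? u : v;
        u128 lo = u > v ? v : u;
        u128 t = hi - lo;                  /* |u - v|: even, nonzero */
        u = t >> ctz128 (t);               /* strip factors of 2 */
        v = lo;                            /* gcd(u,v) = gcd(|u-v|, min) */
      }
    return u;
  }
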
diff --git a/third_party/gmp/mpn/x86_64/k10/gmp-mparam.h b/third_party/gmp/mpn/x86_64/k10/gmp-mparam.h
new file mode 100644
index 0000000..349bace
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/gmp-mparam.h
@@ -0,0 +1,248 @@
+/* AMD K10 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#if 0
+#undef mpn_sublsh_n
+#define mpn_sublsh_n(rp,up,vp,n,c)					\
+  (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c))	\
+   : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
+#endif
+
+/* 3200-3600 MHz K10 Thuban */
+/* FFT tuning limit = 427,161,280 */
+/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        17
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        28
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           15
+
+#define DIV_1_VS_MUL_1_PERCENT             324
+
+#define MUL_TOOM22_THRESHOLD                27
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               232
+#define MUL_TOOM6H_THRESHOLD               363
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     155
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     145
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     142
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                280
+#define SQR_TOOM6_THRESHOLD                446
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             34
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             530  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    530, 5}, {     24, 6}, {     13, 5}, {     27, 6}, \
+    {     27, 7}, {     14, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     29, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     36, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     25, 7}, {     51, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     27, 8}, {     55,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     35, 8}, {     71, 9}, \
+    {     39, 8}, {     81, 9}, {     43,10}, {     23, 9}, \
+    {     55,11}, {     15,10}, {     31, 9}, {     71,10}, \
+    {     39, 9}, {     87,10}, {     47, 9}, {     99,10}, \
+    {     55,11}, {     31,10}, {     87,11}, {     47,10}, \
+    {    111,12}, {     31,11}, {     63,10}, {    143,11}, \
+    {     79,10}, {    167,11}, {     95,10}, {    191,11}, \
+    {    111,12}, {     63,11}, {    143,10}, {    287,11}, \
+    {    159,12}, {     95,11}, {    207,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543,11}, {    287,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    575,10}, \
+    {   1151,11}, {    607,12}, {    319,11}, {    671,12}, \
+    {    351,11}, {    703,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,12}, {    447,14}, \
+    {    127,13}, {    255,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,13}, {    319,12}, \
+    {    703,11}, {   1407,12}, {    735,13}, {    383,12}, \
+    {    831,13}, {    447,12}, {    959,14}, {    255,13}, \
+    {    511,12}, {   1087,13}, {    575,12}, {   1215,13}, \
+    {    639,12}, {   1343,13}, {    703,12}, {   1471,14}, \
+    {    383,13}, {    767,12}, {   1535,13}, {    831,12}, \
+    {   1663,13}, {    959,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1215,14}, {    639,13}, {   1471,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1855,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,14}, {   1279,13}, {   2559,14}, {   1407,15}, \
+    {    767,14}, {   1791,16}, {    511,15}, {   1023,14}, \
+    {   2431,15}, {   1279,14}, {   2943,15}, {   1535,14}, \
+    {   3199,15}, {   1791,14}, {   3583,16}, {   1023,15}, \
+    {   2047,14}, {   4223,15}, {   2303,14}, {   4863,15}, \
+    {   2559,14}, {   5247,15}, {   2815,16}, {   1535,15}, \
+    {   3071,14}, {   6271,15}, {   3327,14}, {   6911,15}, \
+    {   3583,17}, {   1023,16}, {   2047,15}, {   4351,14}, \
+    {   8959,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7167,17}, {   2047,16}, {   4095,15}, {   8959,16}, \
+    {   4607,15}, {   9983,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {   6143,15}, {  12543,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 207
+#define MUL_FFT_THRESHOLD                 7552
+
+#define SQR_FFT_MODF_THRESHOLD             476  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    476, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     29, 7}, {     28, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     21, 7}, {     43, 8}, \
+    {     23, 7}, {     47, 8}, {     29, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     49, 9}, {     27, 8}, {     55,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     43,10}, {     23, 9}, \
+    {     55,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     83,10}, {     47, 9}, {     95,10}, \
+    {     55,11}, {     31,10}, {     79,11}, {     47,10}, \
+    {    103,12}, {     31,11}, {     63,10}, {    135,11}, \
+    {     79,10}, {    167,11}, {    111,12}, {     63,11}, \
+    {    127,10}, {    255,11}, {    143,10}, {    287, 9}, \
+    {    575,11}, {    159,10}, {    319,12}, {     95,11}, \
+    {    191,10}, {    383, 9}, {    767,10}, {    399,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,12}, \
+    {    287,11}, {    575,10}, {   1151,11}, {    607,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,12}, \
+    {    351,11}, {    703,10}, {   1407,13}, {    191,12}, \
+    {    383,11}, {    799,12}, {    415,11}, {    831,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    607,13}, {    319,12}, {    639,11}, \
+    {   1279,12}, {    671,11}, {   1343,12}, {    703,11}, \
+    {   1407,12}, {    735,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1663,13}, {    447,12}, \
+    {    959,14}, {    255,13}, {    511,12}, {   1087,13}, \
+    {    575,12}, {   1215,13}, {    639,12}, {   1343,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1727,13}, {    895,12}, \
+    {   1791,13}, {    959,15}, {    255,14}, {    511,13}, \
+    {   1087,12}, {   2175,13}, {   1215,14}, {    639,13}, \
+    {   1471,14}, {    767,13}, {   1727,14}, {    895,13}, \
+    {   1791,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2303,14}, {   1279,13}, {   2559,14}, \
+    {   1407,15}, {    767,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2303,15}, {   1279,14}, {   2815,15}, \
+    {   1535,14}, {   3199,15}, {   1791,16}, {   1023,15}, \
+    {   2047,14}, {   4223,15}, {   2303,14}, {   4863,15}, \
+    {   2559,14}, {   5247,15}, {   2815,16}, {   1535,15}, \
+    {   3071,14}, {   6271,15}, {   3327,14}, {   6911,17}, \
+    {   1023,16}, {   2047,15}, {   4351,14}, {   8959,15}, \
+    {   4863,16}, {   2559,15}, {   5887,14}, {  11775,16}, \
+    {   3071,15}, {   6911,16}, {   3583,15}, {   7679,17}, \
+    {   2047,16}, {   4095,15}, {   8959,16}, {   4607,15}, \
+    {   9983,16}, {   5119,15}, {  10495,16}, {   5631,15}, \
+    {  11775,17}, {   3071,16}, {   6143,15}, {  12287,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 224
+#define SQR_FFT_THRESHOLD                 5568
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  61
+#define MULLO_MUL_N_THRESHOLD            14281
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD              10950
+
+#define DC_DIV_QR_THRESHOLD                 54
+#define DC_DIVAPPR_Q_THRESHOLD             238
+#define DC_BDIV_QR_THRESHOLD                54
+#define DC_BDIV_Q_THRESHOLD                 42
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 230
+
+#define BINV_NEWTON_THRESHOLD              327
+#define REDC_1_TO_REDC_2_THRESHOLD          25
+#define REDC_2_TO_REDC_N_THRESHOLD          67
+
+#define MU_DIV_QR_THRESHOLD               1620
+#define MU_DIVAPPR_Q_THRESHOLD            1620
+#define MUPI_DIV_QR_THRESHOLD              104
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               1652
+
+#define POWM_SEC_TABLE  1,22,321,473,2144
+
+#define GET_STR_DC_THRESHOLD                15
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               248
+#define SET_STR_PRECOMPUTE_THRESHOLD      1304
+
+#define FAC_DSC_THRESHOLD                  470
+#define FAC_ODD_THRESHOLD                   25
+
+#define MATRIX22_STRASSEN_THRESHOLD         17
+#define HGCD2_DIV1_METHOD                    5  /* 8.38% faster than 4 */
+#define HGCD_THRESHOLD                     115
+#define HGCD_APPR_THRESHOLD                146
+#define HGCD_REDUCE_THRESHOLD             3524
+#define GCD_DC_THRESHOLD                   535
+#define GCDEXT_DC_THRESHOLD                460
+#define JACOBI_BASE_METHOD                   1  /* 0.90% faster than 4 */
+
+/* Tuneup completed successfully, took 448763 seconds */
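Note on the #if 0 block near the top of this file: it would route mpn_sublsh_n (subtract a left-shifted limb vector) through the native mpn_submul_1 with multiplier 2^c whenever the destination aliases the first source, relying on the identity x - (y << c) == x - y * 2^c. A single-limb check of that identity, for illustration (the real mpn functions run over limb vectors and return the borrow):

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    uint64_t x = 0x123456789abcdef0ull, y = 0x0fedcba987654321ull;
    for (unsigned c = 0; c < 64; c++)
      assert ((uint64_t) (x - (y << c))
              == (uint64_t) (x - y * ((uint64_t) 1 << c)));
    return 0;
  }
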
diff --git a/third_party/gmp/mpn/x86_64/k10/hamdist.asm b/third_party/gmp/mpn/x86_64/k10/hamdist.asm
new file mode 100644
index 0000000..f70494a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/hamdist.asm
@@ -0,0 +1,109 @@
+dnl  AMD64 mpn_hamdist -- hamming distance.
+
+dnl  Copyright 2008, 2010-2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 2.0		=
+C AMD bd1	~4.4		=
+C AMD bd2	~4.4		=
+C AMD bd3
+C AMD bd4
+C AMD bobcat	 7.55		=
+C AMD jaguar	 2.52		-
+C Intel P4	 -
+C Intel core2	 -
+C Intel NHM	 2.03		+
+C Intel SBR	 2.01		+
+C Intel IBR	 1.96		+
+C Intel HWL	 1.64		=
+C Intel BWL	 1.56		-
+C Intel SKL	 1.52		=
+C Intel atom
+C Intel SLM	 3.0		-
+C VIA nano
+
+define(`ap',		`%rdi')
+define(`bp',		`%rsi')
+define(`n',		`%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_hamdist)
+	FUNC_ENTRY(3)
+	mov	(ap), %r8
+	xor	(bp), %r8
+
+	lea	(ap,n,8), ap			C point at A operand end
+	lea	(bp,n,8), bp			C point at B operand end
+	neg	n
+
+	test	$1, R8(n)
+	jz	L(2)
+
+L(1):	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	R32(%r10), R32(%r10)
+	inc	n
+	js	L(top)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(2):	mov	8(ap,n,8), %r9
+	.byte	0xf3,0x49,0x0f,0xb8,0xc0	C popcnt %r8, %rax
+	xor	8(bp,n,8), %r9
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n
+	js	L(top)
+	lea	(%r10, %rax), %rax
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(top):	mov	(ap,n,8), %r8
+	lea	(%r10, %rax), %rax
+	mov	8(ap,n,8), %r9
+	xor	(bp,n,8), %r8
+	xor	8(bp,n,8), %r9
+	.byte	0xf3,0x49,0x0f,0xb8,0xc8	C popcnt %r8, %rcx
+	lea	(%rcx, %rax), %rax
+	.byte	0xf3,0x4d,0x0f,0xb8,0xd1	C popcnt %r9, %r10
+	add	$2, n
+	js	L(top)
+
+	lea	(%r10, %rax), %rax
+	FUNC_EXIT()
+	ret
+EPILOGUE()
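Note: the unrolled loop above is exactly "XOR limb pairs, popcount, accumulate", with popcnt spelled in .byte form for the sake of old assemblers (see the comments in popcount.asm, next hunk). A reference version in C, for illustration:

  #include <stdint.h>

  static uint64_t
  hamdist_ref (const uint64_t *ap, const uint64_t *bp, long n)
  {
    uint64_t cnt = 0;
    for (long i = 0; i < n; i++)
      cnt += (uint64_t) __builtin_popcountll (ap[i] ^ bp[i]);
    return cnt;
  }
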
diff --git a/third_party/gmp/mpn/x86_64/k10/lshift.asm b/third_party/gmp/mpn/x86_64/k10/lshift.asm
new file mode 100644
index 0000000..cadf9b9
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/lshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshift optimised for AMD K10.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/k10/lshiftc.asm b/third_party/gmp/mpn/x86_64/k10/lshiftc.asm
new file mode 100644
index 0000000..48a92e5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshiftc optimised for AMD K10.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/k10/popcount.asm b/third_party/gmp/mpn/x86_64/k10/popcount.asm
new file mode 100644
index 0000000..3814aea
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/popcount.asm
@@ -0,0 +1,138 @@
+dnl  AMD64 mpn_popcount -- population count.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C AMD K8,K9		 n/a
+C AMD K10		 1.125
+C Intel P4		 n/a
+C Intel core2		 n/a
+C Intel corei		 1.25
+C Intel atom		 n/a
+C VIA nano		 n/a
+
+C * The zero-offset of popcount is misassembled to the offset-less form, which
+C   is one byte shorter and therefore will mess up the switching code.
+C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn,
+C   which is the main reason for our usage of '.byte'.
+
+C TODO
+C  * Improve switching code, the current code sucks.
+
+define(`up',		`%rdi')
+define(`n',		`%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_popcount)
+	FUNC_ENTRY(2)
+
+ifelse(1,1,`
+	lea	(up,n,8), up
+
+C	mov	R32(n), R32(%rcx)
+C	neg	R32(%rcx)
+	imul	$-1, R32(n), R32(%rcx)
+	and	$8-1, R32(%rcx)
+
+	neg	n
+
+	mov	R32(%rcx), R32(%rax)
+	neg	%rax
+	lea	(up,%rax,8),up
+
+	xor	R32(%rax), R32(%rax)
+
+	lea	(%rcx,%rcx,4), %rcx
+
+	lea	L(top)(%rip), %rdx
+	lea	(%rdx,%rcx,2), %rdx
+	jmp	*%rdx
+',`
+	lea	(up,n,8), up
+
+	mov	R32(n), R32(%rcx)
+	neg	R32(%rcx)
+	and	$8-1, R32(%rcx)
+
+	neg	n
+
+	mov	R32(%rcx), R32(%rax)
+	shl	$3, R32(%rax)
+	sub	%rax, up
+
+	xor	R32(%rax), R32(%rax)
+
+C	add	R32(%rcx), R32(%rcx)	C 2x
+C	lea	(%rcx,%rcx,4), %rcx	C 10x
+	imul	$10, R32(%rcx)
+
+	lea	L(top)(%rip), %rdx
+	add	%rcx, %rdx
+	jmp	*%rdx
+')
+
+	ALIGN(32)
+L(top):
+C 0 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00	C popcnt 0(up,n,8), %r8
+	add	%r8, %rax
+C 7 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08	C popcnt 8(up,n,8), %r9
+	add	%r9, %rax
+C 6 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10	C popcnt 16(up,n,8), %r8
+	add	%r8, %rax
+C 5 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18	C popcnt 24(up,n,8), %r9
+	add	%r9, %rax
+C 4 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20	C popcnt 32(up,n,8), %r8
+	add	%r8, %rax
+C 3 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28	C popcnt 40(up,n,8), %r9
+	add	%r9, %rax
+C 2 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30	C popcnt 48(up,n,8), %r8
+	add	%r8, %rax
+C 1 = n mod 8
+	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38	C popcnt 56(up,n,8), %r9
+	add	%r9, %rax
+
+	add	$8, n
+	js	L(top)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/k10/rshift.asm b/third_party/gmp/mpn/x86_64/k10/rshift.asm
new file mode 100644
index 0000000..249051a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/rshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_rshift optimised for AMD K10.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/k10/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/k10/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k10/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sec_tabselect.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
diff --git a/third_party/gmp/mpn/x86_64/k8/addmul_2.asm b/third_party/gmp/mpn/x86_64/k8/addmul_2.asm
new file mode 100644
index 0000000..78bcba1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/addmul_2.asm
@@ -0,0 +1,195 @@
+dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl  add the result to a third limb vector.
+
+dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb cfg	cycles/limb am1+am1
+C AMD K8,K9	 2.375
+C AMD K10	 2.375
+C AMD bull	 5.2		<-		4.6-4.75		bad
+C AMD pile	 4.96		<-		4.6-4.75		bad
+C AMD steam	 ?
+C AMD excavator	 ?
+C AMD bobcat	 5.75				5.0			bad
+C AMD jaguar	 5.9				5.2-5.4			bad
+C Intel P4	15-16
+C Intel core2	 4.5				4.25-4.5		bad
+C Intel NHM	 4.33				4.55			bad
+C Intel SBR	 3.4		 2.93		3.24			bad
+C Intel IBR	 3.35		 2.6		2.95			bad
+C Intel HWL	 3.3		 2.15		2.3			bad
+C Intel BWL	 2.33		 2.33		1.65			bad
+C Intel SKL	 2.37		 2.21		1.64			bad
+C Intel atom	20		18.7
+C Intel SLM	 8		 8.5
+C VIA nano	 4.4
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Tune feed-in and wind-down code.
+
+C INPUT PARAMETERS
+define(`rp',     `%rdi')
+define(`up',     `%rsi')
+define(`n_param',`%rdx')
+define(`vp',     `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+	FUNC_ENTRY(4)
+	mov	n_param, n
+	push	%rbx
+	push	%rbp
+
+	mov	0(vp), v0
+	mov	8(vp), v1
+
+	mov	R32(n_param), R32(%rbx)
+	mov	(up), %rax
+	lea	-8(up,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+	mul	v0
+	neg	n
+	and	$3, R32(%rbx)
+	jz	L(b0)
+	cmp	$2, R32(%rbx)
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	mov	%rax, w1
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	mov	8(up,n,8), %rax
+	dec	n
+	jmp	L(lo3)
+
+L(b2):	mov	%rax, w2
+	mov	8(up,n,8), %rax
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	add	$-2, n
+	jmp	L(lo2)
+
+L(b1):	mov	%rax, w3
+	mov	8(up,n,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	inc	n
+	jmp	L(lo1)
+
+L(b0):	mov	$0, R32(w3)
+	mov	%rax, w0
+	mov	8(up,n,8), %rax
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(lo0)
+
+	ALIGN(32)
+L(top):	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	(up,n,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(lo1):	mul	v1
+	add	w3, (rp,n,8)
+	mov	$0, R32(w3)
+	adc	%rax, w0
+	mov	$0, R32(w2)
+	mov	8(up,n,8), %rax
+	adc	%rdx, w1
+	mul	v0
+	add	%rax, w0
+	mov	8(up,n,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(lo0):	mul	v1
+	add	w0, 8(rp,n,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,n,8), %rax
+	mul	v0
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	16(up,n,8), %rax
+L(lo3):	mul	v1
+	add	w1, 16(rp,n,8)
+	adc	%rax, w2
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	mov	24(up,n,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	24(up,n,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+L(lo2):	mul	v1
+	add	w2, 24(rp,n,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	32(up,n,8), %rax
+	add	$4, n
+	js	L(top)
+
+L(end):	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	(up), %rax
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)
+	mul	v1
+	add	w3, (rp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(rp)
+	mov	w1, %rax
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
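As a reading aid for the loop above: mpn_addmul_2 fuses two addmul_1 passes into one loop, which is where the 2.375 c/l on K8 comes from. A reference model in portable C, assuming 64-bit limbs and a compiler with unsigned __int128; all names here are illustrative:

typedef unsigned long long limb_t;
typedef unsigned __int128 dlimb_t;

/* Helper: {rp,n} += {up,n} * v, returning the carry-out limb. */
static limb_t addmul_1_ref (limb_t *rp, const limb_t *up, long n, limb_t v)
{
  limb_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      dlimb_t t = (dlimb_t) up[i] * v + rp[i] + carry;
      rp[i] = (limb_t) t;
      carry = (limb_t) (t >> 64);
    }
  return carry;
}

/* What the asm computes: {rp,n} += {up,n} * {vp,2}.  The low n limbs
   accumulate in place, rp[n] is written (not added), and the limb
   belonging at position n+1 is returned, matching the wind-down above
   (add w3,(rp) / mov w0,8(rp) / mov w1,%rax). */
limb_t addmul_2_ref (limb_t *rp, const limb_t *up, long n, const limb_t *vp)
{
  rp[n] = addmul_1_ref (rp, up, n, vp[0]);
  return addmul_1_ref (rp + 1, up, n, vp[1]);
}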
diff --git a/third_party/gmp/mpn/x86_64/k8/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/k8/aorrlsh_n.asm
new file mode 100644
index 0000000..ff3a184
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/aorrlsh_n.asm
@@ -0,0 +1,217 @@
+dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V*2^k +- U.
+
+dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.87	< 3.85 for lshift + add_n
+C AMD K10	 2.75	< 3.85 for lshift + add_n
+C Intel P4	22	> 7.33 for lshift + add_n
+C Intel core2	 4.1	> 3.27 for lshift + add_n
+C Intel NHM	 4.4	> 3.75 for lshift + add_n
+C Intel SBR	 3.17	< 3.46 for lshift + add_n
+C Intel atom	 ?	? 8.75 for lshift + add_n
+C VIA nano	 4.7	< 6.25 for lshift + add_n
+
+C TODO
+C  * Can we propagate carry into rdx instead of using a special carry register?
+C    That could save enough insns to get to 10 cycles/iteration.
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param',  `%rcx')
+define(`cnt',      `%r8')
+
+define(`vp',    `%r12')
+define(`n',     `%rbp')
+
+ifdef(`OPERATION_addlsh_n',`
+  define(ADDSUB,       `add')
+  define(ADCSBB,       `adc')
+  define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+  define(ADDSUB,       `sub')
+  define(ADCSBB,       `sbb')
+  define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	(vp_param), %rax	C load first V limb early
+
+	mov	$0, R32(n)
+	sub	n_param, n
+
+	lea	-16(up,n_param,8), up
+	lea	-16(rp,n_param,8), rp
+	lea	16(vp_param,n_param,8), vp
+
+	mov	n_param, %r9
+
+	mov	%r8, %rcx
+	mov	$1, R32(%r8)
+	shl	R8(%rcx), %r8
+
+	mul	%r8			C initial multiply
+
+	and	$3, R32(%r9)
+	jz	L(b0)
+	cmp	$2, R32(%r9)
+	jc	L(b1)
+	jz	L(b2)
+
+L(b3):	mov	%rax, %r11
+	ADDSUB	16(up,n,8), %r11
+	mov	-8(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)
+	mov	%rdx, %rbx
+	mul	%r8
+	or	%rax, %rbx
+	mov	(vp,n,8), %rax
+	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	add	$3, n
+	jnz	L(lo3)
+	jmp	L(cj3)
+
+L(b2):	mov	%rax, %rbx
+	mov	-8(vp,n,8), %rax
+	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	add	$2, n
+	jz	L(cj2)
+	mov	%rdx, %r10
+	mov	-16(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r10
+	xor	R32(%rcx), R32(%rcx)	C clear carry register
+	jmp	L(lo2)
+
+L(b1):	mov	%rax, %r9
+	mov	%rdx, %r10
+	add	$1, n
+	jnz	L(gt1)
+	ADDSUB	8(up,n,8), %r9
+	jmp	L(cj1)
+L(gt1):	mov	-16(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r10
+	mov	%rdx, %r11
+	mov	-8(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r11
+	ADDSUB	8(up,n,8), %r9
+	ADCSBB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)
+	jmp	L(lo1)
+
+L(b0):	mov	%rax, %r10
+	mov	%rdx, %r11
+	mov	-8(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r11
+	ADDSUB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)
+	mov	%rdx, %rbx
+	mul	%r8
+	or	%rax, %rbx
+	mov	8(vp,n,8), %rax
+	add	$4, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	mov	%r10, -16(rp,n,8)
+L(lo3):	mov	%rdx, %r10
+	mov	-16(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r10
+	mov	%r11, -8(rp,n,8)
+L(lo2):	mov	%rdx, %r11
+	mov	-8(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r11
+	add	R32(%rcx), R32(%rcx)
+	ADCSBB	(up,n,8), %rbx
+	ADCSBB	8(up,n,8), %r9
+	ADCSBB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)
+	mov	%rbx, (rp,n,8)
+L(lo1):	mov	%rdx, %rbx
+	mul	%r8
+	or	%rax, %rbx
+	mov	%r9, 8(rp,n,8)
+L(lo0):	mov	8(vp,n,8), %rax
+	add	$4, n
+	jnz	L(top)
+
+L(end):	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	mov	%r10, -16(rp,n,8)
+L(cj3):	mov	%r11, -8(rp,n,8)
+L(cj2):	add	R32(%rcx), R32(%rcx)
+	ADCSBB	(up,n,8), %rbx
+	ADCSBB	8(up,n,8), %r9
+	mov	%rbx, (rp,n,8)
+L(cj1):	mov	%r9, 8(rp,n,8)
+	mov	%rdx, %rax
+	ADCSBB	$0, %rax
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	FUNC_EXIT()
+	ret
+EPILOGUE()
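A detail worth noticing in this file: rather than shifting, it materializes 2^cnt in a register (the mov $1 / shl / mul sequence just after FUNC_ENTRY) so each widening mul splits V*2^cnt into low and high limbs in one step. Functionally the operation is just the following; a portable model assuming 64-bit limbs and unsigned __int128, with illustrative names:

typedef unsigned long long limb_t;
typedef unsigned __int128 dlimb_t;

/* {rp,n} = {up,n} + ({vp,n} << cnt), 0 < cnt < 64.  Returns the
   carry-out, i.e. the bits shifted off the top of vp plus the final
   add carry (range 0 .. 2^cnt).  mpn_rsblsh_n is the same skeleton
   with the ADDSUB/ADCSBB macros flipped from add/adc to sub/sbb. */
limb_t addlsh_n_ref (limb_t *rp, const limb_t *up, const limb_t *vp,
                     long n, unsigned cnt)
{
  limb_t out = 0, carry = 0;
  for (long i = 0; i < n; i++)
    {
      limb_t s = (vp[i] << cnt) | out;   /* low part of vp[i]*2^cnt */
      out = vp[i] >> (64 - cnt);         /* high part, for next limb */
      dlimb_t t = (dlimb_t) up[i] + s + carry;
      rp[i] = (limb_t) t;
      carry = (limb_t) (t >> 64);
    }
  return out + carry;
}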
diff --git a/third_party/gmp/mpn/x86_64/k8/bdiv_q_1.asm b/third_party/gmp/mpn/x86_64/k8/bdiv_q_1.asm
new file mode 100644
index 0000000..1172b0d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/bdiv_q_1.asm
@@ -0,0 +1,179 @@
+dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
+dnl  returning quotient only.
+
+dnl  Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C	     norm/unorm
+C AMD K8,K9	10	+
+C AMD K10	10	+
+C AMD bull	13.7	-
+C AMD pile	13.7	+
+C AMD steam
+C AMD excavator
+C AMD bobcat	15	-
+C AMD jaguar	16	-
+C Intel P4	33	=
+C Intel core2	13.25	=
+C Intel NHM	14	=
+C Intel SBR	8.5	-
+C Intel IBR	8.5	-
+C Intel HWL	8	=
+C Intel BWL	8	=
+C Intel SKL	8	=
+C Intel atom	42	--
+C Intel SLM	20.4	--
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp',		`%rdi')
+define(`up',		`%rsi')
+define(`n',		`%rdx')
+define(`d',		`%rcx')
+define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
+define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	mov	%rcx, %rax
+	xor	R32(%rcx), R32(%rcx)	C shift count (ncnt), 0 for odd d
+	mov	%rdx, %r10
+
+	bt	$0, R32(%rax)
+	jnc	L(evn)			C skip bsf unless divisor is even
+
+L(odd):	mov	%rax, %rbx
+	shr	R32(%rax)
+	and	$127, R32(%rax)		C d/2, 7 bits
+
+	LEA(	binvert_limb_table, %rdx)
+
+	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
+
+	mov	%rbx, %r11		C d without twos
+
+	lea	(%rax,%rax), R32(%rdx)	C 2*inv
+	imul	R32(%rax), R32(%rax)	C inv*inv
+	imul	R32(%rbx), R32(%rax)	C inv*inv*d
+	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
+	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
+
+	lea	(%rax,%rax), %r8	C 2*inv
+	imul	%rax, %rax		C inv*inv
+	imul	%rbx, %rax		C inv*inv*d
+	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
+
+	jmp	L(pi1)
+
+L(evn):	bsf	%rax, %rcx
+	shr	R8(%rcx), %rax
+	jmp	L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	push	%rbx
+
+	mov	%rcx, %r11		C d
+	mov	%rdx, %r10		C n
+	mov	%r9, %rcx		C ncnt
+
+L(pi1):	mov	(up), %rax		C up[0]
+
+	dec	%r10
+	jz	L(one)
+
+	mov	8(up), %rdx		C up[1]
+	lea	(up,%r10,8), up		C up end
+	lea	(rp,%r10,8), rp		C rp end
+	neg	%r10			C -n
+
+	shrd	R8(%rcx), %rdx, %rax
+
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(ent)
+
+	ALIGN(8)
+L(top):
+	C rax	q
+	C rbx	carry bit, 0 or 1
+	C rcx	ncnt
+	C rdx
+	C r10	counter, limbs, negative
+	C r11	d
+
+	mul	%r11			C carry limb in rdx
+	mov	(up,%r10,8), %rax
+	mov	8(up,%r10,8), %r9
+	shrd	R8(%rcx), %r9, %rax
+	nop
+	sub	%rbx, %rax		C apply carry bit
+	setc	R8(%rbx)
+	sub	%rdx, %rax		C apply carry limb
+	adc	$0, R32(%rbx)
+L(ent):	imul	%r8, %rax
+	mov	%rax, (rp,%r10,8)
+	inc	%r10
+	jnz	L(top)
+
+	mul	%r11			C carry limb in rdx
+	mov	(up), %rax		C up high limb
+	shr	R8(%rcx), %rax
+	sub	%rbx, %rax		C apply carry bit
+	sub	%rdx, %rax		C apply carry limb
+	imul	%r8, %rax
+	mov	%rax, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(one):	shr	R8(%rcx), %rax
+	imul	%r8, %rax
+	mov	%rax, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
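The three staged blocks above ("inv = 2*inv - inv*inv*d, 16 bits" and so on) are Hensel/Newton lifting of a 2-adic inverse: if inv*d == 1 mod 2^k, then inv*(2 - inv*d) is correct mod 2^2k. The asm seeds 8 bits from binvert_limb_table; the sketch below uses the classical self-seed instead (d is its own inverse to 3 bits when odd), purely to stay self-contained:

typedef unsigned long long limb_t;

/* 2-adic inverse: returns inv with inv * d == 1 (mod 2^64), d odd. */
limb_t binvert_ref (limb_t d)
{
  limb_t inv = d;                    /* d*d == 1 (mod 8) for odd d */
  for (int bits = 3; bits < 64; bits *= 2)
    inv = 2 * inv - inv * inv * d;   /* same recurrence as the asm */
  return inv;
}

With inv in hand, the main loop obtains each quotient limb with a single wrapping multiply (the imul %r8,%rax at L(ent)) and propagates the high half of q*d as the next carry limb, which is what makes Hensel ("bdiv") division branch-free per limb.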
diff --git a/third_party/gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm b/third_party/gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..86de08c
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/div_qr_1n_pi1.asm
@@ -0,0 +1,249 @@
+dnl  x86-64 mpn_div_qr_1n_pi1
+dnl  -- Divide an mpn number by a normalized single-limb number,
+dnl     using a single-limb inverse.
+
+dnl  Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C AMD K8,K9	11
+C AMD K10	11
+C AMD bull	16
+C AMD pile	14.25
+C AMD steam	 ?
+C AMD bobcat	16
+C AMD jaguar	 ?
+C Intel P4	47.5	poor
+C Intel core	28.5	very poor
+C Intel NHM	29	very poor
+C Intel SBR	16	poor
+C Intel IBR	13.5
+C Intel HWL	12
+C Intel BWL	 ?
+C Intel atom	53	very poor
+C VIA nano	19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx')	C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8')	C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+	ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	dec	UN_INPUT
+	jnz	L(first)
+
+	C Just a single 2/1 division.
+	C T, U0 are allocated in scratch registers
+	lea	1(U1), T
+	mov	U1, %rax
+	mul	DINV
+	mov	(UP), U0
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(single_div_done)
+	sub	D, %rax
+	add	$1, T
+L(single_div_done):
+	mov	T, (QP)
+	FUNC_EXIT()
+	ret
+L(first):
+	C FIXME: Could delay some of these until we enter the loop.
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbx
+	push	%rbp
+
+	mov	D, B2
+	imul	DINV, B2
+	neg	B2
+	mov	B2, B2md
+	sub	D, B2md
+
+	C D not needed until final reduction
+	push	D
+	mov	UN_INPUT, UN	C Clobbers D
+
+	mov	DINV, %rax
+	mul	U1
+	mov	%rax, Q0
+	add	U1, %rdx
+	mov	%rdx, T
+
+	mov	B2, %rax
+	mul	U1
+	mov	-8(UP, UN, 8), U0
+	mov	(UP, UN, 8), U1
+	mov	T, (QP, UN, 8)
+	add	%rax, U0
+	adc	%rdx, U1
+	sbb	U2, U2
+	dec	UN
+	mov	U1, %rax
+	jz	L(final)
+	mov	$0, R32(Q1)
+
+	ALIGN(16)
+
+	C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
+	C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
+	C is zero, and carry holds an extra copy of U2.
+L(loop):
+	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+	C Remains to add in B (U1 + c)
+	cmovc	DINV, Q1
+	mov	U2, Q2
+	neg	Q2
+	mul	DINV
+	add	%rdx, Q1
+	adc	$0, Q2
+	add	Q0, Q1
+	mov	%rax, Q0
+	mov	B2, %rax
+	lea	(B2md, U0), T
+	adc	$0, Q2
+
+	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+	mul	U1
+	and	B2, U2
+	add	U2, U0
+	cmovnc	U0, T
+
+	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+	adc	U1, Q1
+	mov	-8(UP, UN, 8), U0
+	adc	Q2, 8(QP, UN, 8)
+	jc	L(q_incr)
+L(q_incr_done):
+	add	%rax, U0
+	mov	T, %rax
+	adc	%rdx, %rax
+	mov	Q1, (QP, UN, 8)
+	mov	$0, R32(Q1)
+	sbb	U2, U2
+	dec	UN
+	mov	%rax, U1
+	jnz	L(loop)
+
+L(final):
+	pop	D
+
+	mov	U2, Q1
+	and	D, U2
+	sub	U2, %rax
+	neg	Q1
+
+	mov	%rax, U1
+	sub	D, %rax
+	cmovc	U1, %rax
+	sbb	$-1, Q1
+
+	lea	1(%rax), T
+	mul	DINV
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(div_done)
+	sub	D, %rax
+	add	$1, T
+L(div_done):
+	add	T, Q0
+	mov	Q0, (QP)
+	adc	Q1, 8(QP)
+	jnc	L(done)
+L(final_q_incr):
+	addq	$1, 16(QP)
+	lea	8(QP), QP
+	jc	L(final_q_incr)
+
+L(done):
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(q_incr):
+	C U1 is not live, so use it for indexing
+	lea	16(QP, UN, 8), U1
+L(q_incr_loop):
+	addq	$1, (U1)
+	jnc	L(q_incr_done)
+	lea	8(U1), U1
+	jmp	L(q_incr_loop)
+EPILOGUE()
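The short path at the top (the "Just a single 2/1 division" block) is the Möller-Granlund division with a precomputed reciprocal: estimate the quotient from the high limb and DINV, then correct with at most two adjustments. A C model under the usual assumptions, stated explicitly since they are not spelled out in this file: d is normalized (top bit set), dinv = floor((B^2 - 1)/d) - B with B = 2^64, and the compiler provides unsigned __int128; names are illustrative:

typedef unsigned long long limb_t;
typedef unsigned __int128 dlimb_t;

/* Divide u1*B + u0 by d; quotient returned, remainder in *r. */
limb_t div_2by1_preinv (limb_t *r, limb_t u1, limb_t u0,
                        limb_t d, limb_t dinv)
{
  dlimb_t q = (dlimb_t) u1 * dinv;
  q += (((dlimb_t) u1 + 1) << 64) + u0;   /* add {u1 + 1, u0}, mod 2^128 */
  limb_t qh = (limb_t) (q >> 64);
  limb_t ql = (limb_t) q;
  limb_t rr = u0 - qh * d;                /* candidate remainder, wraps */
  if (rr > ql)                            /* estimate one too large */
    { qh--; rr += d; }
  if (rr >= d)                            /* rare second correction */
    { rr -= d; qh++; }
  *r = rr;
  return qh;
}

The main loop amortizes the same idea across limbs, keeping the pending quotient in the Q0/Q1/Q2 registers described in the loop comment.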
diff --git a/third_party/gmp/mpn/x86_64/k8/gmp-mparam.h b/third_party/gmp/mpn/x86_64/k8/gmp-mparam.h
new file mode 100644
index 0000000..d87cc3b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/gmp-mparam.h
@@ -0,0 +1,237 @@
+/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#if 0
+#undef mpn_sublsh_n
+#define mpn_sublsh_n(rp,up,vp,n,c)					\
+  (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c))	\
+   : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
+#endif
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 115,768,433 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        35
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           16
+
+#define DIV_1_VS_MUL_1_PERCENT             309
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               232
+#define MUL_TOOM6H_THRESHOLD               324
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     226
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                114
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                430
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             654  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    654, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     27, 7}, {     14, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     29, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     44, 8}, {     23, 7}, {     47, 8}, \
+    {     25, 7}, {     51, 8}, {     31, 7}, {     63, 8}, \
+    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     53, 9}, {     27, 8}, {     57, 9}, {     31, 8}, \
+    {     67, 9}, {     35, 8}, {     71, 9}, {     39, 8}, \
+    {     81, 9}, {     43,10}, {     23, 9}, {     55, 8}, \
+    {    111,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     87,10}, {     47, 9}, {     99,10}, {     55, 9}, \
+    {    111,11}, {     31,10}, {     63, 9}, {    131,10}, \
+    {     71, 9}, {    147,10}, {     87,11}, {     47,10}, \
+    {    111,11}, {     63,10}, {    143,11}, {     79,10}, \
+    {    167,11}, {     95,10}, {    199,11}, {    111,12}, \
+    {     63,11}, {    143,10}, {    287,11}, {    159,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1087,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1215,13}, {    639,12}, \
+    {   1279,13}, {    703,12}, {   1407,14}, {    383,13}, \
+    {    767,12}, {   1535,13}, {    831,12}, {   1663,13}, \
+    {    959,15}, {    255,14}, {    511,13}, {   1215,14}, \
+    {    639,13}, {   1471,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1855,15}, {    511,14}, {   1023,13}, \
+    {   2047,14}, {   1151,13}, {   2367,14}, {   1407,15}, \
+    {    767,14}, {   1791,16}, {    511,15}, {   1023,14}, \
+    {   2303,15}, {   1279,14}, {   2687,15}, {   1535,14}, \
+    {   3199,15}, {   1791,16}, {   1023,15}, {   2047,14}, \
+    {   4223,15}, {   2303,14}, {   4735,15}, {   2559,16}, \
+    {   1535,15}, {   3071,14}, {   6271,15}, {   3327,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183
+#define MUL_FFT_THRESHOLD                11520
+
+#define SQR_FFT_MODF_THRESHOLD             540  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    540, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     16, 6}, {     33, 7}, {     33, 8}, \
+    {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     25, 7}, {     51, 8}, {     29, 9}, {     15, 8}, \
+    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     27, 8}, {     55, 9}, {     31, 8}, \
+    {     65, 9}, {     35, 8}, {     71, 9}, {     43,10}, \
+    {     23, 9}, {     55,10}, {     31, 9}, {     71,10}, \
+    {     39, 9}, {     83,10}, {     47, 9}, {     99,10}, \
+    {     55, 9}, {    111,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     87,11}, {     47,10}, {    111,12}, \
+    {     31,11}, {     63,10}, {    143,11}, {     79,10}, \
+    {    167,11}, {     95,10}, {    191,11}, {    111,12}, \
+    {     63,11}, {    127, 9}, {    511,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,12}, {     95,11}, \
+    {    191,10}, {    383, 9}, {    767,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,10}, {    511, 9}, \
+    {   1023,11}, {    271,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    575,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,12}, {    223,11}, {    447,13}, \
+    {    127,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    639,10}, {   1279,11}, \
+    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,14}, {    127,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    607,11}, {   1215,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    671,11}, {   1343,12}, \
+    {    703,11}, {   1407,12}, {    735,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,13}, {    447,12}, \
+    {    959,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1343,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    895,12}, {   1791,13}, \
+    {    959,14}, {    511,13}, {   1215,14}, {    639,13}, \
+    {   1471,14}, {    767,13}, {   1663,14}, {    895,13}, \
+    {   1791,15}, {    511,14}, {   1023,13}, {   2111,14}, \
+    {   1151,13}, {   2303,14}, {   1407,15}, {    767,14}, \
+    {   1791,16}, {    511,15}, {   1023,14}, {   2303,15}, \
+    {   1279,14}, {   2687,15}, {   1535,14}, {   3199,15}, \
+    {   1791,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2559,16}, {   1535,15}, \
+    {   3071,14}, {   6271,15}, {   3327,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 202
+#define SQR_FFT_THRESHOLD                 7296
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  61
+#define MULLO_MUL_N_THRESHOLD            22239
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD              14281
+
+#define DC_DIV_QR_THRESHOLD                 47
+#define DC_DIVAPPR_Q_THRESHOLD             266
+#define DC_BDIV_QR_THRESHOLD                38
+#define DC_BDIV_Q_THRESHOLD                104
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              258
+#define REDC_1_TO_REDC_2_THRESHOLD          35
+#define REDC_2_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD               99
+#define MU_BDIV_QR_THRESHOLD              1787
+#define MU_BDIV_Q_THRESHOLD               1895
+
+#define POWM_SEC_TABLE  1,16,194,960,2825
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               248
+#define SET_STR_PRECOMPUTE_THRESHOLD      1747
+
+#define FAC_DSC_THRESHOLD                 1240
+#define FAC_ODD_THRESHOLD                   27
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    3  /* 4.10% faster than 5 */
+#define HGCD_THRESHOLD                     141
+#define HGCD_APPR_THRESHOLD                181
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   622
+#define GCDEXT_DC_THRESHOLD                496
+#define JACOBI_BASE_METHOD                   1  /* 0.97% faster than 3 */
+
+/* Tuneup completed successfully, took 131832 seconds */
diff --git a/third_party/gmp/mpn/x86_64/k8/mul_basecase.asm b/third_party/gmp/mpn/x86_64/k8/mul_basecase.asm
new file mode 100644
index 0000000..ca2efb9
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/mul_basecase.asm
@@ -0,0 +1,469 @@
+dnl  AMD64 mpn_mul_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.
+
+dnl  Copyright 2008, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.375
+C AMD K10	 2.375
+C Intel P4	15-16
+C Intel core2	 4.45
+C Intel corei	 4.35
+C Intel atom	 ?
+C VIA nano	 4.5
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Use fewer registers.  (how??? I can't see it -- david)
+C  * Avoid some "mov $0,r" and instead use "xor r,r".
+C  * Can the top of each L(addmul_outer_n) prologue be folded into the
+C    mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
+C    case where vn = 1 or 2; is it worth it?
+
+C INPUT PARAMETERS
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp',      `%rcx')
+define(`vn',      `%r8')
+
+define(`v0', `%r12')
+define(`v1', `%r9')
+
+define(`w0', `%rbx')
+define(`w1', `%r15')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n',  `%r11')
+define(`outer_addr', `%r14')
+define(`un',  `%r13')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	xor	R32(un), R32(un)
+	mov	(up), %rax
+	mov	(vp), v0
+
+	sub	un_param, un		C rdx used by mul
+	mov	un, n
+	mov	R32(un_param), R32(w0)
+
+	lea	(rp,un_param,8), rp
+	lea	(up,un_param,8), up
+
+	mul	v0
+
+	test	$1, R8(vn)
+	jz	L(mul_2)
+
+C ===========================================================
+C     mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+	and	$3, R32(w0)
+	jz	L(mul_1_prologue_0)
+	cmp	$2, R32(w0)
+	jc	L(mul_1_prologue_1)
+	jz	L(mul_1_prologue_2)
+
+L(mul_1_prologue_3):
+	add	$-1, n
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	mov	%rax, w3
+	mov	%rdx, w0
+	jmp	L(mul_1_entry_3)
+
+L(mul_1_prologue_0):
+	mov	%rax, w2
+	mov	%rdx, w3		C note: already w0 == 0
+	lea	L(addmul_outer_0)(%rip), outer_addr
+	jmp	L(mul_1_entry_0)
+
+L(mul_1_prologue_1):
+	cmp	$-1, un
+	jne	2f
+	mov	%rax, -8(rp)
+	mov	%rdx, (rp)
+	jmp	L(ret)
+2:	add	$1, n
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	mov	%rax, w1
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	mov	(up,n,8), %rax
+	jmp	L(mul_1_entry_1)
+
+L(mul_1_prologue_2):
+	add	$-2, n
+	lea	L(addmul_outer_2)(%rip), outer_addr
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	24(up,n,8), %rax
+	xor	R32(w2), R32(w2)
+	xor	R32(w3), R32(w3)
+	jmp	L(mul_1_entry_2)
+
+
+	C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
+
+	ALIGN(16)
+L(mul_1_top):
+	mov	w0, -16(rp,n,8)
+	add	%rax, w1
+	mov	(up,n,8), %rax
+	adc	%rdx, w2
+L(mul_1_entry_1):
+	xor	R32(w0), R32(w0)
+	mul	v0
+	mov	w1, -8(rp,n,8)
+	add	%rax, w2
+	adc	%rdx, w3
+L(mul_1_entry_0):
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	w2, (rp,n,8)
+	add	%rax, w3
+	adc	%rdx, w0
+L(mul_1_entry_3):
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	w3, 8(rp,n,8)
+	xor	R32(w2), R32(w2)	C zero
+	mov	w2, w3			C zero
+	add	%rax, w0
+	mov	24(up,n,8), %rax
+	mov	w2, w1			C zero
+	adc	%rdx, w1
+L(mul_1_entry_2):
+	mul	v0
+	add	$4, n
+	js	L(mul_1_top)
+
+	mov	w0, -16(rp)
+	add	%rax, w1
+	mov	w1, -8(rp)
+	adc	%rdx, w2
+	mov	w2, (rp)
+
+	add	$-1, vn			C vn -= 1
+	jz	L(ret)
+
+	mov	8(vp), v0
+	mov	16(vp), v1
+
+	lea	8(vp), vp		C vp += 1
+	lea	8(rp), rp		C rp += 1
+
+	jmp	*outer_addr
+
+C ===========================================================
+C     mul_2 for vp[0], vp[1] if vn is even
+
+	ALIGN(16)
+L(mul_2):
+	mov	8(vp), v1
+
+	and	$3, R32(w0)
+	jz	L(mul_2_prologue_0)
+	cmp	$2, R32(w0)
+	jz	L(mul_2_prologue_2)
+	jc	L(mul_2_prologue_1)
+
+L(mul_2_prologue_3):
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	add	$2, n
+	mov	%rax, -16(rp,n,8)
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	xor	R32(w0), R32(w0)
+	mov	-16(up,n,8), %rax
+	jmp	L(mul_2_entry_3)
+
+	ALIGN(16)
+L(mul_2_prologue_0):
+	add	$3, n
+	mov	%rax, w0
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	-24(up,n,8), %rax
+	lea	L(addmul_outer_0)(%rip), outer_addr
+	jmp	L(mul_2_entry_0)
+
+	ALIGN(16)
+L(mul_2_prologue_1):
+	mov	%rax, w3
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	jmp	L(mul_2_entry_1)
+
+	ALIGN(16)
+L(mul_2_prologue_2):
+	add	$1, n
+	lea	L(addmul_outer_2)(%rip), outer_addr
+	mov	$0, R32(w0)
+	mov	$0, R32(w1)
+	mov	%rax, w2
+	mov	-8(up,n,8), %rax
+	mov	%rdx, w3
+	jmp	L(mul_2_entry_2)
+
+	C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
+
+	ALIGN(16)
+L(mul_2_top):
+	mov	-32(up,n,8), %rax
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,n,8), %rax
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,n,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(mul_2_entry_0):
+	mul	v1
+	add	%rax, w1
+	mov	w0, -24(rp,n,8)
+	adc	%rdx, w2
+	mov	-16(up,n,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,n,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(rp,n,8)
+L(mul_2_entry_3):
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w3
+	adc	R32(w1), R32(w0)	C adc $0, w0
+L(mul_2_entry_2):
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(rp,n,8)
+	adc	%rdx, w0
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(mul_2_entry_1):
+	add	$4, n
+	mov	w3, -32(rp,n,8)
+	js	L(mul_2_top)
+
+	mov	-32(up,n,8), %rax	C FIXME: n is constant
+	mul	v1
+	add	%rax, w0
+	mov	w0, (rp)
+	adc	%rdx, w1
+	mov	w1, 8(rp)
+
+	add	$-2, vn			C vn -= 2
+	jz	L(ret)
+
+	mov	16(vp), v0
+	mov	24(vp), v1
+
+	lea	16(vp), vp		C vp += 2
+	lea	16(rp), rp		C rp += 2
+
+	jmp	*outer_addr
+
+
+C ===========================================================
+C     addmul_2 for remaining vp's
+
+	C in the following prologues, we reuse un to store the
+	C adjusted value of n that is reloaded on each iteration
+
+L(addmul_outer_0):
+	add	$3, un
+	lea	0(%rip), outer_addr
+
+	mov	un, n
+	mov	-24(up,un,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	-24(up,un,8), %rax
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(addmul_entry_0)
+
+L(addmul_outer_1):
+	mov	un, n
+	mov	(up,un,8), %rax
+	mul	v0
+	mov	%rax, w3
+	mov	(up,un,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	jmp	L(addmul_entry_1)
+
+L(addmul_outer_2):
+	add	$1, un
+	lea	0(%rip), outer_addr
+
+	mov	un, n
+	mov	-8(up,un,8), %rax
+	mul	v0
+	xor	R32(w0), R32(w0)
+	mov	%rax, w2
+	xor	R32(w1), R32(w1)
+	mov	%rdx, w3
+	mov	-8(up,un,8), %rax
+	jmp	L(addmul_entry_2)
+
+L(addmul_outer_3):
+	add	$2, un
+	lea	0(%rip), outer_addr
+
+	mov	un, n
+	mov	-16(up,un,8), %rax
+	xor	R32(w3), R32(w3)
+	mul	v0
+	mov	%rax, w1
+	mov	-16(up,un,8), %rax
+	mov	%rdx, w2
+	jmp	L(addmul_entry_3)
+
+	C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
+
+	ALIGN(16)
+L(addmul_top):
+	add	w3, -32(rp,n,8)
+	adc	%rax, w0
+	mov	-24(up,n,8), %rax
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,n,8), %rax
+	adc	%rdx, w1
+	adc	R32(w2), R32(w2)	C adc $0, w2
+L(addmul_entry_0):
+	mul	v1
+	xor	R32(w3), R32(w3)
+	add	w0, -24(rp,n,8)
+	adc	%rax, w1
+	mov	-16(up,n,8), %rax
+	adc	%rdx, w2
+	mul	v0
+	add	%rax, w1
+	mov	-16(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(addmul_entry_3):
+	mul	v1
+	add	w1, -16(rp,n,8)
+	adc	%rax, w2
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	xor	R32(w0), R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	-8(up,n,8), %rax
+	adc	R32(w1), R32(w0)	C adc $0, w0
+L(addmul_entry_2):
+	mul	v1
+	add	w2, -8(rp,n,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	(up,n,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(addmul_entry_1):
+	mul	v1
+	add	$4, n
+	js	L(addmul_top)
+
+	add	w3, -8(rp)
+	adc	%rax, w0
+	mov	w0, (rp)
+	adc	%rdx, w1
+	mov	w1, 8(rp)
+
+	add	$-2, vn			C vn -= 2
+	jz	L(ret)
+
+	lea	16(rp), rp		C rp += 2
+	lea	16(vp), vp		C vp += 2
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	jmp	*outer_addr
+
+	ALIGN(16)
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+EPILOGUE()
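Underneath the unrolling, this file is the quadratic schoolbook product: one pass over up per limb of vp, fused two limbs at a time via the mul_2/addmul_2 sections to reach the 2.375 c/l quoted above. The textbook form it implements, as a portable model (64-bit limbs and unsigned __int128 assumed; names illustrative; un >= vn >= 1, rp disjoint from the inputs):

typedef unsigned long long limb_t;
typedef unsigned __int128 dlimb_t;

/* {rp, un+vn} = {up,un} * {vp,vn}. */
void mul_basecase_ref (limb_t *rp, const limb_t *up, long un,
                       const limb_t *vp, long vn)
{
  for (long i = 0; i < un + vn; i++)
    rp[i] = 0;
  for (long j = 0; j < vn; j++)
    {
      limb_t carry = 0;
      for (long i = 0; i < un; i++)
        {
          dlimb_t t = (dlimb_t) up[i] * vp[j] + rp[i + j] + carry;
          rp[i + j] = (limb_t) t;
          carry = (limb_t) (t >> 64);
        }
      rp[un + j] = carry;          /* top limb of this row */
    }
}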
diff --git a/third_party/gmp/mpn/x86_64/k8/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/k8/mullo_basecase.asm
new file mode 100644
index 0000000..fa00f42
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/mullo_basecase.asm
@@ -0,0 +1,436 @@
+dnl  AMD64 mpn_mullo_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C     large trip count.  Instead, we should start with mul_2 for any operand
+C     size congruence class.
+C   * Stop iterating addmul_2 earlier, falling into straight-line triangle code
+C     for the last 2-3 iterations.
+C   * Perhaps implement n=4 special code.
+C   * The reload of the outer loop jump address hurts branch prediction.
+C   * The addmul_2 loop ends with an MUL whose high part is not used upon loop
+C     exit.
+
+C INPUT PARAMETERS
+define(`rp',	   `%rdi')
+define(`up',	   `%rsi')
+define(`vp_param', `%rdx')
+define(`n',	   `%rcx')
+
+define(`vp',	`%r11')
+define(`outer_addr', `%r8')
+define(`j',	`%r9')
+define(`v0',	`%r13')
+define(`v1',	`%r14')
+define(`w0',	`%rbx')
+define(`w1',	`%r15')
+define(`w2',	`%rbp')
+define(`w3',	`%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+	cmp	$4, n
+	jge	L(gen)
+	mov	(up), %rax		C u0
+	mov	(vp_param), %r8		C v0
+
+	lea	L(tab)(%rip), %r9
+ifdef(`PIC',
+`	movslq	(%r9,%rcx,4), %r10
+	add	%r10, %r9
+	jmp	*%r9
+',`
+	jmp	*(%r9,n,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(tab), L(tab))			C not allowed
+	JMPENT(	L(1), L(tab))			C 1
+	JMPENT(	L(2), L(tab))			C 2
+	JMPENT(	L(3), L(tab))			C 3
+dnl	JMPENT(	L(0m4), L(tab))			C 4
+dnl	JMPENT(	L(1m4), L(tab))			C 5
+dnl	JMPENT(	L(2m4), L(tab))			C 6
+dnl	JMPENT(	L(3m4), L(tab))			C 7
+dnl	JMPENT(	L(0m4), L(tab))			C 8
+dnl	JMPENT(	L(1m4), L(tab))			C 9
+dnl	JMPENT(	L(2m4), L(tab))			C 10
+dnl	JMPENT(	L(3m4), L(tab))			C 11
+	TEXT
+
+L(1):	imul	%r8, %rax
+	mov	%rax, (rp)
+	FUNC_EXIT()
+	ret
+
+L(2):	mov	8(vp_param), %r11
+	imul	%rax, %r11		C u0 x v1
+	mul	%r8			C u0 x v0
+	mov	%rax, (rp)
+	imul	8(up), %r8		C u1 x v0
+	lea	(%r11, %rdx), %rax
+	add	%r8, %rax
+	mov	%rax, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(3):	mov	8(vp_param), %r9	C v1
+	mov	16(vp_param), %r11
+	mul	%r8			C u0 x v0 -> <r1,r0>
+	mov	%rax, (rp)		C r0
+	mov	(up), %rax		C u0
+	mov	%rdx, %rcx		C r1
+	mul	%r9			C u0 x v1 -> <r2,r1>
+	imul	8(up), %r9		C u1 x v1 -> r2
+	mov	16(up), %r10
+	imul	%r8, %r10		C u2 x v0 -> r2
+	add	%rax, %rcx
+	adc	%rdx, %r9
+	add	%r10, %r9
+	mov	8(up), %rax		C u1
+	mul	%r8			C u1 x v0 -> <r2,r1>
+	add	%rax, %rcx
+	adc	%rdx, %r9
+	mov	%r11, %rax
+	imul	(up), %rax		C u0 x v2 -> r2
+	add	%rax, %r9
+	mov	%rcx, 8(rp)
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+
+L(0m4):
+L(1m4):
+L(2m4):
+L(3m4):
+L(gen):	push	%rbx
+	push	%rbp
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), %rax
+	mov	(vp_param), v0
+	mov	vp_param, vp
+
+	lea	(rp,n,8), rp
+	lea	(up,n,8), up
+	neg	n
+
+	mul	v0
+
+	test	$1, R8(n)
+	jz	L(mul_2)
+
+L(mul_1):
+	lea	-8(rp), rp
+	lea	-8(up), up
+	test	$2, R8(n)
+	jnz	L(mul_1_prologue_3)
+
+L(mul_1_prologue_2):		C n = 7, 11, 15, ...
+	lea	-1(n), j
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	mov	%rax, w0
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	xor	R32(w3), R32(w3)
+	mov	16(up,n,8), %rax
+	jmp	L(mul_1_entry_2)
+
+L(mul_1_prologue_3):		C n = 5, 9, 13, ...
+	lea	1(n), j
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	mov	%rax, w2
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	jmp	L(mul_1_entry_0)
+
+	ALIGN(16)
+L(mul_1_top):
+	mov	w0, -16(rp,j,8)
+	add	%rax, w1
+	mov	(up,j,8), %rax
+	adc	%rdx, w2
+	xor	R32(w0), R32(w0)
+	mul	v0
+	mov	w1, -8(rp,j,8)
+	add	%rax, w2
+	adc	%rdx, w3
+L(mul_1_entry_0):
+	mov	8(up,j,8), %rax
+	mul	v0
+	mov	w2, (rp,j,8)
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	16(up,j,8), %rax
+	mul	v0
+	mov	w3, 8(rp,j,8)
+	xor	R32(w2), R32(w2)	C zero
+	mov	w2, w3			C zero
+	add	%rax, w0
+	mov	24(up,j,8), %rax
+	mov	w2, w1			C zero
+	adc	%rdx, w1
+L(mul_1_entry_2):
+	mul	v0
+	add	$4, j
+	js	L(mul_1_top)
+
+	mov	w0, -16(rp)
+	add	%rax, w1
+	mov	w1, -8(rp)
+	adc	%rdx, w2
+
+	imul	(up), v0
+	add	v0, w2
+	mov	w2, (rp)
+
+	add	$1, n
+	jz	L(ret)
+
+	mov	8(vp), v0
+	mov	16(vp), v1
+
+	lea	16(up), up
+	lea	8(vp), vp
+	lea	24(rp), rp
+
+	jmp	*outer_addr
+
+
+L(mul_2):
+	mov	8(vp), v1
+	test	$2, R8(n)
+	jz	L(mul_2_prologue_3)
+
+	ALIGN(16)
+L(mul_2_prologue_1):
+	lea	0(n), j
+	mov	%rax, w3
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	mov	(up,n,8), %rax
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	jmp	L(mul_2_entry_1)
+
+	ALIGN(16)
+L(mul_2_prologue_3):
+	lea	2(n), j
+	mov	$0, R32(w3)
+	mov	%rax, w1
+	mov	(up,n,8), %rax
+	mov	%rdx, w2
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	jmp	L(mul_2_entry_3)
+
+	ALIGN(16)
+L(mul_2_top):
+	mov	-32(up,j,8), %rax
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	mov	w0, -24(rp,j,8)
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+L(mul_2_entry_3):
+	mov	$0, R32(w0)
+	mov	w1, -16(rp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	R32(w1), R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(rp,j,8)
+	adc	%rdx, w0
+	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(mul_2_entry_1):
+	add	$4, j
+	mov	w3, -32(rp,j,8)
+	js	L(mul_2_top)
+
+	imul	-16(up), v1
+	add	v1, w0
+	imul	-8(up), v0
+	add	v0, w0
+	mov	w0, -8(rp)
+
+	add	$2, n
+	jz	L(ret)
+
+	mov	16(vp), v0
+	mov	24(vp), v1
+
+	lea	16(vp), vp
+	lea	16(rp), rp
+
+	jmp	*outer_addr
+
+
+L(addmul_outer_1):
+	lea	-2(n), j
+	mov	-16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w3
+	mov	-16(up,n,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	jmp	L(addmul_entry_1)
+
+L(addmul_outer_3):
+	lea	0(n), j
+	mov	-16(up,n,8), %rax
+	xor	R32(w3), R32(w3)
+	mul	v0
+	mov	%rax, w1
+	mov	-16(up,n,8), %rax
+	mov	%rdx, w2
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	jmp	L(addmul_entry_3)
+
+	ALIGN(16)
+L(addmul_top):
+	add	w3, -32(rp,j,8)
+	adc	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	R32(w2), R32(w2)
+	mul	v1
+	xor	R32(w3), R32(w3)
+	add	w0, -24(rp,j,8)
+	adc	%rax, w1
+	mov	-16(up,j,8), %rax
+	adc	%rdx, w2
+	mul	v0
+	add	%rax, w1
+	mov	-16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(addmul_entry_3):
+	mul	v1
+	add	w1, -16(rp,j,8)
+	adc	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	xor	R32(w0), R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	-8(up,j,8), %rax
+	adc	R32(w1), R32(w0)
+	mul	v1
+	add	w2, -8(rp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(addmul_entry_1):
+	mul	v1
+	add	$4, j
+	js	L(addmul_top)
+
+	add	w3, -32(rp)
+	adc	%rax, w0
+
+	imul	-24(up), v0
+	add	v0, w0
+	add	w0, -24(rp)
+
+	add	$2, n
+	jns	L(ret)
+
+	lea	16(vp), vp
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	lea	-16(up), up
+
+	jmp	*outer_addr
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
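The operation here is the truncated product: only the low n limbs of {up,n}*{vp,n} are produced, so no partial product up[i]*vp[j] with i + j >= n is ever needed, and products landing exactly on the boundary limb can use plain wrapping multiplies, which is why each pass above finishes with imul. A portable model (64-bit limbs, unsigned __int128, illustrative names):

typedef unsigned long long limb_t;
typedef unsigned __int128 dlimb_t;

/* {rp,n} = low n limbs of {up,n} * {vp,n}, i.e. the product mod B^n. */
void mullo_basecase_ref (limb_t *rp, const limb_t *up,
                         const limb_t *vp, long n)
{
  for (long i = 0; i < n; i++)
    rp[i] = 0;
  for (long j = 0; j < n; j++)
    {
      limb_t carry = 0;
      for (long i = 0; i + j < n; i++)
        {
          dlimb_t t = (dlimb_t) up[i] * vp[j] + rp[i + j] + carry;
          rp[i + j] = (limb_t) t;
          carry = (limb_t) (t >> 64);
        }
      /* carry past position n-1 is discarded: only the low half matters */
    }
}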
diff --git a/third_party/gmp/mpn/x86_64/k8/mulmid_basecase.asm b/third_party/gmp/mpn/x86_64/k8/mulmid_basecase.asm
new file mode 100644
index 0000000..86f1414
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/mulmid_basecase.asm
@@ -0,0 +1,559 @@
+dnl  AMD64 mpn_mulmid_basecase
+
+dnl  Contributed by David Harvey.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C	     cycles/limb
+C K8,K9:	 2.375  (2.5 when un - vn is "small")
+C K10:		 ?
+C P4:		 ?
+C P6-15:	 ?
+
+C INPUT PARAMETERS
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn',      `%r8')
+
+define(`v0', `%r12')
+define(`v1', `%r9')
+
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n',  `%r11')
+define(`outer_addr', `%r14')
+define(`un',  `%r13')
+define(`vp',  `%r15')
+
+define(`vp_inner', `%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mulmid_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	vp_param, vp
+
+	C use un for row length (= un_param - vn + 1)
+	lea	1(un_param), un
+	sub	vn, un
+
+	lea	(rp,un,8), rp
+
+	cmp	$4, un		C TODO: needs tuning
+	jc	L(diagonal)
+
+	lea	(up,un_param,8), up
+
+	test	$1, vn
+	jz	L(mul_2)
+
+C ===========================================================
+C     mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+	mov	R32(un), R32(w0)
+
+	neg	un
+	mov	(up,un,8), %rax
+	mov	(vp), v0
+	mul	v0
+
+	and	$-4, un		C round down to multiple of 4
+	mov	un, n
+
+	and	$3, R32(w0)
+	jz	L(mul_1_prologue_0)
+	cmp	$2, R32(w0)
+	jc	L(mul_1_prologue_1)
+	jz	L(mul_1_prologue_2)
+
+L(mul_1_prologue_3):
+	mov	%rax, w3
+	mov	%rdx, w0
+	lea	L(addmul_prologue_3)(%rip), outer_addr
+	jmp	L(mul_1_entry_3)
+
+	ALIGN(16)
+L(mul_1_prologue_0):
+	mov	%rax, w2
+	mov	%rdx, w3		C note already w0 == 0
+	lea	L(addmul_prologue_0)(%rip), outer_addr
+	jmp	L(mul_1_entry_0)
+
+	ALIGN(16)
+L(mul_1_prologue_1):
+	add	$4, n
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	$0, R32(w3)
+	mov	(up,n,8), %rax
+	lea	L(addmul_prologue_1)(%rip), outer_addr
+	jmp	L(mul_1_entry_1)
+
+	ALIGN(16)
+L(mul_1_prologue_2):
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	24(up,n,8), %rax
+	mov	$0, R32(w2)
+	mov	$0, R32(w3)
+	lea	L(addmul_prologue_2)(%rip), outer_addr
+	jmp	L(mul_1_entry_2)
+
+
+	C this loop is 10 c/loop = 2.5 c/l on K8
+
+	ALIGN(16)
+L(mul_1_top):
+	mov	w0, -16(rp,n,8)
+	add	%rax, w1
+	mov	(up,n,8), %rax
+	adc	%rdx, w2
+L(mul_1_entry_1):
+	mov	$0, R32(w0)
+	mul	v0
+	mov	w1, -8(rp,n,8)
+	add	%rax, w2
+	adc	%rdx, w3
+L(mul_1_entry_0):
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	w2, (rp,n,8)
+	add	%rax, w3
+	adc	%rdx, w0
+L(mul_1_entry_3):
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	w3, 8(rp,n,8)
+	mov	$0, R32(w2)		C zero
+	mov	w2, w3			C zero
+	add	%rax, w0
+	mov	24(up,n,8), %rax
+	mov	w2, w1			C zero
+	adc	%rdx, w1
+L(mul_1_entry_2):
+	mul	v0
+	add	$4, n
+	js	L(mul_1_top)
+
+	mov	w0, -16(rp)
+	add	%rax, w1
+	mov	w1, -8(rp)
+	mov	w2, 8(rp)		C zero last limb of output
+	adc	%rdx, w2
+	mov	w2, (rp)
+
+	dec	vn
+	jz	L(ret)
+
+	lea	-8(up), up
+	lea	8(vp), vp
+
+	mov	un, n
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	jmp	*outer_addr
+
+C ===========================================================
+C     mul_2 for vp[0], vp[1] if vn is even
+
+	ALIGN(16)
+L(mul_2):
+	mov	R32(un), R32(w0)
+
+	neg	un
+	mov	-8(up,un,8), %rax
+	mov	(vp), v0
+	mov	8(vp), v1
+	mul	v1
+
+	and	$-4, un		C round down to multiple of 4
+	mov	un, n
+
+	and	$3, R32(w0)
+	jz	L(mul_2_prologue_0)
+	cmp	$2, R32(w0)
+	jc	L(mul_2_prologue_1)
+	jz	L(mul_2_prologue_2)
+
+L(mul_2_prologue_3):
+	mov	%rax, w1
+	mov	%rdx, w2
+	lea	L(addmul_prologue_3)(%rip), outer_addr
+	jmp	L(mul_2_entry_3)
+
+	ALIGN(16)
+L(mul_2_prologue_0):
+	mov	%rax, w0
+	mov	%rdx, w1
+	lea	L(addmul_prologue_0)(%rip), outer_addr
+	jmp	L(mul_2_entry_0)
+
+	ALIGN(16)
+L(mul_2_prologue_1):
+	mov	%rax, w3
+	mov	%rdx, w0
+	mov	$0, R32(w1)
+	lea	L(addmul_prologue_1)(%rip), outer_addr
+	jmp	L(mul_2_entry_1)
+
+	ALIGN(16)
+L(mul_2_prologue_2):
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,n,8), %rax
+	lea	L(addmul_prologue_2)(%rip), outer_addr
+	jmp	L(mul_2_entry_2)
+
+
+	C this loop is 18 c/loop = 2.25 c/l on K8
+
+	ALIGN(16)
+L(mul_2_top):
+	mov     -8(up,n,8), %rax
+	mul     v1
+	add     %rax, w0
+	adc     %rdx, w1
+L(mul_2_entry_0):
+	mov     $0, R32(w2)
+	mov     (up,n,8), %rax
+	mul     v0
+	add     %rax, w0
+	mov     (up,n,8), %rax
+	adc     %rdx, w1
+	adc     $0, R32(w2)
+	mul     v1
+	add     %rax, w1
+	mov     w0, (rp,n,8)
+	adc     %rdx, w2
+L(mul_2_entry_3):
+	mov     8(up,n,8), %rax
+	mul     v0
+	mov     $0, R32(w3)
+	add     %rax, w1
+	adc     %rdx, w2
+	mov     $0, R32(w0)
+	adc     $0, R32(w3)
+	mov     8(up,n,8), %rax
+	mov     w1, 8(rp,n,8)
+	mul     v1
+	add     %rax, w2
+	mov     16(up,n,8), %rax
+	adc     %rdx, w3
+L(mul_2_entry_2):
+	mov     $0, R32(w1)
+	mul     v0
+	add     %rax, w2
+	mov     16(up,n,8), %rax
+	adc     %rdx, w3
+	adc     $0, R32(w0)
+	mul     v1
+	add     %rax, w3
+	mov     w2, 16(rp,n,8)
+	adc     %rdx, w0
+L(mul_2_entry_1):
+	mov     24(up,n,8), %rax
+	mul     v0
+	add     %rax, w3
+	adc     %rdx, w0
+	adc     $0, R32(w1)
+	add     $4, n
+	mov     w3, -8(rp,n,8)
+	jnz     L(mul_2_top)
+
+	mov	w0, (rp)
+	mov	w1, 8(rp)
+
+	sub	$2, vn
+	jz	L(ret)
+
+	lea	16(vp), vp
+	lea	-16(up), up
+
+	mov	un, n
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	jmp	*outer_addr
+
+C ===========================================================
+C     addmul_2 for remaining vp's
+
+	ALIGN(16)
+L(addmul_prologue_0):
+	mov	-8(up,n,8), %rax
+	mul	v1
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	$0, R32(w3)
+	jmp	L(addmul_entry_0)
+
+	ALIGN(16)
+L(addmul_prologue_1):
+	mov	16(up,n,8), %rax
+	mul	v1
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	$0, R32(w2)
+	mov	24(up,n,8), %rax
+	jmp	L(addmul_entry_1)
+
+	ALIGN(16)
+L(addmul_prologue_2):
+	mov	8(up,n,8), %rax
+	mul	v1
+	mov	%rax, w3
+	mov	%rdx, w0
+	mov	$0, R32(w1)
+	jmp	L(addmul_entry_2)
+
+	ALIGN(16)
+L(addmul_prologue_3):
+	mov	(up,n,8), %rax
+	mul	v1
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	$0, R32(w0)
+	mov	$0, R32(w1)
+	jmp	L(addmul_entry_3)
+
+	C this loop is 19 c/loop = 2.375 c/l on K8
+
+	ALIGN(16)
+L(addmul_top):
+	mov	$0, R32(w3)
+	add	%rax, w0
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1
+	add	w0, -8(rp,n,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+L(addmul_entry_0):
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w1
+	mov	(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1
+	add	w1, (rp,n,8)
+	mov	$0, R32(w1)
+	adc	%rax, w2
+	mov	$0, R32(w0)
+	adc	%rdx, w3
+L(addmul_entry_3):
+	mov	8(up,n,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	8(up,n,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 8(rp,n,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+L(addmul_entry_2):
+	mov	16(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	16(up,n,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	w3, 16(rp,n,8)
+	nop			C don't ask...
+	adc	%rax, w0
+	mov	$0, R32(w2)
+	mov	24(up,n,8), %rax
+	adc	%rdx, w1
+L(addmul_entry_1):
+	mul	v0
+	add	$4, n
+	jnz	L(addmul_top)
+
+	add	%rax, w0
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+
+	add	w0, -8(rp)
+	adc	w1, (rp)
+	adc	w2, 8(rp)
+
+	sub	$2, vn
+	jz	L(ret)
+
+	lea	16(vp), vp
+	lea	-16(up), up
+
+	mov	un, n
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	jmp	*outer_addr
+
+C ===========================================================
+C     accumulate along diagonals if un - vn is small
+
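+	C Editor's note: "along diagonals" means that for each result
+	C position k the code sums every product u[i]*v[j] with i + j == k
+	C into the three-limb accumulator (w0,w1,w2), emitting one result
+	C limb per outer pass; roughly, in C:
+	C
+	C	for each k: { acc += (u128) u[i] * v[j] for all i + j == k;
+	C	              rp[k] = (limb) acc;  acc >>= 64; }
+	C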
+	ALIGN(16)
+L(diagonal):
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	xor	R32(w2), R32(w2)
+
+	neg	un
+
+	mov	R32(vn), %eax
+	and	$3, %eax
+	jz	L(diag_prologue_0)
+	cmp	$2, %eax
+	jc	L(diag_prologue_1)
+	jz	L(diag_prologue_2)
+
+L(diag_prologue_3):
+	lea	-8(vp), vp
+	mov	vp, vp_inner
+	add	$1, vn
+	mov	vn, n
+	lea	L(diag_entry_3)(%rip), outer_addr
+	jmp	L(diag_entry_3)
+
+L(diag_prologue_0):
+	mov	vp, vp_inner
+	mov	vn, n
+	lea	0(%rip), outer_addr
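+	C (editor's note: 0(%rip) is the address of the next insn, so the
+	C mov+jmp pair below is re-entered on each jmp *outer_addr)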
+	mov     -8(up,n,8), %rax
+	jmp	L(diag_entry_0)
+
+L(diag_prologue_1):
+	lea	8(vp), vp
+	mov	vp, vp_inner
+	add	$3, vn
+	mov	vn, n
+	lea	0(%rip), outer_addr
+	mov     -8(vp_inner), %rax
+	jmp	L(diag_entry_1)
+
+L(diag_prologue_2):
+	lea	-16(vp), vp
+	mov	vp, vp_inner
+	add	$2, vn
+	mov	vn, n
+	lea	0(%rip), outer_addr
+	mov	16(vp_inner), %rax
+	jmp	L(diag_entry_2)
+
+
+	C this loop is 10 c/loop = 2.5 c/l on K8
+
+	ALIGN(16)
+L(diag_top):
+	add     %rax, w0
+	adc     %rdx, w1
+	mov     -8(up,n,8), %rax
+	adc     $0, w2
+L(diag_entry_0):
+	mulq    (vp_inner)
+	add     %rax, w0
+	adc     %rdx, w1
+	adc     $0, w2
+L(diag_entry_3):
+	mov     -16(up,n,8), %rax
+	mulq    8(vp_inner)
+	add     %rax, w0
+	mov     16(vp_inner), %rax
+	adc     %rdx, w1
+	adc     $0, w2
+L(diag_entry_2):
+	mulq    -24(up,n,8)
+	add     %rax, w0
+	mov     24(vp_inner), %rax
+	adc     %rdx, w1
+	lea     32(vp_inner), vp_inner
+	adc     $0, w2
+L(diag_entry_1):
+	mulq    -32(up,n,8)
+	sub     $4, n
+	jnz	L(diag_top)
+
+	add	%rax, w0
+	adc	%rdx, w1
+	adc	$0, w2
+
+	mov	w0, (rp,un,8)
+
+	inc	un
+	jz	L(diag_end)
+
+	mov	vn, n
+	mov	vp, vp_inner
+
+	lea	8(up), up
+	mov	w1, w0
+	mov	w2, w1
+	xor	R32(w2), R32(w2)
+
+	jmp	*outer_addr
+
+L(diag_end):
+	mov	w1, (rp)
+	mov	w2, 8(rp)
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/k8/redc_1.asm b/third_party/gmp/mpn/x86_64/k8/redc_1.asm
new file mode 100644
index 0000000..9327b21
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/redc_1.asm
@@ -0,0 +1,591 @@
+dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 ?
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Micro-optimise, none performed thus far.
+C  * This looks different from other current redc_1.asm variants.  Consider
+C    adapting this to the mainstream style.
+C  * Is this code really faster than approaches which compute q0 later?
+C    Is the use of a jump table faster?  Or is the edge of this due to the
+C    inlined add_n code?
+C  * Put initial m[0] x q0 computation in header.
+C  * Put basecases at the file's end, single them out before the pushes.
+
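+C Editor's sketch (not upstream text): at the C level, mpn_redc_1 performs
+C the following word-by-word Montgomery reduction, with u0inv = -1/m[0] mod B
+C (cf. mpn/generic/redc_1.c); the caller does the final conditional subtract:
+C
+C	for (j = n; j > 0; j--)
+C	  {
+C	    q0 = up[0] * u0inv;                 /* mod B */
+C	    cy = mpn_addmul_1 (up, mp, n, q0);  /* zeroes up[0] */
+C	    up[0] = cy;                         /* save the carry limb */
+C	    up++;
+C	  }
+C	return mpn_add_n (rp, up, up - n, n);   /* fold in saved carries */
+C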
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r11')
+define(`nneg',        `%r12')
+define(`mp',          `%r13')
+define(`q0',          `%rbp')
+define(`vp',          `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbp
+	mov	(up), q0		C up[0]
+	push	%rbx
+	imul	u0inv, q0		C first q0, for all execution paths
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	n, nneg
+	neg	nneg
+	lea	(mp_param,n,8), mp	C mp += n
+	lea	-16(up,n,8), up		C up += n
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	lea	4(%rax), %r9
+	cmp	$4, R32(n)
+	cmovg	%r9, %rax
+	lea	L(tab)(%rip), %r9
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(0m4), L(tab))
+	JMPENT(	L(1m4), L(tab))
+	JMPENT(	L(2m4), L(tab))
+	JMPENT(	L(3m4), L(tab))
+	TEXT
+
+	ALIGN(16)
+L(1):	mov	(mp_param), %rax
+	mul	q0
+	add	8(up), %rax
+	adc	16(up), %rdx
+	mov	%rdx, (rp)
+	mov	$0, R32(%rax)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+
+	ALIGN(16)
+L(2):	mov	(mp_param), %rax
+	mul	q0
+	xor	R32(%r14), R32(%r14)
+	mov	%rax, %r10
+	mov	-8(mp), %rax
+	mov	%rdx, %r9
+	mul	q0
+	add	(up), %r10
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	add	8(up), %r9
+	adc	$0, %r14
+	mov	%r9, q0
+	imul	u0inv, q0
+	mov	-16(mp), %rax
+	mul	q0
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r10
+	mov	-8(mp), %rax
+	mov	%rdx, %r11
+	mul	q0
+	add	%r9, %r10
+	adc	%rax, %r11
+	adc	%rdx, %rbx
+	add	16(up), %r11
+	adc	$0, %rbx
+	xor	R32(%rax), R32(%rax)
+	add	%r11, %r14
+	adc	24(up), %rbx
+	mov	%r14, (rp)
+	mov	%rbx, 8(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+
+L(3):	mov	(mp_param), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	-16(mp), %rax
+	mul	q0
+	xor	R32(%r9), R32(%r9)
+	xor	R32(%r14), R32(%r14)
+	add	-8(up), %rbx
+	adc	%rax, %r10
+	mov	-8(mp), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	(up), %r10
+	mov	%r10, (up)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	mov	%r10, q0
+	imul	u0inv, q0
+	add	%r9, 8(up)
+	adc	$0, %r14
+	mov	%r14, -8(up)
+
+	mov	-24(mp), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	-16(mp), %rax
+	mul	q0
+	xor	R32(%r9), R32(%r9)
+	xor	R32(%r14), R32(%r14)
+	add	(up), %rbx
+	adc	%rax, %r10
+	mov	-8(mp), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	8(up), %r10
+	mov	%r10, 8(up)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	mov	%r10, q0
+	imul	u0inv, q0
+	add	%r9, 16(up)
+	adc	$0, %r14
+	mov	%r14, (up)
+
+	mov	-24(mp), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	-16(mp), %rax
+	mul	q0
+	xor	R32(%r9), R32(%r9)
+	xor	R32(%r14), R32(%r14)
+	add	8(up), %rbx
+	adc	%rax, %r10
+	mov	-8(mp), %rax
+	adc	%rdx, %r9
+	mul	q0
+	add	16(up), %r10
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	add	24(up), %r9
+	adc	$0, %r14
+
+	xor	R32(%rax), R32(%rax)
+	add	-8(up), %r10
+	adc	(up), %r9
+	adc	32(up), %r14
+	mov	%r10, (rp)
+	mov	%r9, 8(rp)
+	mov	%r14, 16(rp)
+	adc	R32(%rax), R32(%rax)
+	jmp	L(ret)
+
+
+	ALIGN(16)
+L(2m4):
+L(lo2):	mov	(mp,nneg,8), %rax
+	mul	q0
+	xor	R32(%r14), R32(%r14)
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r10
+	mov	8(mp,nneg,8), %rax
+	mov	24(up,nneg,8), %r15
+	mov	%rdx, %r9
+	mul	q0
+	add	16(up,nneg,8), %r10
+	adc	%rax, %r9
+	mov	16(mp,nneg,8), %rax
+	adc	%rdx, %r14
+	mul	q0
+	mov	$0, R32(%r10)		C xor?
+	lea	2(nneg), i
+	add	%r9, %r15
+	imul	u0inv, %r15
+	jmp	 L(e2)
+
+	ALIGN(16)
+L(li2):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+L(e2):	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li2)
+
+L(le2):	add	%r10, (up)
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	add	$8, up
+	mov	%r15, q0
+	dec	n
+	jnz	L(lo2)
+
+	mov	nneg, n
+	sar	$2, n
+	lea	32(up,nneg,8), up
+	lea	(up,nneg,8), vp
+
+	mov	-16(up), %r8
+	mov	-8(up), %r9
+	add	-16(vp), %r8
+	adc	-8(vp), %r9
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	lea	16(rp), rp
+	jmp	L(addx)
+
+
+	ALIGN(16)
+L(1m4):
+L(lo1):	mov	(mp,nneg,8), %rax
+	xor	%r9, %r9
+	xor	R32(%rbx), R32(%rbx)
+	mul	q0
+	mov	%rax, %r9
+	mov	8(mp,nneg,8), %rax
+	mov	24(up,nneg,8), %r15
+	mov	%rdx, %r14
+	mov	$0, R32(%r10)		C xor?
+	mul	q0
+	add	16(up,nneg,8), %r9
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	16(mp,nneg,8), %rax
+	mul	q0
+	lea	1(nneg), i
+	add	%r14, %r15
+	imul	u0inv, %r15
+	jmp	 L(e1)
+
+	ALIGN(16)
+L(li1):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+L(e1):	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li1)
+
+L(le1):	add	%r10, (up)
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	add	$8, up
+	mov	%r15, q0
+	dec	n
+	jnz	L(lo1)
+
+	mov	nneg, n
+	sar	$2, n
+	lea	24(up,nneg,8), up
+	lea	(up,nneg,8), vp
+
+	mov	-8(up), %r8
+	add	-8(vp), %r8
+	mov	%r8, (rp)
+	lea	8(rp), rp
+	jmp	L(addx)
+
+
+	ALIGN(16)
+L(0):
+L(0m4):
+L(lo0):	mov	(mp,nneg,8), %rax
+	mov	nneg, i
+	mul	q0
+	xor	R32(%r10), R32(%r10)
+	mov	%rax, %r14
+	mov	%rdx, %rbx
+	mov	8(mp,nneg,8), %rax
+	mov	24(up,nneg,8), %r15
+	mul	q0
+	add	16(up,nneg,8), %r14
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	add	%rbx, %r15
+	imul	u0inv, %r15
+	jmp	L(e0)
+
+	ALIGN(16)
+L(li0):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+L(e0):	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li0)
+
+L(le0):	add	%r10, (up)
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	add	$8, up
+	mov	%r15, q0
+	dec	n
+	jnz	L(lo0)
+
+	mov	nneg, n
+	sar	$2, n
+	clc
+	lea	16(up,nneg,8), up
+	lea	(up,nneg,8), vp
+	jmp	L(addy)
+
+
+	ALIGN(16)
+L(3m4):
+L(lo3):	mov	(mp,nneg,8), %rax
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	8(mp,nneg,8), %rax
+	mov	24(up,nneg,8), %r15
+	mul	q0
+	add	16(up,nneg,8), %rbx	C result is zero, might carry
+	mov	$0, R32(%rbx)		C zero
+	mov	%rbx, %r14		C zero
+	adc	%rax, %r10
+	mov	16(mp,nneg,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	add	%r10, %r15
+	mul	q0
+	lea	3(nneg), i
+	imul	u0inv, %r15
+C	jmp	L(li3)
+
+	ALIGN(16)
+L(li3):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li3)
+
+L(le3):	add	%r10, (up)
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	mov	%r15, q0
+	lea	8(up), up
+	dec	n
+	jnz	L(lo3)
+
+
+C ==== Addition code ====
+	mov	nneg, n
+	sar	$2, n
+	lea	40(up,nneg,8), up
+	lea	(up,nneg,8), vp
+
+	mov	-24(up), %r8
+	mov	-16(up), %r9
+	mov	-8(up), %r10
+	add	-24(vp), %r8
+	adc	-16(vp), %r9
+	adc	-8(vp), %r10
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	lea	24(rp), rp
+
+L(addx):inc	n
+	jz	L(ad3)
+
+L(addy):mov	(up), %r8
+	mov	8(up), %r9
+	inc	n
+	jmp	L(mid)
+
+C	ALIGN(16)
+L(al3):	adc	(vp), %r8
+	adc	8(vp), %r9
+	adc	16(vp), %r10
+	adc	24(vp), %r11
+	mov	%r8, (rp)
+	lea	32(up), up
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	inc	n
+	mov	%r11, 24(rp)
+	lea	32(vp), vp
+	mov	(up), %r8
+	mov	8(up), %r9
+	lea	32(rp), rp
+L(mid):	mov	16(up), %r10
+	mov	24(up), %r11
+	jnz	L(al3)
+
+L(ae3):	adc	(vp), %r8
+	adc	8(vp), %r9
+	adc	16(vp), %r10
+	adc	24(vp), %r11
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%r11, 24(rp)
+
+L(ad3):	mov	R32(n), R32(%rax)	C zero
+	adc	R32(%rax), R32(%rax)
+
+L(ret):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/k8/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/k8/sqr_basecase.asm
new file mode 100644
index 0000000..60cf945
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/k8/sqr_basecase.asm
@@ -0,0 +1,807 @@
+dnl  AMD64 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
+C     code which uses addmul_2s from the start, conditionally leaving a 1x1
+C     multiply to the end.  (In assembly code, one would stop invoking
+C     addmul_2s loops when perhaps a 3x2 or, respectively, a 2x2 remains.)
+C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
+C     save/restore carry, instead it can propagate into the high product word.
+C   * Align more labels, should shave off a few cycles.
+C   * We can safely use 32-bit size operations, since operands with (2^32)
+C     limbs will lead to non-termination in practice.
+C   * The jump table could probably be optimized, at least for non-pic.
+C   * The special code for n <= 4 was quickly written.  It is probably too
+C     large and unnecessarily slow.
+C   * Consider combining small cases code so that the n=k-1 code jumps into the
+C     middle of the n=k code.
+C   * Avoid saving registers for small cases code.
+C   * Needed variables:
+C    n   r11  input size
+C    i   r8   work left, initially n
+C    j   r9   inner loop count
+C        r15  unused
+C    v0  r13
+C    v1  r14
+C    rp  rdi
+C    up  rsi
+C    w0  rbx
+C    w1  rcx
+C    w2  rbp
+C    w3  r10
+C    tp  r12
+C    lo  rax
+C    hi  rdx
+C        rsp
+
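+C Editor's sketch (not upstream text) of the overall scheme, in C: compute
+C the off-diagonal products u[i]*u[j] (i < j) once into a triangle, then let
+C sqr_diag_addlsh1 double them and add the diagonal squares.  Roughly:
+C
+C	tp[n - 1] = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+C	for (i = 1; i < n - 1; i++)
+C	  tp[n + i - 1] =
+C	    mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 1 - i, up[i]);
+C	mpn_sqr_diag_addlsh1 (rp, tp, up, n);	/* rp = 2*tp + u[i]^2 terms */
+C
+C (this file builds the triangle with mul_1/mul_2 and addmul_2 chains rather
+C than plain addmul_1, as per the notes above)
+C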
+C INPUT PARAMETERS
+define(`rp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`n_param', `%rdx')
+
+define(`n',	`%r11')
+define(`tp',	`%r12')
+define(`i',	`%r8')
+define(`j',	`%r9')
+define(`v0',	`%r13')
+define(`v1',	`%r14')
+define(`w0',	`%rbx')
+define(`w1',	`%rcx')
+define(`w2',	`%rbp')
+define(`w3',	`%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+	mov	R32(n_param), R32(%rcx)
+	mov	R32(n_param), R32(n)		C free original n register (rdx)
+
+	add	$-40, %rsp
+
+	and	$3, R32(%rcx)
+	cmp	$4, R32(n_param)
+	lea	4(%rcx), %r8
+
+	mov	%rbx, 32(%rsp)
+	mov	%rbp, 24(%rsp)
+	mov	%r12, 16(%rsp)
+	mov	%r13, 8(%rsp)
+	mov	%r14, (%rsp)
+
+	cmovg	%r8, %rcx
+
+	lea	L(tab)(%rip), %rax
+ifdef(`PIC',
+`	movslq	(%rax,%rcx,4), %r10
+	add	%r10, %rax
+	jmp	*%rax
+',`
+	jmp	*(%rax,%rcx,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(4), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(0m4), L(tab))
+	JMPENT(	L(1m4), L(tab))
+	JMPENT(	L(2m4), L(tab))
+	JMPENT(	L(3m4), L(tab))
+	TEXT
+
+L(1):	mov	(up), %rax
+	mul	%rax
+	add	$40, %rsp
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(2):	mov	(up), %rax
+	mov	%rax, %r8
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, %r9
+	mul	%rax
+	add	$40, %rsp
+	mov	%rax, %r10
+	mov	%r11, %rax
+	mov	%rdx, %r11
+	mul	%r8
+	xor	%r8, %r8
+	add	%rax, %r9
+	adc	%rdx, %r10
+	adc	%r8, %r11
+	add	%rax, %r9
+	mov	%r9, 8(rp)
+	adc	%rdx, %r10
+	mov	%r10, 16(rp)
+	adc	%r8, %r11
+	mov	%r11, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(3):	mov	(up), %rax
+	mov	%rax, %r10
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax
+	mov	16(up), %rcx
+	mov	%rax, 16(rp)
+	mov	%rcx, %rax
+	mov	%rdx, 24(rp)
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+
+	mov	%r11, %rax
+	mul	%r10
+	mov	%rax, %r8
+	mov	%rcx, %rax
+	mov	%rdx, %r9
+	mul	%r10
+	xor	%r10, %r10
+	add	%rax, %r9
+	mov	%r11, %rax
+	mov	%r10, %r11
+	adc	%rdx, %r10
+
+	mul	%rcx
+	add	$40, %rsp
+	add	%rax, %r10
+	adc	%r11, %rdx
+	add	%r8, %r8
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%rdx, %rdx
+	adc	%r11, %r11
+	add	%r8, 8(rp)
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%rdx, 32(rp)
+	adc	%r11, 40(rp)
+	FUNC_EXIT()
+	ret
+
+L(4):	mov	(up), %rax
+	mov	%rax, %r11
+	mul	%rax
+	mov	8(up), %rbx
+	mov	%rax, (rp)
+	mov	%rbx, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax
+	mov	%rax, 16(rp)
+	mov	%rdx, 24(rp)
+	mov	16(up), %rax
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+	mov	24(up), %rax
+	mul	%rax
+	mov	%rax, 48(rp)
+	mov	%rbx, %rax
+	mov	%rdx, 56(rp)
+
+	mul	%r11
+	add	$32, %rsp
+	mov	%rax, %r8
+	mov	%rdx, %r9
+	mov	16(up), %rax
+	mul	%r11
+	xor	%r10, %r10
+	add	%rax, %r9
+	adc	%rdx, %r10
+	mov	24(up), %rax
+	mul	%r11
+	xor	%r11, %r11
+	add	%rax, %r10
+	adc	%rdx, %r11
+	mov	16(up), %rax
+	mul	%rbx
+	xor	%rcx, %rcx
+	add	%rax, %r10
+	adc	%rdx, %r11
+	adc	$0, %rcx
+	mov	24(up), %rax
+	mul	%rbx
+	pop	%rbx
+	add	%rax, %r11
+	adc	%rdx, %rcx
+	mov	16(up), %rdx
+	mov	24(up), %rax
+	mul	%rdx
+	add	%rax, %rcx
+	adc	$0, %rdx
+
+	add	%r8, %r8
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%r11, %r11
+	adc	%rcx, %rcx
+	mov	$0, R32(%rax)
+	adc	%rdx, %rdx
+
+	adc	%rax, %rax
+	add	%r8, 8(rp)
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%r11, 32(rp)
+	adc	%rcx, 40(rp)
+	adc	%rdx, 48(rp)
+	adc	%rax, 56(rp)
+	FUNC_EXIT()
+	ret
+
+
+L(0m4):
+	lea	-16(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0
+	mov	8(up), %rax
+	lea	(up,n,8), up		C point up at end of input operand
+
+	lea	-4(n), i
+C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
+	xor	R32(j), R32(j)
+	sub	n, j
+
+	mul	v0
+	xor	R32(w2), R32(w2)
+	mov	%rax, w0
+	mov	16(up,j,8), %rax
+	mov	%rdx, w3
+	jmp	L(L3)
+
+	ALIGN(16)
+L(mul_1_m3_top):
+	add	%rax, w2
+	mov	w3, (tp,j,8)
+	mov	(up,j,8), %rax
+	adc	%rdx, w1
+	xor	R32(w0), R32(w0)
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	w2, 8(tp,j,8)
+	add	%rax, w1
+	adc	%rdx, w0
+	mov	8(up,j,8), %rax
+	mov	w1, 16(tp,j,8)
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,j,8), %rax
+	adc	%rdx, w3
+L(L3):	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	24(up,j,8), %rax
+	adc	%rdx, w2
+	mov	w0, 24(tp,j,8)
+	mul	v0
+	add	$4, j
+	js	L(mul_1_m3_top)
+
+	add	%rax, w2
+	mov	w3, (tp)
+	adc	%rdx, w1
+	mov	w2, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+	lea	-8(up), up
+	jmp	L(dowhile)
+
+
+L(1m4):
+	lea	8(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C u0
+	mov	8(up), %rax		C u1
+	lea	8(up,n,8), up		C point up at end of input operand
+
+	lea	-3(n), i
+C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
+	lea	-3(n), j
+	neg	j
+
+	mov	%rax, v1		C u1
+	mul	v0			C u0 * u1
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	%rax, 8(rp)
+	jmp	L(m0)
+
+	ALIGN(16)
+L(mul_2_m0_top):
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1			C v1 * u0
+	add	%rax, w1
+	mov	w0, -24(tp,j,8)
+	adc	%rdx, w2
+L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
+	mul	v0			C u0 * u2
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(tp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(tp,j,8)
+	adc	%rdx, w0
+L(m2x):	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	add	$4, j
+	mov	-32(up,j,8), %rax
+	mov	w3, -32(tp,j,8)
+	js	L(mul_2_m0_top)
+
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, -8(tp)
+	mov	w1, (tp)
+
+	lea	-16(up), up
+	lea	eval(3*8-24)(tp), tp	C tp += 3
+	jmp	L(dowhile_end)
+
+
+L(2m4):
+	lea	-16(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0
+	mov	8(up), %rax
+	lea	(up,n,8), up		C point up at end of input operand
+
+	lea	-4(n), i
+C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
+	lea	-2(n), j
+	neg	j
+
+	mul	v0
+	mov	%rax, w2
+	mov	(up,j,8), %rax
+	mov	%rdx, w1
+	jmp	L(L1)
+
+	ALIGN(16)
+L(mul_1_m1_top):
+	add	%rax, w2
+	mov	w3, (tp,j,8)
+	mov	(up,j,8), %rax
+	adc	%rdx, w1
+L(L1):	xor	R32(w0), R32(w0)
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	w2, 8(tp,j,8)
+	add	%rax, w1
+	adc	%rdx, w0
+	mov	8(up,j,8), %rax
+	mov	w1, 16(tp,j,8)
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,j,8), %rax
+	adc	%rdx, w3
+	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	24(up,j,8), %rax
+	adc	%rdx, w2
+	mov	w0, 24(tp,j,8)
+	mul	v0
+	add	$4, j
+	js	L(mul_1_m1_top)
+
+	add	%rax, w2
+	mov	w3, (tp)
+	adc	%rdx, w1
+	mov	w2, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+	lea	-8(up), up
+	jmp	L(dowhile_mid)
+
+
+L(3m4):
+	lea	8(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C u0
+	mov	8(up), %rax		C u1
+	lea	8(up,n,8), up		C point up at end of input operand
+
+	lea	-5(n), i
+C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
+	lea	-1(n), j
+	neg	j
+
+	mov	%rax, v1		C u1
+	mul	v0			C u0 * u1
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	%rax, 8(rp)
+	jmp	L(m2)
+
+	ALIGN(16)
+L(mul_2_m2_top):
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1			C v1 * u0
+	add	%rax, w1
+	mov	w0, -24(tp,j,8)
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(tp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(tp,j,8)
+	adc	%rdx, w0
+L(m2):	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	add	$4, j
+	mov	-32(up,j,8), %rax
+	mov	w3, -32(tp,j,8)
+	js	L(mul_2_m2_top)
+
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, -8(tp)
+	mov	w1, (tp)
+
+	lea	-16(up), up
+	jmp	L(dowhile_mid)
+
+L(dowhile):
+C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
+	lea	4(i), j
+	neg	j
+
+	mov	16(up,j,8), v0
+	mov	24(up,j,8), v1
+	mov	24(up,j,8), %rax
+	mul	v0
+	xor	R32(w3), R32(w3)
+	add	%rax, 24(tp,j,8)
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	jmp	L(am2)
+
+	ALIGN(16)
+L(addmul_2_m2_top):
+	add	w3, (tp,j,8)
+	adc	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1				C v1 * u0
+	add	w0, 8(tp,j,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,j,8), %rax
+	mov	$0, R32(w3)
+	mul	v0				C v0 * u1
+	add	%rax, w1
+	mov	16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1				C v1 * u1
+	add	w1, 16(tp,j,8)
+	adc	%rax, w2
+	mov	24(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,j,8), %rax
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 24(tp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+L(am2):	mov	32(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	32(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	$4, j
+	js	L(addmul_2_m2_top)
+
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+
+	add	$-2, R32(i)		C i -= 2
+
+L(dowhile_mid):
+C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
+	lea	2(i), j
+	neg	j
+
+	mov	(up,j,8), v0
+	mov	8(up,j,8), v1
+	mov	8(up,j,8), %rax
+	mul	v0
+	xor	R32(w1), R32(w1)
+	add	%rax, 8(tp,j,8)
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(20)
+
+	ALIGN(16)
+L(addmul_2_m0_top):
+	add	w3, (tp,j,8)
+	adc	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1				C v1 * u0
+	add	w0, 8(tp,j,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+L(20):	mov	16(up,j,8), %rax
+	mov	$0, R32(w3)
+	mul	v0				C v0 * u1
+	add	%rax, w1
+	mov	16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1				C v1 * u1
+	add	w1, 16(tp,j,8)
+	adc	%rax, w2
+	mov	24(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,j,8), %rax
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 24(tp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	32(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	32(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	$4, j
+	js	L(addmul_2_m0_top)
+
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+L(dowhile_end):
+
+	add	$-2, R32(i)		C i -= 2
+	jne	L(dowhile)
+
+C Function mpn_addmul_2s_2
+	mov	-16(up), v0
+	mov	-8(up), v1
+	mov	-8(up), %rax
+	mul	v0
+	xor	R32(w3), R32(w3)
+	add	%rax, -8(tp)
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	(up), %rax
+	mul	v0
+	add	%rax, w3
+	mov	(up), %rax
+	adc	%rdx, w0
+	mul	v1
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+C Function mpn_sqr_diag_addlsh1
+	lea	-4(n,n), j
+
+	mov	8(rp), %r11
+	lea	-8(up), up
+	lea	(rp,j,8), rp
+	neg	j
+	mov	(up,j,4), %rax
+	mul	%rax
+	test	$2, R8(j)
+	jnz	L(odd)
+
+L(evn):	add	%r11, %r11
+	sbb	R32(%rbx), R32(%rbx)		C save CF
+	add	%rdx, %r11
+	mov	%rax, (rp,j,8)
+	jmp	L(d0)
+
+L(odd):	add	%r11, %r11
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	add	%rdx, %r11
+	mov	%rax, (rp,j,8)
+	lea	-2(j), j
+	jmp	L(d1)
+
+	ALIGN(16)
+L(top):	mov	(up,j,4), %rax
+	mul	%rax
+	add	R32(%rbp), R32(%rbp)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, (rp,j,8)
+L(d0):	mov	%r11, 8(rp,j,8)
+	mov	16(rp,j,8), %r10
+	adc	%r10, %r10
+	mov	24(rp,j,8), %r11
+	adc	%r11, %r11
+	nop
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	mov	8(up,j,4), %rax
+	mul	%rax
+	add	R32(%rbx), R32(%rbx)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, 16(rp,j,8)
+L(d1):	mov	%r11, 24(rp,j,8)
+	mov	32(rp,j,8), %r10
+	adc	%r10, %r10
+	mov	40(rp,j,8), %r11
+	adc	%r11, %r11
+	sbb	R32(%rbx), R32(%rbx)		C save CF
+	add	$4, j
+	js	L(top)
+
+	mov	(up), %rax
+	mul	%rax
+	add	R32(%rbp), R32(%rbp)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	mov	16(rp), %r10
+	adc	%r10, %r10
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	neg	R32(%rbp)
+	mov	8(up), %rax
+	mul	%rax
+	add	R32(%rbx), R32(%rbx)		C restore carry
+	adc	%rax, %r10
+	adc	%rbp, %rdx
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/logops_n.asm b/third_party/gmp/mpn/x86_64/logops_n.asm
new file mode 100644
index 0000000..e25854d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/logops_n.asm
@@ -0,0 +1,260 @@
+dnl  AMD64 logops.
+
+dnl  Copyright 2004-2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l	c/l	c/l	good
+C	       var-1   var-2   var-3  for cpu?
+C AMD K8,K9	 1.5	 1.5	 1.5	 y
+C AMD K10	 1.5	 1.5	 1.5	 y
+C AMD bd1
+C AMD bd2
+C AMD bd3
+C AMD bd4
+C AMD bt1	 2.67	~2.79	~2.67
+C AMD bt2	 2.0	 2.28	 2.28	 y
+C AMD zen	 1.5	 1.5	 1.5	 =
+C Intel P4	 2.8	 3.35	 3.6
+C Intel PNR	 2.0	 2.0	 2.0	 =
+C Intel NHM	 2.0	 2.0	 2.0	 =
+C Intel SBR	 1.5	 1.75	 1.75	 n
+C Intel IBR	 1.48	 1.71	 1.72	 n
+C Intel HWL	 1.5	 1.5	 1.5	 n
+C Intel BWL	 1.5	 1.5	 1.5	 n
+C Intel SKL	 1.5	 1.5	 1.5	 n
+C Intel atom	 3.82	 3.82	 3.82	 n
+C Intel SLM	 3.0	 3.0	 3.0	 =
+C VIA nano	 3.25
+
+ifdef(`OPERATION_and_n',`
+  define(`func',`mpn_and_n')
+  define(`VARIANT_1')
+  define(`LOGOP',`and')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',`mpn_andn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`and')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',`mpn_nand_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`and')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',`mpn_ior_n')
+  define(`VARIANT_1')
+  define(`LOGOP',`or')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',`mpn_iorn_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`or')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',`mpn_nior_n')
+  define(`VARIANT_3')
+  define(`LOGOP',`or')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',`mpn_xor_n')
+  define(`VARIANT_1')
+  define(`LOGOP',`xor')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',`mpn_xnor_n')
+  define(`VARIANT_2')
+  define(`LOGOP',`xor')')
+
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
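+C Editor's note: in C terms the three variants compute, per limb,
+C
+C	VARIANT_1:  rp[i] =   up[i] LOGOP  vp[i];    C and_n, ior_n, xor_n
+C	VARIANT_2:  rp[i] =   up[i] LOGOP ~vp[i];    C andn_n, iorn_n, xnor_n
+C	VARIANT_3:  rp[i] = ~(up[i] LOGOP  vp[i]);   C nand_n, nior_n
+C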
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n',`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+
+ifdef(`VARIANT_1',`
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(vp), %r8
+	mov	R32(%rcx), R32(%rax)
+	lea	(vp,n,8), vp
+	lea	(up,n,8), up
+	lea	(rp,n,8), rp
+	neg	n
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	LOGOP	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	dec	n
+	jmp	L(e11)
+L(b10):	add	$-2, n
+	jmp	L(e10)
+L(b01):	LOGOP	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	inc	n
+	jz	L(ret)
+
+L(top):	mov	(vp,n,8), %r8
+L(b00):	mov	8(vp,n,8), %r9
+	LOGOP	(up,n,8), %r8
+	LOGOP	8(up,n,8), %r9
+	nop				C K8/K9/K10 concession
+	mov	%r8, (rp,n,8)
+	mov	%r9, 8(rp,n,8)
+L(e11):	mov	16(vp,n,8), %r8
+L(e10):	mov	24(vp,n,8), %r9
+	LOGOP	16(up,n,8), %r8
+	LOGOP	24(up,n,8), %r9
+	mov	%r8, 16(rp,n,8)
+	mov	%r9, 24(rp,n,8)
+	add	$4, n
+	jnc	L(top)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_2',`
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(vp), %r8
+	not	%r8
+	mov	R32(%rcx), R32(%rax)
+	lea	(vp,n,8), vp
+	lea	(up,n,8), up
+	lea	(rp,n,8), rp
+	neg	n
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	LOGOP	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	dec	n
+	jmp	L(e11)
+L(b10):	add	$-2, n
+	jmp	L(e10)
+	.byte	0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
+L(b01):	LOGOP	(up,n,8), %r8
+	mov	%r8, (rp,n,8)
+	inc	n
+	jz	L(ret)
+
+L(top):	mov	(vp,n,8), %r8
+	not	%r8
+L(b00):	mov	8(vp,n,8), %r9
+	not	%r9
+	LOGOP	(up,n,8), %r8
+	LOGOP	8(up,n,8), %r9
+	mov	%r8, (rp,n,8)
+	mov	%r9, 8(rp,n,8)
+L(e11):	mov	16(vp,n,8), %r8
+	not	%r8
+L(e10):	mov	24(vp,n,8), %r9
+	not	%r9
+	LOGOP	16(up,n,8), %r8
+	LOGOP	24(up,n,8), %r9
+	mov	%r8, 16(rp,n,8)
+	mov	%r9, 24(rp,n,8)
+	add	$4, n
+	jnc	L(top)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
+')
+
+ifdef(`VARIANT_3',`
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(vp), %r8
+	mov	R32(%rcx), R32(%rax)
+	lea	(vp,n,8), vp
+	lea	(up,n,8), up
+	lea	(rp,n,8), rp
+	neg	n
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	LOGOP	(up,n,8), %r8
+	not	%r8
+	mov	%r8, (rp,n,8)
+	dec	n
+	jmp	L(e11)
+L(b10):	add	$-2, n
+	jmp	L(e10)
+	.byte	0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
+L(b01):	LOGOP	(up,n,8), %r8
+	not	%r8
+	mov	%r8, (rp,n,8)
+	inc	n
+	jz	L(ret)
+
+L(top):	mov	(vp,n,8), %r8
+L(b00):	mov	8(vp,n,8), %r9
+	LOGOP	(up,n,8), %r8
+	not	%r8
+	LOGOP	8(up,n,8), %r9
+	not	%r9
+	mov	%r8, (rp,n,8)
+	mov	%r9, 8(rp,n,8)
+L(e11):	mov	16(vp,n,8), %r8
+L(e10):	mov	24(vp,n,8), %r9
+	LOGOP	16(up,n,8), %r8
+	not	%r8
+	LOGOP	24(up,n,8), %r9
+	not	%r9
+	mov	%r8, 16(rp,n,8)
+	mov	%r9, 24(rp,n,8)
+	add	$4, n
+	jnc	L(top)
+
+L(ret):	FUNC_EXIT()
+	ret
+EPILOGUE()
+')
diff --git a/third_party/gmp/mpn/x86_64/lshift.asm b/third_party/gmp/mpn/x86_64/lshift.asm
new file mode 100644
index 0000000..fff3152
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/lshift.asm
@@ -0,0 +1,172 @@
+dnl  AMD64 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2003, 2005, 2007, 2009, 2011, 2012, 2018 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb   cycles/limb cnt=1
+C AMD K8,K9	 2.375		 1.375
+C AMD K10	 2.375		 1.375
+C Intel P4	 8		10.5
+C Intel core2	 2.11		 4.28
+C Intel corei	 ?		 ?
+C Intel atom	 5.75		 3.5
+C VIA nano	 3.5		 2.25
+
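+C Editor's reference sketch (not upstream text) of the function's contract,
+C assuming 64-bit limbs and 0 < cnt < 64; the return value is the bits
+C shifted out of the top limb:
+C
+C	mp_limb_t
+C	mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
+C	{
+C	  mp_limb_t retval = up[n - 1] >> (64 - cnt);
+C	  for (mp_size_t i = n - 1; i > 0; i--)
+C	    rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
+C	  rp[0] = up[0] << cnt;
+C	  return retval;
+C	}
+C
+C The asm below walks high-to-low, so overlapping rp >= up is safe.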
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_lshift)
+	FUNC_ENTRY(4)
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-8(up,n,8), %rax
+	shr	R8(%rcx), %rax		C function return value
+
+	neg	R32(%rcx)		C put lsh count in cl
+	lea	1(n), R32(%r8)
+	and	$3, R32(%r8)
+	je	L(rlx)			C jump for n = 3, 7, 11, ...
+
+	dec	R32(%r8)
+	jne	L(1)
+C	n = 4, 8, 12, ...
+	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-16(up,n,8), %r8
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	%r10, -8(rp,n,8)
+	dec	n
+	jmp	L(rll)
+
+L(1):	dec	R32(%r8)
+	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
+C	n = 2, 6, 10, 14, ...
+	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-16(up,n,8), %r8
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	%r10, -8(rp,n,8)
+	dec	n
+	neg	R32(%rcx)		C put lsh count in cl
+L(1x):
+	cmp	$1, n
+	je	L(ast)
+	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	mov	-16(up,n,8), %r11
+	shl	R8(%rcx), %r11
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-16(up,n,8), %r8
+	mov	-24(up,n,8), %r9
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	shr	R8(%rcx), %r9
+	or	%r9, %r11
+	mov	%r10, -8(rp,n,8)
+	mov	%r11, -16(rp,n,8)
+	sub	$2, n
+
+L(rll):	neg	R32(%rcx)		C put lsh count in cl
+L(rlx):	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	mov	-16(up,n,8), %r11
+	shl	R8(%rcx), %r11
+
+	sub	$4, n			C				      4
+	jb	L(end)			C				      2
+	ALIGN(16)
+L(top):
+	C finish stuff from lsh block
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	16(up,n,8), %r8
+	mov	8(up,n,8), %r9
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	shr	R8(%rcx), %r9
+	or	%r9, %r11
+	mov	%r10, 24(rp,n,8)
+	mov	%r11, 16(rp,n,8)
+	C start two new rsh
+	mov	0(up,n,8), %r8
+	mov	-8(up,n,8), %r9
+	shr	R8(%rcx), %r8
+	shr	R8(%rcx), %r9
+
+	C finish stuff from rsh block
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	8(up,n,8), %r10
+	mov	0(up,n,8), %r11
+	shl	R8(%rcx), %r10
+	or	%r10, %r8
+	shl	R8(%rcx), %r11
+	or	%r11, %r9
+	mov	%r8, 8(rp,n,8)
+	mov	%r9, 0(rp,n,8)
+	C start two new lsh
+	mov	-8(up,n,8), %r10
+	mov	-16(up,n,8), %r11
+	shl	R8(%rcx), %r10
+	shl	R8(%rcx), %r11
+
+	sub	$4, n
+	jae	L(top)			C				      2
+L(end):
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	8(up), %r8
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	(up), %r9
+	shr	R8(%rcx), %r9
+	or	%r9, %r11
+	mov	%r10, 16(rp)
+	mov	%r11, 8(rp)
+
+	neg	R32(%rcx)		C put lsh count in cl
+L(ast):	mov	(up), %r10
+	shl	R8(%rcx), %r10
+	mov	%r10, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/lshiftc.asm b/third_party/gmp/mpn/x86_64/lshiftc.asm
new file mode 100644
index 0000000..c4ba04a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/lshiftc.asm
@@ -0,0 +1,182 @@
+dnl  AMD64 mpn_lshiftc -- mpn left shift with complement.
+
+dnl  Copyright 2003, 2005, 2006, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 2.75
+C AMD K10	 2.75
+C Intel P4	 ?
+C Intel core2	 ?
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 3.75
+
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_lshiftc)
+	FUNC_ENTRY(4)
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-8(up,n,8), %rax
+	shr	R8(%rcx), %rax		C function return value
+
+	neg	R32(%rcx)		C put lsh count in cl
+	lea	1(n), R32(%r8)
+	and	$3, R32(%r8)
+	je	L(rlx)			C jump for n = 3, 7, 11, ...
+
+	dec	R32(%r8)
+	jne	L(1)
+C	n = 4, 8, 12, ...
+	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-16(up,n,8), %r8
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	not	%r10
+	mov	%r10, -8(rp,n,8)
+	dec	n
+	jmp	L(rll)
+
+L(1):	dec	R32(%r8)
+	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
+C	n = 2, 6, 10, 14, ...
+	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-16(up,n,8), %r8
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	not	%r10
+	mov	%r10, -8(rp,n,8)
+	dec	n
+	neg	R32(%rcx)		C put lsh count in cl
+L(1x):
+	cmp	$1, n
+	je	L(ast)
+	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	mov	-16(up,n,8), %r11
+	shl	R8(%rcx), %r11
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-16(up,n,8), %r8
+	mov	-24(up,n,8), %r9
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	shr	R8(%rcx), %r9
+	or	%r9, %r11
+	not	%r10
+	not	%r11
+	mov	%r10, -8(rp,n,8)
+	mov	%r11, -16(rp,n,8)
+	sub	$2, n
+
+L(rll):	neg	R32(%rcx)		C put lsh count in cl
+L(rlx):	mov	-8(up,n,8), %r10
+	shl	R8(%rcx), %r10
+	mov	-16(up,n,8), %r11
+	shl	R8(%rcx), %r11
+
+	sub	$4, n			C				      4
+	jb	L(end)			C				      2
+	ALIGN(16)
+L(top):
+	C finish stuff from lsh block
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	16(up,n,8), %r8
+	mov	8(up,n,8), %r9
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	shr	R8(%rcx), %r9
+	or	%r9, %r11
+	not	%r10
+	not	%r11
+	mov	%r10, 24(rp,n,8)
+	mov	%r11, 16(rp,n,8)
+	C start two new rsh
+	mov	0(up,n,8), %r8
+	mov	-8(up,n,8), %r9
+	shr	R8(%rcx), %r8
+	shr	R8(%rcx), %r9
+
+	C finish stuff from rsh block
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	8(up,n,8), %r10
+	mov	0(up,n,8), %r11
+	shl	R8(%rcx), %r10
+	or	%r10, %r8
+	shl	R8(%rcx), %r11
+	or	%r11, %r9
+	not	%r8
+	not	%r9
+	mov	%r8, 8(rp,n,8)
+	mov	%r9, 0(rp,n,8)
+	C start two new lsh
+	mov	-8(up,n,8), %r10
+	mov	-16(up,n,8), %r11
+	shl	R8(%rcx), %r10
+	shl	R8(%rcx), %r11
+
+	sub	$4, n
+	jae	L(top)			C				      2
+L(end):
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	8(up), %r8
+	shr	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	(up), %r9
+	shr	R8(%rcx), %r9
+	or	%r9, %r11
+	not	%r10
+	not	%r11
+	mov	%r10, 16(rp)
+	mov	%r11, 8(rp)
+
+	neg	R32(%rcx)		C put lsh count in cl
+L(ast):	mov	(up), %r10
+	shl	R8(%rcx), %r10
+	not	%r10
+	mov	%r10, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/lshsub_n.asm b/third_party/gmp/mpn/x86_64/lshsub_n.asm
new file mode 100644
index 0000000..4d428c0
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/lshsub_n.asm
@@ -0,0 +1,172 @@
+dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
+
+dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
+C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
+C Intel P4	16.5
+C Intel core2	 4.35
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C This was written quickly and not optimized at all, but it runs very well on
+C K8.  But perhaps one could get under 3 c/l.  Ideas:
+C   1) Use indexing to save the 3 LEA
+C   2) Write reasonable feed-in code
+C   3) Be more clever about register usage
+C   4) Unroll more, handling CL negation, carry save/restore cost much now
+C   5) Reschedule
+
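+C Editor's note: semantically this is mpn_sub_n followed by mpn_lshift in a
+C single pass (hence the c/l comparison above); as far as can be read from
+C the code, the return value folds the subtraction borrow into the
+C out-shifted high bits, roughly:
+C
+C	cy   = mpn_sub_n (rp, up, vp, n);	C rp = U - V mod B^n
+C	high = mpn_lshift (rp, rp, n, cnt);	C rp = 2^cnt * rp mod B^n
+C	return high + (cy << cnt);
+C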
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnt',	`%r8')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_lshsub_n)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	push	%rbx
+
+	mov	n, %rax
+	xor	R32(%rbx), R32(%rbx)	C clear carry save register
+	mov	R32(%r8), R32(%rcx)	C shift count
+	xor	R32(%r15), R32(%r15)	C limb carry
+
+	mov	R32(%rax), R32(%r11)
+	and	$3, R32(%r11)
+	je	L(4)
+	sub	$1, R32(%r11)
+
+L(oopette):
+	add	R32(%rbx), R32(%rbx)	C restore carry flag
+	mov	0(up), %r8
+	lea	8(up), up
+	sbb	0(vp), %r8
+	mov	%r8, %r12
+	sbb	R32(%rbx), R32(%rbx)	C save carry flag
+	shl	R8(%rcx), %r8
+	or	%r15, %r8
+	mov	%r12, %r15
+	lea	8(vp), vp
+	neg	R8(%rcx)
+	shr	R8(%rcx), %r15
+	neg	R8(%rcx)
+	mov	%r8, 0(rp)
+	lea	8(rp), rp
+	sub	$1, R32(%r11)
+	jnc	L(oopette)
+
+L(4):
+	sub	$4, %rax
+	jc	L(end)
+
+	ALIGN(16)
+L(oop):
+	add	R32(%rbx), R32(%rbx)	C restore carry flag
+
+	mov	0(up), %r8
+	mov	8(up), %r9
+	mov	16(up), %r10
+	mov	24(up), %r11
+
+	lea	32(up), up
+
+	sbb	0(vp), %r8
+	mov	%r8, %r12
+	sbb	8(vp), %r9
+	mov	%r9, %r13
+	sbb	16(vp), %r10
+	mov	%r10, %r14
+	sbb	24(vp), %r11
+
+	sbb	R32(%rbx), R32(%rbx)	C save carry flag
+
+	shl	R8(%rcx), %r8
+	shl	R8(%rcx), %r9
+	shl	R8(%rcx), %r10
+	or	%r15, %r8
+	mov	%r11, %r15
+	shl	R8(%rcx), %r11
+
+	lea	32(vp), vp
+
+	neg	R8(%rcx)
+
+	shr	R8(%rcx), %r12
+	shr	R8(%rcx), %r13
+	shr	R8(%rcx), %r14
+	shr	R8(%rcx), %r15		C used next loop
+
+	or	%r12, %r9
+	or	%r13, %r10
+	or	%r14, %r11
+
+	neg	R8(%rcx)
+
+	mov	%r8, 0(rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%r11, 24(rp)
+
+	lea	32(rp), rp
+
+	sub	$4, %rax
+	jnc	L(oop)
+L(end):
+	neg	R32(%rbx)
+	shl	R8(%rcx), %rbx
+	adc	%r15, %rbx
+	mov	%rbx, %rax
+	pop	%rbx
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/missing-call.m4 b/third_party/gmp/mpn/x86_64/missing-call.m4
new file mode 100644
index 0000000..c024f0e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/missing-call.m4
@@ -0,0 +1,53 @@
+dnl  AMD64 MULX/ADX simulation support, function call version.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+define(`adox',`
+	push	$1
+	push	$2
+	call	__gmp_adox
+	pop	$2
+')
+
+define(`adcx',`
+	push	$1
+	push	$2
+	call	__gmp_adcx
+	pop	$2
+')
+
+define(`mulx',`
+	push	$1
+	call	__gmp_mulx
+	pop	$2
+	pop	$3
+')
diff --git a/third_party/gmp/mpn/x86_64/missing-inline.m4 b/third_party/gmp/mpn/x86_64/missing-inline.m4
new file mode 100644
index 0000000..bd1df13
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/missing-inline.m4
@@ -0,0 +1,100 @@
+dnl  AMD64 MULX/ADX simulation support, inline version.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
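+dnl  Editor's note: ADOX adds with the overflow flag OF (EFLAGS bit 11) as
+dnl  carry-in/out, while plain adc uses CF (bit 0).  The adox macro below
+dnl  therefore copies OF down into CF on the saved flags image, performs an
+dnl  ordinary adc, then copies the resulting CF back up into OF, leaving the
+dnl  remaining flags intact.
+dnl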
+define(`adox',`
+	push	$2
+	push	%rcx
+	push	%rbx
+	push	%rax
+	mov	$1, %rcx
+	pushfq
+	pushfq
+C copy 0(%rsp):11 to 0(%rsp):0
+	mov	(%rsp), %rbx
+	shr	%rbx
+	bt	$`'10, %rbx
+	adc	%rbx, %rbx
+	mov	%rbx, (%rsp)
+C put manipulated flags into eflags, execute a plain adc
+	popfq
+	adc	%rcx, 32(%rsp)
+C copy CF to 0(%rsp):11
+	mov	(%rsp), %rbx
+	sbb	R32(%rax), R32(%rax)
+	and	$`'0x800, R32(%rax)
+	and	$`'0xfffffffffffff7ff, %rbx
+	or	%rax, %rbx
+	mov	%rbx, (%rsp)
+C put manipulated flags into eflags
+	popfq
+	pop	%rax
+	pop	%rbx
+	pop	%rcx
+	pop	$2
+')
+
+define(`adcx',`
+	push	$2
+	push	%rcx
+	push	%rbx
+	push	%rax
+	mov	$1, %rcx
+	pushfq
+	adc	%rcx, 32(%rsp)
+	mov	(%rsp), %rbx
+	sbb	R32(%rax), R32(%rax)
+	and	$`'0xfffffffffffffffe, %rbx
+	sub	%rax, %rbx
+	mov	%rbx, (%rsp)
+	popfq
+	pop	%rax
+	pop	%rbx
+	pop	%rcx
+	pop	$2
+')
+
+define(`mulx',`
+	lea	-16(%rsp), %rsp
+	push	%rax
+	push	%rdx
+	pushfq			C preserve all flags
+	mov	$1, %rax
+	mul	%rdx
+	mov	%rax, 24(%rsp)
+	mov	%rdx, 32(%rsp)
+	popfq			C restore eflags
+	pop	%rdx
+	pop	%rax
+	pop	$2
+	pop	$3
+')
diff --git a/third_party/gmp/mpn/x86_64/missing.asm b/third_party/gmp/mpn/x86_64/missing.asm
new file mode 100644
index 0000000..9b65c89
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/missing.asm
@@ -0,0 +1,130 @@
+dnl  AMD64 MULX/ADX simulation support.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ASM_START()
+
+C Fake the MULX instruction
+C
+C Accept the single explicit parameter on the stack, return the two result
+C words on the stack.  This calling convention means that we need to move the
+C return address up.
+C
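+C Editor's note: the real MULX instruction computes the full 64x64 -> 128-bit
+C unsigned product of the explicit source and %rdx without touching any
+C flags; in C terms, hi:lo = (unsigned __int128) src * rdx.  Here that is
+C emulated with a plain mul, saving and restoring eflags around it.
+C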
+PROLOGUE(__gmp_mulx)
+	lea	-8(%rsp), %rsp
+	push	%rax
+	push	%rdx
+	pushfq				C preserve all flags
+	mov	32(%rsp), %rax		C move retaddr...
+	mov	%rax, 24(%rsp)		C ...up the stack
+	mov	40(%rsp), %rax		C input parameter
+	mul	%rdx
+	mov	%rax, 32(%rsp)
+	mov	%rdx, 40(%rsp)
+	popfq				C restore eflags
+	pop	%rdx
+	pop	%rax
+	ret
+EPILOGUE()
+PROTECT(__gmp_mulx)
+
+
+C Fake the ADOX instruction
+C
+C Accept the two parameters on the stack, return the result word on the stack.
+C This calling convention means that we need to move the return address down.
+C
+PROLOGUE(__gmp_adox)
+	push	%rcx
+	push	%rbx
+	push	%rax
+	mov	32(%rsp), %rcx		C src2
+	mov	24(%rsp), %rax		C move retaddr...
+	mov	%rax, 32(%rsp)		C ...down the stack
+	pushfq
+C copy 0(%rsp):11 to 0(%rsp):0
+	mov	(%rsp), %rbx
+	shr	%rbx
+	bt	$10, %rbx
+	adc	%rbx, %rbx
+	push	%rbx
+C put manipulated flags into eflags, execute a plain adc
+	popfq
+	adc	%rcx, 48(%rsp)
+C copy CF to 0(%rsp):11
+	pop	%rbx
+	sbb	R32(%rax), R32(%rax)
+	and	$0x800, R32(%rax)
+	and	$0xfffffffffffff7ff, %rbx
+	or	%rax, %rbx
+	push	%rbx
+C put manipulated flags into eflags
+	popfq
+	pop	%rax
+	pop	%rbx
+	pop	%rcx
+	lea	8(%rsp), %rsp
+	ret
+EPILOGUE()
+PROTECT(__gmp_adox)
+
+
+C Fake the ADCX instruction
+C
+C Accept the two parameters on the stack, return the result word on the stack.
+C This calling convention means that we need to move the return address down.
+C
+PROLOGUE(__gmp_adcx)
+	push	%rcx
+	push	%rbx
+	push	%rax
+	mov	32(%rsp), %rcx		C src2
+	mov	24(%rsp), %rax		C move retaddr...
+	mov	%rax, 32(%rsp)		C ...down the stack
+	pushfq
+	adc	%rcx, 48(%rsp)
+	pop	%rbx
+	sbb	R32(%rax), R32(%rax)
+	and	$`'0xfffffffffffffffe, %rbx
+	sub	%rax, %rbx
+	push	%rbx
+	popfq
+	pop	%rax
+	pop	%rbx
+	pop	%rcx
+	lea	8(%rsp), %rsp
+	ret
+EPILOGUE()
+PROTECT(__gmp_adcx)
diff --git a/third_party/gmp/mpn/x86_64/mod_1_1.asm b/third_party/gmp/mpn/x86_64/mod_1_1.asm
new file mode 100644
index 0000000..255305f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mod_1_1.asm
@@ -0,0 +1,238 @@
+dnl  AMD64 mpn_mod_1_1p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
+
+dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 6
+C AMD K10	 6
+C Intel P4	26
+C Intel core2	12.5
+C Intel NHM	11.3
+C Intel SBR	 8.4	(slowdown, old code took 8.0)
+C Intel atom	26
+C VIA nano	13
+
+define(`B2mb',   `%r10')
+define(`B2modb', `%r11')
+define(`ap',     `%rdi')
+define(`n',      `%rsi')
+define(`pre',    `%r8')
+define(`b',      `%rbx')
+
+define(`r0',     `%rbp') C r1 kept in %rax
+define(`r2',	 `%rcx')  C kept negated. Also used as shift count
+define(`t0',     `%r9')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
+C                       %rdi         %rsi         %rdx                %rcx
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
+C The iteration is almost as follows,
+C
+C   r_2 B^3 + r_1 B^2 + r_0 B + u = r_1 B2modb + (r_0 + r_2 B2modb) B + u
+C
+C where r2 is a single bit represented as a mask. But to make sure that the
+C result fits in two limbs and a bit, carry from the addition
+C
+C   r_0 + r_2 B2modb
+C
+C is handled specially. On carry, we subtract b to cancel the carry, and
+C instead use the value
+C
+C   r_0 + B2mb (mod B)
+C
+C This addition can be issued early since it doesn't depend on r2, and it is
+C the source of the cmov in the loop.
+C
+C We have the invariant that r_2 B^2 + r_1 B + r_0 < B^2 + B b
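
A minimal C sketch of one iteration of the loop described above, using the
names from this comment (B2mb = B2modb - b) and assuming 64-bit limbs; it
models the asm below and is not the shipped generic code.

#include <stdint.h>

/* r[0]=r_0, r[1]=r_1, r[2]=the r_2 mask (0 or ~0); u is the next limb */
static void
mod_1_1_step (uint64_t r[3], uint64_t u, uint64_t B2modb, uint64_t B2mb)
{
  unsigned __int128 p = (unsigned __int128) r[1] * B2modb;
  uint64_t s = r[0] + (r[2] & B2modb);
  if (s < r[0])                   /* carry: use r_0 + B2modb - b instead */
    s = r[0] + B2mb;
  uint64_t lo = (uint64_t) p, hi = (uint64_t) (p >> 64);
  r[0] = u + lo;
  unsigned __int128 t = (unsigned __int128) s + hi + (r[0] < lo);
  r[1] = (uint64_t) t;
  r[2] = -(uint64_t) (t >> 64);   /* new single-bit mask */
}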
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+	FUNC_ENTRY(4)
+	push	%rbp
+	push	%rbx
+	mov	%rdx, b
+	mov	%rcx, pre
+
+	mov	-8(ap, n, 8), %rax
+	cmp	$3, n
+	jnc	L(first)
+	mov	-16(ap, n, 8), r0
+	jmp	L(reduce_two)
+
+L(first):
+	C First iteration, no r2
+	mov	24(pre), B2modb
+	mul	B2modb
+	mov	-24(ap, n, 8), r0
+	add	%rax, r0
+	mov	-16(ap, n, 8), %rax
+	adc	%rdx, %rax
+	sbb	r2, r2
+	sub	$4, n
+	jc	L(reduce_three)
+
+	mov	B2modb, B2mb
+	sub	b, B2mb
+
+	ALIGN(16)
+L(top):	and	B2modb, r2
+	lea	(B2mb, r0), t0
+	mul	B2modb
+	add	r0, r2
+	mov	(ap, n, 8), r0
+	cmovc	t0, r2
+	add	%rax, r0
+	mov	r2, %rax
+	adc	%rdx, %rax
+	sbb	r2, r2
+	sub	$1, n
+	jnc	L(top)
+
+L(reduce_three):
+	C Eliminate r2
+	and	b, r2
+	sub	r2, %rax
+
+L(reduce_two):
+	mov	8(pre), R32(%rcx)
+	test	R32(%rcx), R32(%rcx)
+	jz	L(normalized)
+
+	C Unnormalized, use B1modb to reduce to size < B (b+1)
+	mulq	16(pre)
+	xor	t0, t0
+	add	%rax, r0
+	adc	%rdx, t0
+	mov	t0, %rax
+
+	C Left-shift to normalize
+ifdef(`SHLD_SLOW',`
+	shl	R8(%rcx), %rax
+	mov	r0, t0
+	neg	R32(%rcx)
+	shr	R8(%rcx), t0
+	or	t0, %rax
+	neg	R32(%rcx)
+',`
+	shld	R8(%rcx), r0, %rax
+')
+	shl	R8(%rcx), r0
+	jmp	L(udiv)
+
+L(normalized):
+	mov	%rax, t0
+	sub	b, t0
+	cmovnc	t0, %rax
+
+L(udiv):
+	lea	1(%rax), t0
+	mulq	(pre)
+	add	r0, %rax
+	adc	t0, %rdx
+	imul	b, %rdx
+	sub	%rdx, r0
+	cmp	r0, %rax
+	lea	(b, r0), %rax
+	cmovnc	r0, %rax
+	cmp	b, %rax
+	jnc	L(fix)
+L(ok):	shr	R8(%rcx), %rax
+
+	pop	%rbx
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+L(fix):	sub	b, %rax
+	jmp	L(ok)
+EPILOGUE()
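
The L(udiv) tail above is the classical reciprocal-based division step.  A
hedged C rendering, assuming b is normalized (high bit set) and
bi = mpn_invert_limb(b), i.e. floor((B^2-1)/b) - B with B = 2^64:

#include <stdint.h>

/* remainder of u1*B + u0 divided by b, using the precomputed bi */
static uint64_t
udiv_rnnd_preinv_model (uint64_t u1, uint64_t u0, uint64_t b, uint64_t bi)
{
  unsigned __int128 p = (unsigned __int128) u1 * bi;
  uint64_t ql = (uint64_t) p + u0;
  uint64_t qh = (uint64_t) (p >> 64) + u1 + 1 + (ql < u0);
  uint64_t r  = u0 - qh * b;    /* candidate remainder, mod B */
  if (r > ql)                   /* quotient estimate was one too large */
    r += b;
  if (r >= b)                   /* at most one final correction */
    r -= b;
  return r;
}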
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1_1p_cps)
+	FUNC_ENTRY(2)
+	push	%rbp
+	bsr	%rsi, %rcx
+	push	%rbx
+	mov	%rdi, %rbx
+	push	%r12
+	xor	$63, R32(%rcx)
+	mov	%rsi, %r12
+	mov	R32(%rcx), R32(%rbp)
+	sal	R8(%rcx), %r12
+IFSTD(`	mov	%r12, %rdi	')	C pass parameter
+IFDOS(`	mov	%r12, %rcx	')	C pass parameter
+IFDOS(`	sub	$32, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFDOS(`	add	$32, %rsp	')
+	neg	%r12
+	mov	%r12, %r8
+	mov	%rax, (%rbx)		C store bi
+	mov	%rbp, 8(%rbx)		C store cnt
+	imul	%rax, %r12
+	mov	%r12, 24(%rbx)		C store B2modb
+	mov	R32(%rbp), R32(%rcx)
+	test	R32(%rcx), R32(%rcx)
+	jz	L(z)
+
+	mov	$1, R32(%rdx)
+ifdef(`SHLD_SLOW',`
+	C Destroys %rax, unlike shld. Otherwise, we could do B1modb
+	C before B2modb, and get rid of the move %r12, %r8 above.
+
+	shl	R8(%rcx), %rdx
+	neg	R32(%rcx)
+	shr	R8(%rcx), %rax
+	or	%rax, %rdx
+	neg	R32(%rcx)
+',`
+	shld	R8(%rcx), %rax, %rdx
+')
+	imul	%rdx, %r8
+	shr	R8(%rcx), %r8
+	mov	%r8, 16(%rbx)		C store B1modb
+L(z):
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
diff --git a/third_party/gmp/mpn/x86_64/mod_1_2.asm b/third_party/gmp/mpn/x86_64/mod_1_2.asm
new file mode 100644
index 0000000..40fcaeb
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mod_1_2.asm
@@ -0,0 +1,241 @@
+dnl  AMD64 mpn_mod_1s_2p
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 4
+C AMD K10	 4
+C Intel P4	19
+C Intel core2	 8
+C Intel NHM	 6.5
+C Intel SBR	 4.5
+C Intel atom	28
+C VIA nano	 8
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_2p)
+	FUNC_ENTRY(4)
+	push	%r14
+	test	$1, R8(%rsi)
+	mov	%rdx, %r14
+	push	%r13
+	mov	%rcx, %r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+	mov	16(%rcx), %r10
+	mov	24(%rcx), %rbx
+	mov	32(%rcx), %rbp
+	je	L(b0)
+	dec	%rsi
+	je	L(one)
+	mov	-8(%rdi,%rsi,8), %rax
+	mul	%r10
+	mov	%rax, %r9
+	mov	%rdx, %r8
+	mov	(%rdi,%rsi,8), %rax
+	add	-16(%rdi,%rsi,8), %r9
+	adc	$0, %r8
+	mul	%rbx
+	add	%rax, %r9
+	adc	%rdx, %r8
+	jmp	L(11)
+
+L(b0):	mov	-8(%rdi,%rsi,8), %r8
+	mov	-16(%rdi,%rsi,8), %r9
+
+L(11):	sub	$4, %rsi
+	jb	L(ed2)
+	lea	40(%rdi,%rsi,8), %rdi
+	mov	-40(%rdi), %r11
+	mov	-32(%rdi), %rax
+	jmp	L(m0)
+
+	ALIGN(16)
+L(top):	mov	-24(%rdi), %r9
+	add	%rax, %r11
+	mov	-16(%rdi), %rax
+	adc	%rdx, %r12
+	mul	%r10
+	add	%rax, %r9
+	mov	%r11, %rax
+	mov	%rdx, %r8
+	adc	$0, %r8
+	mul	%rbx
+	add	%rax, %r9
+	mov	%r12, %rax
+	adc	%rdx, %r8
+	mul	%rbp
+	sub	$2, %rsi
+	jb	L(ed1)
+	mov	-40(%rdi), %r11
+	add	%rax, %r9
+	mov	-32(%rdi), %rax
+	adc	%rdx, %r8
+L(m0):	mul	%r10
+	add	%rax, %r11
+	mov	%r9, %rax
+	mov	%rdx, %r12
+	adc	$0, %r12
+	mul	%rbx
+	add	%rax, %r11
+	lea	-32(%rdi), %rdi		C ap -= 4
+	mov	%r8, %rax
+	adc	%rdx, %r12
+	mul	%rbp
+	sub	$2, %rsi
+	jae	L(top)
+
+L(ed0):	mov	%r11, %r9
+	mov	%r12, %r8
+L(ed1):	add	%rax, %r9
+	adc	%rdx, %r8
+L(ed2):	mov	8(%r13), R32(%rdi)		C cnt
+	mov	%r8, %rax
+	mov	%r9, %r8
+	mul	%r10
+	add	%rax, %r8
+	adc	$0, %rdx
+L(1):	xor	R32(%rcx), R32(%rcx)
+	mov	%r8, %r9
+	sub	R32(%rdi), R32(%rcx)
+	shr	R8(%rcx), %r9
+	mov	R32(%rdi), R32(%rcx)
+	sal	R8(%rcx), %rdx
+	or	%rdx, %r9
+	sal	R8(%rcx), %r8
+	mov	%r9, %rax
+	mulq	(%r13)
+	mov	%rax, %rsi
+	inc	%r9
+	add	%r8, %rsi
+	adc	%r9, %rdx
+	imul	%r14, %rdx
+	sub	%rdx, %r8
+	lea	(%r8,%r14), %rax
+	cmp	%r8, %rsi
+	cmovc	%rax, %r8
+	mov	%r8, %rax
+	sub	%r14, %rax
+	cmovc	%r8, %rax
+	mov	R32(%rdi), R32(%rcx)
+	shr	R8(%rcx), %rax
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	FUNC_EXIT()
+	ret
+L(one):
+	mov	(%rdi), %r8
+	mov	8(%rcx), R32(%rdi)
+	xor	%rdx, %rdx
+	jmp	L(1)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_2p_cps)
+	FUNC_ENTRY(2)
+	push	%rbp
+	bsr	%rsi, %rcx
+	push	%rbx
+	mov	%rdi, %rbx
+	push	%r12
+	xor	$63, R32(%rcx)
+	mov	%rsi, %r12
+	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
+	sal	R8(%rcx), %r12		C b << cnt
+IFSTD(`	mov	%r12, %rdi	')	C pass parameter
+IFDOS(`	mov	%r12, %rcx	')	C pass parameter
+IFDOS(`	sub	$32, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFDOS(`	add	$32, %rsp	')
+	mov	%r12, %r8
+	mov	%rax, %r11
+	mov	%rax, (%rbx)		C store bi
+	mov	%rbp, 8(%rbx)		C store cnt
+	neg	%r8
+	mov	R32(%rbp), R32(%rcx)
+	mov	$1, R32(%rsi)
+ifdef(`SHLD_SLOW',`
+	shl	R8(%rcx), %rsi
+	neg	R32(%rcx)
+	mov	%rax, %rbp
+	shr	R8(%rcx), %rax
+	or	%rax, %rsi
+	mov	%rbp, %rax
+	neg	R32(%rcx)
+',`
+	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
+')
+	imul	%r8, %rsi
+	mul	%rsi
+
+	add	%rsi, %rdx
+	shr	R8(%rcx), %rsi
+	mov	%rsi, 16(%rbx)		C store B1modb
+
+	not	%rdx
+	imul	%r12, %rdx
+	lea	(%rdx,%r12), %rsi
+	cmp	%rdx, %rax
+	cmovnc	%rdx, %rsi
+	mov	%r11, %rax
+	mul	%rsi
+
+	add	%rsi, %rdx
+	shr	R8(%rcx), %rsi
+	mov	%rsi, 24(%rbx)		C store B2modb
+
+	not	%rdx
+	imul	%r12, %rdx
+	add	%rdx, %r12
+	cmp	%rdx, %rax
+	cmovnc	%rdx, %r12
+
+	shr	R8(%rcx), %r12
+	mov	%r12, 32(%rbx)		C store B3modb
+
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
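
A semantic C model of the scheme above (B = 2^64): the _cps routine
precomputes B^k mod b, and the main loop folds two limbs per step via the
identity r*B^2 + u1*B + u0 == r*(B^2 mod b) + u1*(B mod b) + u0 (mod b).
This sketch reduces eagerly with %, which is slow but obviously correct;
the real code keeps an unreduced two-limb residue, stores the table shifted
by cnt, and handles odd n in the feed-in.  The mod_1_4 file below extends
the same table through B5modb and folds four limbs per step.

#include <stdint.h>

/* assumes n >= 2 and even; illustrative only */
static uint64_t
mod_1s_2p_model (const uint64_t *ap, long n, uint64_t b)
{
  uint64_t B1 = (uint64_t) (((unsigned __int128) 1 << 64) % b);
  uint64_t B2 = (uint64_t) ((unsigned __int128) B1 * B1 % b);
  uint64_t r = (uint64_t) (((unsigned __int128) (ap[n-1] % b) * B1
                            + ap[n-2]) % b);
  for (long i = n - 4; i >= 0; i -= 2)
    r = (uint64_t) (((unsigned __int128) r * B2 % b
                     + (unsigned __int128) (ap[i+1] % b) * B1 % b
                     + ap[i]) % b);
  return r;
}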
diff --git a/third_party/gmp/mpn/x86_64/mod_1_4.asm b/third_party/gmp/mpn/x86_64/mod_1_4.asm
new file mode 100644
index 0000000..6cf304c
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mod_1_4.asm
@@ -0,0 +1,272 @@
+dnl  AMD64 mpn_mod_1s_4p
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009-2012, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 3
+C AMD K10	 3
+C Intel P4	15.5
+C Intel core2	 5
+C Intel corei	 4
+C Intel atom	23
+C VIA nano	 4.75
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	FUNC_ENTRY(4)
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	%rdx, %r15
+	mov	%rcx, %r14
+	mov	16(%rcx), %r11		C B1modb
+	mov	24(%rcx), %rbx		C B2modb
+	mov	32(%rcx), %rbp		C B3modb
+	mov	40(%rcx), %r13		C B4modb
+	mov	48(%rcx), %r12		C B5modb
+	xor	R32(%r8), R32(%r8)
+	mov	R32(%rsi), R32(%rdx)
+	and	$3, R32(%rdx)
+	je	L(b0)
+	cmp	$2, R32(%rdx)
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	lea	-24(%rdi,%rsi,8), %rdi
+	mov	8(%rdi), %rax
+	mul	%r11
+	mov	(%rdi), %r9
+	add	%rax, %r9
+	adc	%rdx, %r8
+	mov	16(%rdi), %rax
+	mul	%rbx
+	jmp	L(m0)
+
+	ALIGN(8)
+L(b0):	lea	-32(%rdi,%rsi,8), %rdi
+	mov	8(%rdi), %rax
+	mul	%r11
+	mov	(%rdi), %r9
+	add	%rax, %r9
+	adc	%rdx, %r8
+	mov	16(%rdi), %rax
+	mul	%rbx
+	add	%rax, %r9
+	adc	%rdx, %r8
+	mov	24(%rdi), %rax
+	mul	%rbp
+	jmp	L(m0)
+
+	ALIGN(8)
+L(b1):	lea	-8(%rdi,%rsi,8), %rdi
+	mov	(%rdi), %r9
+	jmp	L(m1)
+
+	ALIGN(8)
+L(b2):	lea	-16(%rdi,%rsi,8), %rdi
+	mov	8(%rdi), %r8
+	mov	(%rdi), %r9
+	jmp	L(m1)
+
+	ALIGN(16)
+L(top):	mov	-24(%rdi), %rax
+	mov	-32(%rdi), %r10
+	mul	%r11			C up[1] * B1modb
+	add	%rax, %r10
+	mov	-16(%rdi), %rax
+	mov	$0, R32(%rcx)
+	adc	%rdx, %rcx
+	mul	%rbx			C up[2] * B2modb
+	add	%rax, %r10
+	mov	-8(%rdi), %rax
+	adc	%rdx, %rcx
+	sub	$32, %rdi
+	mul	%rbp			C up[3] * B3modb
+	add	%rax, %r10
+	mov	%r13, %rax
+	adc	%rdx, %rcx
+	mul	%r9			C rl * B4modb
+	add	%rax, %r10
+	mov	%r12, %rax
+	adc	%rdx, %rcx
+	mul	%r8			C rh * B5modb
+	mov	%r10, %r9
+	mov	%rcx, %r8
+L(m0):	add	%rax, %r9
+	adc	%rdx, %r8
+L(m1):	sub	$4, %rsi
+	ja	L(top)
+
+L(end):	mov	8(%r14), R32(%rsi)
+	mov	%r8, %rax
+	mul	%r11
+	mov	%rax, %r8
+	add	%r9, %r8
+	adc	$0, %rdx
+	xor	R32(%rcx), R32(%rcx)
+	sub	R32(%rsi), R32(%rcx)
+	mov	%r8, %rdi
+	shr	R8(%rcx), %rdi
+	mov	R32(%rsi), R32(%rcx)
+	sal	R8(%rcx), %rdx
+	or	%rdx, %rdi
+	mov	%rdi, %rax
+	mulq	(%r14)
+	mov	%r15, %rbx
+	mov	%rax, %r9
+	sal	R8(%rcx), %r8
+	inc	%rdi
+	add	%r8, %r9
+	adc	%rdi, %rdx
+	imul	%rbx, %rdx
+	sub	%rdx, %r8
+	lea	(%r8,%rbx), %rax
+	cmp	%r8, %r9
+	cmovc	%rax, %r8
+	mov	%r8, %rax
+	sub	%rbx, %rax
+	cmovc	%r8, %rax
+	shr	R8(%rcx), %rax
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+	FUNC_ENTRY(2)
+	push	%rbp
+	bsr	%rsi, %rcx
+	push	%rbx
+	mov	%rdi, %rbx
+	push	%r12
+	xor	$63, R32(%rcx)
+	mov	%rsi, %r12
+	mov	R32(%rcx), R32(%rbp)	C preserve cnt over call
+	sal	R8(%rcx), %r12		C b << cnt
+IFSTD(`	mov	%r12, %rdi	')	C pass parameter
+IFDOS(`	mov	%r12, %rcx	')	C pass parameter
+IFDOS(`	sub	$32, %rsp	')
+	ASSERT(nz, `test $15, %rsp')
+	CALL(	mpn_invert_limb)
+IFDOS(`	add	$32, %rsp	')
+	mov	%r12, %r8
+	mov	%rax, %r11
+	mov	%rax, (%rbx)		C store bi
+	mov	%rbp, 8(%rbx)		C store cnt
+	neg	%r8
+	mov	R32(%rbp), R32(%rcx)
+	mov	$1, R32(%rsi)
+ifdef(`SHLD_SLOW',`
+	shl	R8(%rcx), %rsi
+	neg	R32(%rcx)
+	mov	%rax, %rbp
+	shr	R8(%rcx), %rax
+	or	%rax, %rsi
+	mov	%rbp, %rax
+	neg	R32(%rcx)
+',`
+	shld	R8(%rcx), %rax, %rsi	C FIXME: Slow on Atom and Nano
+')
+	imul	%r8, %rsi
+	mul	%rsi
+
+	add	%rsi, %rdx
+	shr	R8(%rcx), %rsi
+	mov	%rsi, 16(%rbx)		C store B1modb
+
+	not	%rdx
+	imul	%r12, %rdx
+	lea	(%rdx,%r12), %rsi
+	cmp	%rdx, %rax
+	cmovnc	%rdx, %rsi
+	mov	%r11, %rax
+	mul	%rsi
+
+	add	%rsi, %rdx
+	shr	R8(%rcx), %rsi
+	mov	%rsi, 24(%rbx)		C store B2modb
+
+	not	%rdx
+	imul	%r12, %rdx
+	lea	(%rdx,%r12), %rsi
+	cmp	%rdx, %rax
+	cmovnc	%rdx, %rsi
+	mov	%r11, %rax
+	mul	%rsi
+
+	add	%rsi, %rdx
+	shr	R8(%rcx), %rsi
+	mov	%rsi, 32(%rbx)		C store B3modb
+
+	not	%rdx
+	imul	%r12, %rdx
+	lea	(%rdx,%r12), %rsi
+	cmp	%rdx, %rax
+	cmovnc	%rdx, %rsi
+	mov	%r11, %rax
+	mul	%rsi
+
+	add	%rsi, %rdx
+	shr	R8(%rcx), %rsi
+	mov	%rsi, 40(%rbx)		C store B4modb
+
+	not	%rdx
+	imul	%r12, %rdx
+	add	%rdx, %r12
+	cmp	%rdx, %rax
+	cmovnc	%rdx, %r12
+
+	shr	R8(%rcx), %r12
+	mov	%r12, 48(%rbx)		C store B5modb
+
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/mod_34lsub1.asm b/third_party/gmp/mpn/x86_64/mod_34lsub1.asm
new file mode 100644
index 0000000..75421a6
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mod_34lsub1.asm
@@ -0,0 +1,215 @@
+dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	    cycles/limb
+C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
+C AMD K10	 0.67	   this seems hard to beat
+C AMD bd1	 1
+C AMD bd2	 1
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 0.62
+C AMD bobcat	 1.07
+C AMD jaguar	 1
+C Intel P4	 7.35	   terrible, use old code
+C Intel core2	 1.25	   1+epsilon with huge unrolling
+C Intel NHM	 1.15	   this seems hard to beat
+C Intel SBR	 0.93
+C Intel IBR	 0.93
+C Intel HWL	 0.82
+C Intel BWL	 0.64
+C Intel SKY	 0.60
+C Intel atom	 2.5
+C Intel SLM      1.59
+C VIA nano	 1.25	   this seems hard to beat
+
+C INPUT PARAMETERS
+define(`ap',	%rdi)
+define(`n',	%rsi)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * Review feed-in and wind-down code.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	FUNC_ENTRY(2)
+
+	mov	$0x0000FFFFFFFFFFFF, %r11
+
+	mov	(ap), %rax
+
+	cmp	$2, %rsi
+	ja	L(gt2)
+
+	jb	L(one)
+
+	mov	8(ap), %rsi
+	mov	%rax, %rdx
+	shr	$48, %rax		C src[0] low
+
+	and	%r11, %rdx		C src[0] high
+	add	%rdx, %rax
+	mov	R32(%rsi), R32(%rdx)
+
+	shr	$32, %rsi		C src[1] high
+	add	%rsi, %rax
+
+	shl	$16, %rdx		C src[1] low
+	add	%rdx, %rax
+L(one):	FUNC_EXIT()
+	ret
+
+
+C Don't change this; the wind-down code cannot handle greater values
+define(UNROLL,3)
+
+L(gt2):	mov	8(ap), %rcx
+	mov	16(ap), %rdx
+	xor	%r9, %r9
+	add	$24, ap
+	sub	$eval(UNROLL*3+3), %rsi
+	jc	L(end)
+	ALIGN(16)
+L(top):
+	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+forloop(i,1,UNROLL-1,`dnl
+	add	eval(i*24)(ap), %rax
+	adc	eval(i*24+8)(ap), %rcx
+	adc	eval(i*24+16)(ap), %rdx
+	adc	$0, %r9
+')dnl
+	add	$eval(UNROLL*24), ap
+	sub	$eval(UNROLL*3), %rsi
+	jnc	L(top)
+
+L(end):
+	lea	L(tab)(%rip), %r8
+ifdef(`PIC',
+`	movslq	36(%r8,%rsi,4), %r10
+	add	%r10, %r8
+	jmp	*%r8
+',`
+	jmp	*72(%r8,%rsi,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(4), L(tab))
+	JMPENT(	L(5), L(tab))
+	JMPENT(	L(6), L(tab))
+	JMPENT(	L(7), L(tab))
+	JMPENT(	L(8), L(tab))
+	TEXT
+
+L(6):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(3):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	jmp	L(cj1)
+
+L(7):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(4):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(1):	add	(ap), %rax
+	adc	$0, %rcx
+	jmp	L(cj2)
+
+L(8):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(5):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(2):	add	(ap), %rax
+	adc	8(ap), %rcx
+
+L(cj2):	adc	$0, %rdx
+L(cj1):	adc	$0, %r9
+L(0):	add	%r9, %rax
+	adc	$0, %rcx
+	adc	$0, %rdx
+	adc	$0, %rax
+
+	mov	%rax, %rdi		C 0mod3
+	shr	$48, %rax		C 0mod3 high
+
+	and	%r11, %rdi		C 0mod3 low
+	mov	R32(%rcx), R32(%r10)	C 1mod3
+
+	shr	$32, %rcx		C 1mod3 high
+
+	add	%rdi, %rax		C apply 0mod3 low
+	movzwl	%dx, R32(%rdi)		C 2mod3
+	shl	$16, %r10		C 1mod3 low
+
+	add	%rcx, %rax		C apply 1mod3 high
+	shr	$16, %rdx		C 2mod3 high
+
+	add	%r10, %rax		C apply 1mod3 low
+	shl	$32, %rdi		C 2mod3 low
+
+	add	%rdx, %rax		C apply 2mod3 high
+	add	%rdi, %rax		C apply 2mod3 low
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
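
The idea above in C: since 2^48 == 1 (mod 2^48-1), every 48-bit group of
the input has weight 1, and each 64-bit limb contributes its low 48 bits at
one position and its high 16 bits at the next -- which is what the
0mod3/1mod3/2mod3 recombination at the end implements.  A semantic sketch
(the asm returns an unreduced congruent value; this model reduces fully):

#include <stdint.h>

static uint64_t
mod_34lsub1_model (const uint64_t *ap, long n)
{
  const uint64_t m = ((uint64_t) 1 << 48) - 1;
  unsigned __int128 acc = 0;
  unsigned shift = 0;              /* 0, 16, 32, 0, ... */
  for (long i = 0; i < n; i++)
    {
      acc += (unsigned __int128) ap[i] << shift;  /* 2^(64i) == 2^shift */
      acc %= m;                    /* keep the accumulator small */
      shift = (shift + 64) % 48;   /* 64 == 16 (mod 48) */
    }
  return (uint64_t) acc;
}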
diff --git a/third_party/gmp/mpn/x86_64/mode1o.asm b/third_party/gmp/mpn/x86_64/mode1o.asm
new file mode 100644
index 0000000..2cd2b08
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mode1o.asm
@@ -0,0 +1,171 @@
+dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
+
+dnl  Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	10
+C AMD K10	10
+C Intel P4	33
+C Intel core2	13
+C Intel corei	14.5
+C Intel atom	35
+C VIA nano	 ?
+
+
+C The dependent chain in the main loop is
+C
+C                            cycles
+C	sub	%rdx, %rax	1
+C	imul	%r9, %rax	4
+C	mul	%r8		5
+C			      ----
+C       total		       10
+C
+C The mov load from src seems to need to be scheduled back before the jz to
+C achieve this speed; out-of-order execution apparently can't completely hide
+C the latency otherwise.
+C
+C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
+C for the first iteration (where there's no cbit).
+C
+C The code alignment used (32-byte) for the loop also seems necessary.  Without
+C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
+C run at 11 cycles instead of 10.
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_modexact_1_odd)
+	FUNC_ENTRY(3)
+	mov	$0, R32(%rcx)
+IFDOS(`	jmp	L(ent)		')
+
+PROLOGUE(mpn_modexact_1c_odd)
+	FUNC_ENTRY(4)
+L(ent):
+	C rdi	src
+	C rsi	size
+	C rdx	divisor
+	C rcx	carry
+
+	mov	%rdx, %r8		C d
+	shr	R32(%rdx)		C d/2
+
+	LEA(	binvert_limb_table, %r9)
+
+	and	$127, R32(%rdx)
+	mov	%rcx, %r10		C initial carry
+
+	movzbl	(%r9,%rdx), R32(%rdx)	C inv 8 bits
+
+	mov	(%rdi), %rax		C src[0]
+	lea	(%rdi,%rsi,8), %r11	C src end
+	mov	%r8, %rdi		C d, made available to imull
+
+	lea	(%rdx,%rdx), R32(%rcx)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+
+	neg	%rsi			C -size
+
+	imul	R32(%rdi), R32(%rdx)	C inv*inv*d
+
+	sub	R32(%rdx), R32(%rcx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rcx,%rcx), R32(%rdx)	C 2*inv
+	imul	R32(%rcx), R32(%rcx)	C inv*inv
+
+	imul	R32(%rdi), R32(%rcx)	C inv*inv*d
+
+	sub	R32(%rcx), R32(%rdx)	C inv = 2*inv - inv*inv*d, 32 bits
+	xor	R32(%rcx), R32(%rcx)	C initial cbit
+
+	lea	(%rdx,%rdx), %r9	C 2*inv
+	imul	%rdx, %rdx		C inv*inv
+
+	imul	%r8, %rdx		C inv*inv*d
+
+	sub	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
+	mov	%r10, %rdx		C initial climb
+
+	ASSERT(e,`	C d*inv == 1 mod 2^64
+	mov	%r8, %r10
+	imul	%r9, %r10
+	cmp	$1, %r10')
+
+	inc	%rsi
+	jz	L(one)
+
+
+	ALIGN(16)
+L(top):
+	C rax	l = src[i]-cbit
+	C rcx	new cbit, 0 or 1
+	C rdx	climb, high of last product
+	C rsi	counter, limbs, negative
+	C rdi
+	C r8	divisor
+	C r9	inverse
+	C r11	src end ptr
+
+	sub	%rdx, %rax		C l = src[i]-cbit - climb
+
+	adc	$0, %rcx		C more cbit
+	imul	%r9, %rax		C q = l * inverse
+
+	mul	%r8			C climb = high (q * d)
+
+	mov	(%r11,%rsi,8), %rax	C src[i+1]
+	sub	%rcx, %rax		C next l = src[i+1] - cbit
+	setc	R8(%rcx)		C new cbit
+
+	inc	%rsi
+	jnz	L(top)
+
+
+L(one):
+	sub	%rdx, %rax		C l = src[i]-cbit - climb
+
+	adc	$0, %rcx		C more cbit
+	imul	%r9, %rax		C q = l * inverse
+
+	mul	%r8			C climb = high (q * d)
+
+	lea	(%rcx,%rdx), %rax	C climb+cbit
+	FUNC_EXIT()
+	ret
+
+EPILOGUE(mpn_modexact_1c_odd)
+EPILOGUE(mpn_modexact_1_odd)
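
A C model of the loop above: with inv = d^(-1) mod 2^64 (d odd), each step
picks q so that q*d matches the current limb mod 2^64 and propagates only
the high half of q*d as a "climb" into the next limb.  The result is a
Hensel-norm remainder: zero exactly when d divides the input, but not the
ordinary remainder otherwise.  Semantic sketch; the asm schedules the
cbit/climb bookkeeping differently.

#include <stdint.h>

static uint64_t
modexact_1_model (const uint64_t *src, long n, uint64_t d, uint64_t inv)
{
  uint64_t climb = 0, cbit = 0;
  for (long i = 0; i < n; i++)
    {
      uint64_t t = src[i] - cbit;
      uint64_t b1 = src[i] < cbit;         /* borrow from cbit */
      uint64_t l = t - climb;
      cbit = b1 + (t < climb);             /* borrow from climb */
      uint64_t q = l * inv;                /* q*d == l (mod 2^64) */
      climb = (uint64_t) (((unsigned __int128) q * d) >> 64);
    }
  return climb + cbit;
}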
diff --git a/third_party/gmp/mpn/x86_64/mul_1.asm b/third_party/gmp/mpn/x86_64/mul_1.asm
new file mode 100644
index 0000000..e1ba89b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mul_1.asm
@@ -0,0 +1,192 @@
+dnl  AMD64 mpn_mul_1.
+
+dnl  Copyright 2003-2005, 2007, 2008, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      2.54
+C AMD K10        2.54
+C AMD bull       4.98
+C AMD pile       4.80
+C AMD steam
+C AMD excavator
+C AMD bobcat     5.37
+C AMD jaguar     6.16
+C Intel P4      12.6
+C Intel core2    4.05
+C Intel NHM      4.0
+C Intel SBR      2.91
+C Intel IBR      2.73
+C Intel HWL      2.44
+C Intel BWL      2.39
+C Intel SKL      2.44
+C Intel atom    19.8
+C Intel SLM      9.0
+C VIA nano       4.25
+
+C The loop of this code is the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * The loop is great, but the prologue and epilogue code was quickly written.
+C    Tune it!
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vl',      `%rcx')   C r9
+
+define(`n',       `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+IFDOS(`	define(`up', ``%rsi'')	') dnl
+IFDOS(`	define(`rp', ``%rcx'')	') dnl
+IFDOS(`	define(`vl', ``%r9'')	') dnl
+IFDOS(`	define(`r9', ``rdi'')	') dnl
+IFDOS(`	define(`n',  ``%r8'')	') dnl
+IFDOS(`	define(`r8', ``r11'')	') dnl
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+	push	%rbx
+IFSTD(`	mov	%r8, %r10')
+IFDOS(`	mov	64(%rsp), %r10')	C 40 + 3*8  (3 push insns)
+	jmp	L(common)
+EPILOGUE()
+
+PROLOGUE(mpn_mul_1)
+IFDOS(``push	%rsi		'')
+IFDOS(``push	%rdi		'')
+IFDOS(``mov	%rdx, %rsi	'')
+
+	push	%rbx
+	xor	%r10, %r10
+L(common):
+	mov	(up), %rax		C read first u limb early
+IFSTD(`	mov	n_param, %rbx   ')	C move away n from rdx, mul uses it
+IFDOS(`	mov	n, %rbx         ')
+	mul	vl
+IFSTD(`	mov	%rbx, n         ')
+
+	add	%r10, %rax
+	adc	$0, %rdx
+
+	and	$3, R32(%rbx)
+	jz	L(b0)
+	cmp	$2, R32(%rbx)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	dec	n
+	jne	L(gt1)
+	mov	%rax, (rp)
+	jmp	L(ret)
+L(gt1):	lea	8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r9
+	mov	(up,n,8), %rax
+	mov	%rdx, %r8
+	jmp	L(L1)
+
+L(b0):	lea	(up,n,8), up
+	lea	-16(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	mov	%rax, %r8
+	mov	%rdx, %rbx
+	jmp	 L(L0)
+
+L(b3):	lea	-8(up,n,8), up
+	lea	-24(rp,n,8), rp
+	neg	n
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	jmp	L(L3)
+
+L(b2):	lea	-16(up,n,8), up
+	lea	-32(rp,n,8), rp
+	neg	n
+	xor	%r8, %r8
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%rdx, %r9
+	jmp	L(L2)
+
+	ALIGN(16)
+L(top):	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	mov	(up,n,8), %rax
+	adc	%rdx, %r8
+	mov	$0, R32(%r10)
+L(L1):	mul	vl
+	mov	%r9, 8(rp,n,8)
+	add	%rax, %r8
+	adc	%rdx, %rbx
+L(L0):	mov	8(up,n,8), %rax
+	mul	vl
+	mov	%r8, 16(rp,n,8)
+	add	%rax, %rbx
+	adc	%rdx, %r10
+L(L3):	mov	16(up,n,8), %rax
+	mul	vl
+	mov	%rbx, 24(rp,n,8)
+	mov	$0, R32(%r8)		C zero
+	mov	%r8, %rbx		C zero
+	add	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%r8, %r9		C zero
+	adc	%rdx, %r9
+L(L2):	mul	vl
+	add	$4, n
+	js	 L(top)
+
+	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	adc	%r8, %rdx
+	mov	%r9, 8(rp,n,8)
+	add	%r8, %rdx
+L(ret):	mov	%rdx, %rax
+
+	pop	%rbx
+IFDOS(``pop	%rdi		'')
+IFDOS(``pop	%rsi		'')
+	ret
+EPILOGUE()
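
The recurrence the unrolled loop above implements, as plain C: rp[] gets
up[] times the single limb vl, and the limb that falls off the top is
returned (mpn_mul_1c seeds the carry from its extra argument instead of
zero).  Semantic sketch only; the asm is 4-way unrolled and
software-pipelined.

#include <stdint.h>

static uint64_t
mul_1_model (uint64_t *rp, const uint64_t *up, long n, uint64_t vl)
{
  uint64_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * vl + carry;
      rp[i] = (uint64_t) p;
      carry = (uint64_t) (p >> 64);
    }
  return carry;
}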
diff --git a/third_party/gmp/mpn/x86_64/mul_2.asm b/third_party/gmp/mpn/x86_64/mul_2.asm
new file mode 100644
index 0000000..d64313b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mul_2.asm
@@ -0,0 +1,204 @@
+dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl  store the result in a third limb vector.
+
+dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9      4.53
+C AMD K10        4.53
+C AMD bull       9.76   10.37
+C AMD pile       9.22
+C AMD steam
+C AMD excavator
+C AMD bobcat    11.3
+C AMD jaguar    11.9
+C Intel P4      25.0
+C Intel core2    8.05
+C Intel NHM      7.72
+C Intel SBR      6.33
+C Intel IBR      6.15
+C Intel HWL      6.00
+C Intel BWL      4.44
+C Intel SKL      4.54
+C Intel atom    39.0
+C Intel SLM     24.0
+C VIA nano
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Work on feed-in and wind-down code.
+C  * Convert "mov $0" to "xor".
+C  * Adjust initial lea to save some bytes.
+C  * Perhaps adjust n from n_param&3 value?
+C  * Replace with 2.25 c/l sequence.
+
+C INPUT PARAMETERS
+define(`rp',	 `%rdi')
+define(`up',	 `%rsi')
+define(`n_param',`%rdx')
+define(`vp',	 `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_2)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	(up), %rax
+
+	mov	n_param, n
+	neg	n
+	lea	-8(up,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+
+	and	$3, R32(n_param)
+	jz	L(m2p0)
+	cmp	$2, R32(n_param)
+	jc	L(m2p1)
+	jz	L(m2p2)
+L(m2p3):
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	8(up,n,8), %rax
+	add	$-1, n
+	mul	v1
+	add	%rax, w2
+	jmp	L(m23)
+L(m2p0):
+	mul	v0
+	xor	R32(w2), R32(w2)
+	mov	%rax, w0
+	mov	%rdx, w1
+	jmp	L(m20)
+L(m2p1):
+	mul	v0
+	xor	R32(w3), R32(w3)
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	add	$1, n
+	jmp	L(m2top)
+L(m2p2):
+	mul	v0
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	8(up,n,8), %rax
+	add	$-2, n
+	jmp	L(m22)
+
+
+	ALIGN(32)
+L(m2top):
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	0(up,n,8), %rax
+	adc	$0, R32(w1)
+	mov	$0, R32(w2)
+	mul	v1
+	add	%rax, w0
+	mov	w3, 0(rp,n,8)
+	adc	%rdx, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	add	%rax, w0
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(m20):	mov	8(up,n,8), %rax
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,n,8), %rax
+	mov	$0, R32(w3)
+	mul	v0
+	add	%rax, w1
+	mov	16(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1
+	add	%rax, w2
+	mov	w0, 8(rp,n,8)
+L(m23):	adc	%rdx, w3
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	w1, 16(rp,n,8)
+	mov	24(up,n,8), %rax
+	mov	$0, R32(w1)
+	adc	$0, R32(w0)
+L(m22):	mul	v1
+	add	%rax, w3
+	mov	w2, 24(rp,n,8)
+	adc	%rdx, w0
+	mov	32(up,n,8), %rax
+	mul	v0
+	add	$4, n
+	js	L(m2top)
+
+
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mov	(up), %rax
+	mul	v1
+	mov	w3, (rp)
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(rp)
+	mov	w1, %rax
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
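
What the loop above computes, as plain C: {up,n} times the two limbs
{vp,2}, with n+1 limbs stored at rp and the most significant limb
returned.  The asm keeps the partial-product window in w0..w3 across a
4-way unrolled loop; this semantic sketch carries the same two pending
limbs explicitly.

#include <stdint.h>

static uint64_t
mul_2_model (uint64_t *rp, const uint64_t *up, long n, const uint64_t *vp)
{
  uint64_t w0 = 0, w1 = 0;         /* pending limbs at positions i, i+1 */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * vp[0] + w0;
      unsigned __int128 u = (unsigned __int128) up[i] * vp[1]
                            + w1 + (uint64_t) (t >> 64);
      rp[i] = (uint64_t) t;
      w0 = (uint64_t) u;           /* cannot overflow: the sum is */
      w1 = (uint64_t) (u >> 64);   /* at most B^2 - 1             */
    }
  rp[n] = w0;
  return w1;
}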
diff --git a/third_party/gmp/mpn/x86_64/mulx/adx/addmul_1.asm b/third_party/gmp/mpn/x86_64/mulx/adx/addmul_1.asm
new file mode 100644
index 0000000..9ceb611
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/mulx/adx/addmul_1.asm
@@ -0,0 +1,157 @@
+dnl  AMD64 mpn_addmul_1 for CPUs with mulx and adx.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 -
+C AMD zen	 ?
+C AMD bt1	 -
+C AMD bt2	 -
+C Intel P4	 -
+C Intel PNR	 -
+C Intel NHM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 -
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 -
+C Intel SLM	 -
+C VIA nano	 -
+
+define(`rp',      `%rdi')	dnl rcx
+define(`up',      `%rsi')	dnl rdx
+define(`n_param', `%rdx')	dnl r8
+define(`v0_param',`%rcx')	dnl r9
+
+define(`n',       `%rcx')	dnl
+define(`v0',      `%rdx')	dnl
+
+C Testing mechanism for running this on older AMD64 processors
+ifelse(FAKE_MULXADX,1,`
+  include(CONFIG_TOP_SRCDIR`/mpn/x86_64/missing-call.m4')
+',`
+  define(`adox',	``adox'	$1, $2')
+  define(`adcx',	``adcx'	$1, $2')
+  define(`mulx',	``mulx'	$1, $2, $3')
+')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+	mov	(up), %r8
+
+	push	%rbx
+	push	%r12
+	push	%r13
+
+	lea	(up,n_param,8), up
+	lea	-16(rp,n_param,8), rp
+	mov	R32(n_param), R32(%rax)
+	xchg	v0_param, v0		C FIXME: is this insn fast?
+
+	neg	n
+
+	and	$3, R8(%rax)
+	jz	L(b0)
+	cmp	$2, R8(%rax)
+	jl	L(b1)
+	jz	L(b2)
+
+L(b3):	mulx(	(up,n,8), %r11, %r10)
+	mulx(	8(up,n,8), %r13, %r12)
+	mulx(	16(up,n,8), %rbx, %rax)
+	dec	n
+	jmp	L(lo3)
+
+L(b0):	mulx(	(up,n,8), %r9, %r8)
+	mulx(	8(up,n,8), %r11, %r10)
+	mulx(	16(up,n,8), %r13, %r12)
+	jmp	L(lo0)
+
+L(b2):	mulx(	(up,n,8), %r13, %r12)
+	mulx(	8(up,n,8), %rbx, %rax)
+	lea	2(n), n
+	jrcxz	L(wd2)
+L(gt2):	mulx(	(up,n,8), %r9, %r8)
+	jmp	L(lo2)
+
+L(b1):	and	R8(%rax), R8(%rax)
+	mulx(	(up,n,8), %rbx, %rax)
+	lea	1(n), n
+	jrcxz	L(wd1)
+	mulx(	(up,n,8), %r9, %r8)
+	mulx(	8(up,n,8), %r11, %r10)
+	jmp	L(lo1)
+
+L(end):	adcx(	%r10, %r13)
+	mov	%r11, -8(rp)
+L(wd2):	adox(	(rp), %r13)
+	adcx(	%r12, %rbx)
+	mov	%r13, (rp)
+L(wd1):	adox(	8(rp), %rbx)
+	adcx(	%rcx, %rax)
+	adox(	%rcx, %rax)
+	mov	%rbx, 8(rp)
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	ret
+
+L(top):	jrcxz	L(end)
+	mulx(	(up,n,8), %r9, %r8)
+	adcx(	%r10, %r13)
+	mov	%r11, -8(rp,n,8)
+L(lo2):	adox(	(rp,n,8), %r13)
+	mulx(	8(up,n,8), %r11, %r10)
+	adcx(	%r12, %rbx)
+	mov	%r13, (rp,n,8)
+L(lo1):	adox(	8(rp,n,8), %rbx)
+	mulx(	16(up,n,8), %r13, %r12)
+	adcx(	%rax, %r9)
+	mov	%rbx, 8(rp,n,8)
+L(lo0):	adox(	16(rp,n,8), %r9)
+	mulx(	24(up,n,8), %rbx, %rax)
+	adcx(	%r8, %r11)
+	mov	%r9, 16(rp,n,8)
+L(lo3):	adox(	24(rp,n,8), %r11)
+	lea	4(n), n
+	jmp	L(top)
+EPILOGUE()
+ASM_END()
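
The underlying recurrence, as portable C: rp[] += up[] * v0 with the top
carry limb returned.  The point of the mulx/adx code above is that this
single carry chain is split in two -- adcx accumulates the product high
halves through CF while adox folds in the old rp[] limbs through OF, and
mulx produces products without touching either flag.  Semantic sketch only.

#include <stdint.h>

static uint64_t
addmul_1_model (uint64_t *rp, const uint64_t *up, long n, uint64_t v0)
{
  uint64_t carry = 0;
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v0
                            + rp[i] + carry;     /* fits: <= B^2 - 1 */
      rp[i] = (uint64_t) p;
      carry = (uint64_t) (p >> 64);
    }
  return carry;
}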
diff --git a/third_party/gmp/mpn/x86_64/nano/copyd.asm b/third_party/gmp/mpn/x86_64/nano/copyd.asm
new file mode 100644
index 0000000..f0dc54a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/nano/copyd.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyd optimised for Intel Sandy Bridge.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/nano/copyi.asm b/third_party/gmp/mpn/x86_64/nano/copyi.asm
new file mode 100644
index 0000000..9c26e00
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/nano/copyi.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyi optimised for Intel Sandy Bridge.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi-palignr.asm')
diff --git a/third_party/gmp/mpn/x86_64/nano/dive_1.asm b/third_party/gmp/mpn/x86_64/nano/dive_1.asm
new file mode 100644
index 0000000..e9a0763
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/nano/dive_1.asm
@@ -0,0 +1,166 @@
+dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C	       norm	       unorm
+C AMD K8,K9	11		11
+C AMD K10	11		11
+C Intel P4	 ?
+C Intel core2	13.5		13.25
+C Intel corei	14.25
+C Intel atom	34		36
+C VIA nano	19.25		19.25
+
+
+C INPUT PARAMETERS
+C rp		rdi
+C up		rsi
+C n		rdx
+C divisor	rcx
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	mov	%rcx, %rax
+	xor	R32(%rcx), R32(%rcx)	C shift count
+	mov	%rdx, %r8
+
+	bt	$0, R32(%rax)
+	jc	L(odd)			C skip bsfq unless divisor is even
+	bsf	%rax, %rcx
+	shr	R8(%rcx), %rax
+L(odd):	mov	%rax, %rbx
+	shr	R32(%rax)
+	and	$127, R32(%rax)		C d/2, 7 bits
+
+	LEA(	binvert_limb_table, %rdx)
+
+	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
+
+	mov	%rbx, %r11		C d without twos
+
+	lea	(%rax,%rax), R32(%rdx)	C 2*inv
+	imul	R32(%rax), R32(%rax)	C inv*inv
+	imul	R32(%rbx), R32(%rax)	C inv*inv*d
+	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
+	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
+
+	lea	(%rax,%rax), %r10	C 2*inv
+	imul	%rax, %rax		C inv*inv
+	imul	%rbx, %rax		C inv*inv*d
+	sub	%rax, %r10		C inv = 2*inv - inv*inv*d, 64 bits
+
+	lea	(%rsi,%r8,8), %rsi	C up end
+	lea	-8(%rdi,%r8,8), %rdi	C rp end
+	neg	%r8			C -n
+
+	mov	(%rsi,%r8,8), %rax	C up[0]
+
+	inc	%r8
+	jz	L(one)
+
+	test	R32(%rcx), R32(%rcx)
+	jnz	L(unorm)		C branch if count != 0
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(nent)
+
+	ALIGN(8)
+L(ntop):mul	%r11			C carry limb in rdx	0 10
+	mov	-8(%rsi,%r8,8), %rax	C
+	sub	%rbx, %rax		C apply carry bit
+	setc	%bl			C
+	sub	%rdx, %rax		C apply carry limb	5
+	adc	$0, %rbx		C			6
+L(nent):imul	%r10, %rax		C			6
+	mov	%rax, (%rdi,%r8,8)	C
+	inc	%r8			C
+	jnz	L(ntop)
+
+	mov	-8(%rsi), %r9		C up high limb
+	jmp	L(com)
+
+L(unorm):
+	mov	(%rsi,%r8,8), %r9	C up[1]
+	shr	R8(%rcx), %rax		C
+	neg	R32(%rcx)
+	shl	R8(%rcx), %r9		C
+	neg	R32(%rcx)
+	or	%r9, %rax
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(uent)
+
+	ALIGN(8)
+L(utop):mul	%r11			C carry limb in rdx	0 10
+	mov	(%rsi,%r8,8), %rax	C
+	shl	R8(%rcx), %rax		C
+	neg	R32(%rcx)
+	or	%r9, %rax
+	sub	%rbx, %rax		C apply carry bit
+	setc	%bl			C
+	sub	%rdx, %rax		C apply carry limb	5
+	adc	$0, %rbx		C			6
+L(uent):imul	%r10, %rax		C			6
+	mov	(%rsi,%r8,8), %r9	C
+	shr	R8(%rcx), %r9		C
+	neg	R32(%rcx)
+	mov	%rax, (%rdi,%r8,8)	C
+	inc	%r8			C
+	jnz	L(utop)
+
+L(com):	mul	%r11			C carry limb in rdx
+	sub	%rbx, %r9		C apply carry bit
+	sub	%rdx, %r9		C apply carry limb
+	imul	%r10, %r9
+	mov	%r9, (%rdi)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(one):	shr	R8(%rcx), %rax
+	imul	%r10, %rax
+	mov	%rax, (%rdi)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
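
The inverse computation inlined above, as C: starting from an 8-bit
inverse taken from binvert_limb_table, each Newton step
inv = 2*inv - inv*inv*d doubles the number of correct low bits of
1/d mod 2^64 (d odd, twos already stripped).  Sketch assuming the usual
128-entry table indexed by (d >> 1) & 127:

#include <stdint.h>

static uint64_t
binvert_limb_model (uint64_t d, const unsigned char table[128])
{
  uint64_t inv = table[(d >> 1) & 127];   /*  8 correct bits */
  inv = 2 * inv - inv * inv * d;          /* 16 correct bits */
  inv = 2 * inv - inv * inv * d;          /* 32 correct bits */
  inv = 2 * inv - inv * inv * d;          /* 64 correct bits */
  /* now inv * d == 1 (mod 2^64), matching the ASSERT in mode1o.asm */
  return inv;
}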
diff --git a/third_party/gmp/mpn/x86_64/nano/gcd_11.asm b/third_party/gmp/mpn/x86_64/nano/gcd_11.asm
new file mode 100644
index 0000000..4723093
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/nano/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_11.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/core2/gcd_11.asm')
diff --git a/third_party/gmp/mpn/x86_64/nano/gmp-mparam.h b/third_party/gmp/mpn/x86_64/nano/gmp-mparam.h
new file mode 100644
index 0000000..fde69db
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/nano/gmp-mparam.h
@@ -0,0 +1,243 @@
+/* VIA Nano gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2010, 2012, 2014 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define SHLD_SLOW 1
+#define SHRD_SLOW 1
+
+/* 1600 MHz Nano 2xxx */
+/* FFT tuning limit = 25000000 */
+/* Generated by tuneup.c, 2014-03-12, gcc 4.2 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        18
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        20
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define MUL_TOOM22_THRESHOLD                27
+#define MUL_TOOM33_THRESHOLD                38
+#define MUL_TOOM44_THRESHOLD               324
+#define MUL_TOOM6H_THRESHOLD               450
+#define MUL_TOOM8H_THRESHOLD               632
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     207
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     211
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     219
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     315
+
+#define SQR_BASECASE_THRESHOLD              10
+#define SQR_TOOM2_THRESHOLD                 52
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                387
+#define SQR_TOOM6_THRESHOLD                662
+#define SQR_TOOM8_THRESHOLD                781
+
+#define MULMID_TOOM42_THRESHOLD             32
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             376  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    376, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     35, 9}, {     19, 8}, {     41, 9}, {     23, 8}, \
+    {     49, 9}, {     27,10}, {     15, 9}, {     43,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,10}, {     79,11}, {     47,10}, {    103,12}, \
+    {     31,11}, {     63,10}, {    143,11}, {     79,10}, \
+    {    159, 9}, {    319,10}, {    175,11}, {     95, 9}, \
+    {    383, 8}, {    767,10}, {    207,11}, {    111,12}, \
+    {     63,11}, {    127,10}, {    255,11}, {    143, 9}, \
+    {    575, 8}, {   1151,10}, {    303,11}, {    159,10}, \
+    {    319, 9}, {    639, 8}, {   1279,10}, {    335,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831, 8}, {   1663,10}, \
+    {    447,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511, 9}, {   1023,11}, {    271,10}, {    543, 9}, \
+    {   1087,10}, {    575, 9}, {   1215,12}, {    159,11}, \
+    {    319,10}, {    639, 9}, {   1279,11}, {    335,10}, \
+    {    671, 9}, {   1343,11}, {    351,10}, {    703, 9}, \
+    {   1407,12}, {    191,11}, {    383,10}, {    767, 9}, \
+    {   1535,10}, {    831, 9}, {   1663,12}, {    223,11}, \
+    {    447,10}, {    895,13}, {    127,12}, {    255,11}, \
+    {    511,10}, {   1023,11}, {    543,10}, {   1087,12}, \
+    {    287,11}, {    575,10}, {   1151,11}, {    607,10}, \
+    {   1215,12}, {    319,11}, {    639,10}, {   1279,11}, \
+    {    671,10}, {   1343,12}, {    351,11}, {    703,10}, \
+    {   1407,13}, {    191,12}, {    383,11}, {    767,10}, \
+    {   1535,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447,11}, {    895,10}, {   1791,14}, {    127,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    543,11}, \
+    {   1087,12}, {    575,11}, {   1151,12}, {    607,11}, \
+    {   1215,13}, {    319,12}, {    639,11}, {   1279,12}, \
+    {    671,11}, {   1343,12}, {    703,11}, {   1407,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    831,11}, \
+    {   1663,13}, {    447,12}, {    895,11}, {   1791,13}, \
+    {    511,12}, {   1023,11}, {   2047,12}, {   1087,13}, \
+    {    575,12}, {   1151,11}, {   2303,12}, {   1215,13}, \
+    {    639,12}, {   1279,11}, {   2559,12}, {   1343,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1663,13}, {    895,12}, \
+    {   1791,13}, {    959,14}, {    511,13}, {   1023,12}, \
+    {   2047,13}, {   1087,12}, {   2175,13}, {   1151,12}, \
+    {   2303,13}, {   1215,14}, {    639,13}, {   1279,12}, \
+    {   2559,13}, {   1407,12}, {   2815,13}, {   1471,14}, \
+    {    767,13}, {   1535,12}, {   3071,13}, {   1663,14}, \
+    {    895,13}, {   1791,12}, {   3583,13}, {   1919,15}, \
+    {    511,14}, {   1023,13}, {   2047,12}, {   4095,13}, \
+    {   2175,14}, {   1151,13}, {   2303,12}, {   4607,13}, \
+    {   2431,14}, {   1279,13}, {   2559,12}, {   5119,14}, \
+    {   1407,13}, {   2815,12}, {   5631,15}, {  32768,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 224
+#define MUL_FFT_THRESHOLD                 3520
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     19, 6}, {     10, 5}, {     21, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     63, 9}, {    127,10}, {     71, 9}, \
+    {    143,10}, {     79,11}, {     47,10}, {     95, 9}, \
+    {    191,10}, {    103,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    135, 7}, {   1087, 9}, \
+    {    287,11}, {     79, 9}, {    319, 8}, {    639,10}, \
+    {    167,11}, {     95,10}, {    191, 9}, {    383, 8}, \
+    {    767,11}, {    111,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511, 8}, {   1023,10}, {    271, 9}, \
+    {    543, 8}, {   1087,11}, {    143, 9}, {    575, 8}, \
+    {   1151,10}, {    303, 9}, {    639, 8}, {   1279,10}, \
+    {    335, 9}, {    671,10}, {    351, 9}, {    703,12}, \
+    {     95,11}, {    191,10}, {    383, 9}, {    767,11}, \
+    {    207,10}, {    415, 9}, {    831,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511, 9}, {   1023,11}, \
+    {    271,10}, {    543, 9}, {   1087,10}, {    575, 9}, \
+    {   1151,11}, {    303,10}, {    607, 9}, {   1215,12}, \
+    {    159,11}, {    319,10}, {    639, 9}, {   1279,10}, \
+    {    671, 9}, {   1343,11}, {    351,10}, {    703, 9}, \
+    {   1407,12}, {    191,11}, {    383,10}, {    767, 9}, \
+    {   1535,11}, {    415,10}, {    831, 9}, {   1663,12}, \
+    {    223,11}, {    447,10}, {    959,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,11}, {    575,10}, {   1215,12}, {    319,11}, \
+    {    639,10}, {   1279,11}, {    671,10}, {   1343,12}, \
+    {    351,11}, {    703,10}, {   1407,13}, {    191,12}, \
+    {    383,11}, {    767,10}, {   1535,12}, {    415,11}, \
+    {    831,10}, {   1663,12}, {    447,11}, {    895,10}, \
+    {   1791,12}, {    479,11}, {    959,14}, {    127,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,11}, {   1215,13}, \
+    {    319,12}, {    639,11}, {   1279,12}, {    671,11}, \
+    {   1343,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    959,13}, \
+    {    511,12}, {   1023,11}, {   2047,12}, {   1087,13}, \
+    {    575,12}, {   1215,13}, {    639,12}, {   1343,13}, \
+    {    703,12}, {   1407,11}, {   2815,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1663,13}, {    895,12}, \
+    {   1791,13}, {    959,14}, {    511,13}, {   1023,12}, \
+    {   2047,13}, {   1087,12}, {   2175,13}, {   1215,14}, \
+    {    639,13}, {   1279,12}, {   2559,13}, {   1407,12}, \
+    {   2815,14}, {    767,13}, {   1535,12}, {   3071,13}, \
+    {   1663,14}, {    895,13}, {   1791,12}, {   3583,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2047,12}, \
+    {   4095,13}, {   2175,14}, {   1151,13}, {   2303,12}, \
+    {   4607,14}, {   1279,13}, {   2559,14}, {   1407,13}, \
+    {   2815,15}, {  32768,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 230
+#define SQR_FFT_THRESHOLD                 2496
+
+#define MULLO_BASECASE_THRESHOLD            13
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD             6633
+
+#define DC_DIV_QR_THRESHOLD                 56
+#define DC_DIVAPPR_Q_THRESHOLD             173
+#define DC_BDIV_QR_THRESHOLD                55
+#define DC_BDIV_Q_THRESHOLD                 96
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               202
+#define INV_APPR_THRESHOLD                 166
+
+#define BINV_NEWTON_THRESHOLD              246
+#define REDC_1_TO_REDC_2_THRESHOLD           7
+#define REDC_2_TO_REDC_N_THRESHOLD          85
+
+#define MU_DIV_QR_THRESHOLD               1499
+#define MU_DIVAPPR_Q_THRESHOLD            1652
+#define MUPI_DIV_QR_THRESHOLD               83
+#define MU_BDIV_QR_THRESHOLD              1210
+#define MU_BDIV_Q_THRESHOLD               1499
+
+#define POWM_SEC_TABLE  1,28,129,642,2387
+
+#define MATRIX22_STRASSEN_THRESHOLD         15
+#define HGCD_THRESHOLD                     127
+#define HGCD_APPR_THRESHOLD                214
+#define HGCD_REDUCE_THRESHOLD             2479
+#define GCD_DC_THRESHOLD                   487
+#define GCDEXT_DC_THRESHOLD                505
+#define JACOBI_BASE_METHOD                   4
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        24
+#define SET_STR_DC_THRESHOLD               802
+#define SET_STR_PRECOMPUTE_THRESHOLD      2042
+
+#define FAC_DSC_THRESHOLD                 1737
+#define FAC_ODD_THRESHOLD                   44
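The {size,k} pairs in the MUL_FFT_TABLE3 and SQR_FFT_TABLE3 arrays above map an
operand size in limbs to the FFT order k to use from that size up to the next
entry, and the *_FFT_TABLE3_SIZE macros record the entry counts.  A minimal C
sketch of how such a table can be consulted (hypothetical names, not GMP's
internal API):

    #include <stddef.h>

    struct fft_entry { long size; int k; };

    /* Pick the k of the last entry whose size does not exceed n limbs. */
    int fft_best_k(long n, const struct fft_entry *tab, size_t len)
    {
        int k = tab[0].k;
        for (size_t i = 0; i < len && tab[i].size <= n; i++)
            k = tab[i].k;
        return k;
    }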
diff --git a/third_party/gmp/mpn/x86_64/nano/popcount.asm b/third_party/gmp/mpn/x86_64/nano/popcount.asm
new file mode 100644
index 0000000..fb14dd3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/nano/popcount.asm
@@ -0,0 +1,35 @@
+dnl  x86-64 mpn_popcount.
+
+dnl  Copyright 2007, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/addmul_2.asm b/third_party/gmp/mpn/x86_64/pentium4/addmul_2.asm
new file mode 100644
index 0000000..7ae6a1a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/addmul_2.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_addmul_2 optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_2)
+include_mpn(`x86_64/bd1/addmul_2.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/aors_n.asm b/third_party/gmp/mpn/x86_64/pentium4/aors_n.asm
new file mode 100644
index 0000000..8e6ee1b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/aors_n.asm
@@ -0,0 +1,196 @@
+dnl  x86-64 mpn_add_n/mpn_sub_n optimized for Pentium 4.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 2.8
+C AMD K10	 2.8
+C Intel P4	 4
+C Intel core2	 3.6-5	(fluctuating)
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cy',	`%r8')
+
+ifdef(`OPERATION_add_n', `
+	define(ADDSUB,	      add)
+	define(func,	      mpn_add_n)
+	define(func_nc,	      mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+	define(ADDSUB,	      sub)
+	define(func,	      mpn_sub_n)
+	define(func_nc,	      mpn_sub_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+ASM_START()
+	TEXT
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	xor	%r8, %r8
+IFDOS(`	jmp	L(ent)		')
+EPILOGUE()
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+L(ent):	push	%rbx
+	push	%r12
+
+	mov	(vp), %r9
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jne	L(n00)		C n = 0, 4, 8, ...
+	mov	R32(%r8), R32(%rbx)
+	mov	(up), %r8
+	mov	8(up), %r10
+	ADDSUB	%r9, %r8
+	mov	8(vp), %r9
+	setc	R8(%rax)
+	lea	-16(rp), rp
+	jmp	L(L00)
+
+L(n00):	cmp	$2, R32(%rax)
+	jnc	L(n01)		C n = 1, 5, 9, ...
+	mov	(up), %r11
+	mov	R32(%r8), R32(%rax)
+	xor	R32(%rbx), R32(%rbx)
+	dec	n
+	jnz	L(gt1)
+	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	ADDSUB	%rax, %r11
+	adc	$0, R32(%rbx)
+	mov	%r11, (rp)
+	jmp	L(ret)
+L(gt1):	mov	8(up), %r8
+	ADDSUB	%r9, %r11
+	mov	8(vp), %r9
+	setc	R8(%rbx)
+	lea	-8(rp), rp
+	lea	8(up), up
+	lea	8(vp), vp
+	jmp	L(L01)
+
+L(n01):	jne	L(n10)		C n = 2, 6, 10, ...
+	mov	(up), %r12
+	mov	R32(%r8), R32(%rbx)
+	mov	8(up), %r11
+	ADDSUB	%r9, %r12
+	mov	8(vp), %r9
+	setc	R8(%rax)
+	lea	-32(rp), rp
+	lea	16(up), up
+	lea	16(vp), vp
+	jmp	L(L10)
+
+L(n10):	mov	(up), %r10	C n = 3, 7, 11, ...
+	mov	R32(%r8), R32(%rax)
+	xor	R32(%rbx), R32(%rbx)
+	mov	8(up), %r12
+	ADDSUB	%r9, %r10
+	mov	8(vp), %r9
+	setc	R8(%rbx)
+	lea	-24(rp), rp
+	lea	-8(up), up
+	lea	-8(vp), vp
+	jmp	L(L11)
+
+L(c0):	mov	$1, R8(%rbx)
+	jmp	L(rc0)
+L(c1):	mov	$1, R8(%rax)
+	jmp	L(rc1)
+L(c2):	mov	$1, R8(%rbx)
+	jmp	L(rc2)
+L(c3):	mov	$1, R8(%rax)
+	jmp	L(rc3)
+
+	ALIGN(16)
+L(top):	mov	(up), %r8	C not on critical path
+	ADDSUB	%r9, %r11	C not on critical path
+	mov	(vp), %r9	C not on critical path
+	setc	R8(%rbx)	C save carry out
+	mov	%r12, (rp)
+L(L01):	ADDSUB	%rax, %r11	C apply previous carry out
+	jc	L(c0)		C jump if ripple
+L(rc0):	mov	8(up), %r10
+	ADDSUB	%r9, %r8
+	mov	8(vp), %r9
+	setc	R8(%rax)
+	mov	%r11, 8(rp)
+L(L00):	ADDSUB	%rbx, %r8
+	jc	L(c1)
+L(rc1):	mov	16(up), %r12
+	ADDSUB	%r9, %r10
+	mov	16(vp), %r9
+	setc	R8(%rbx)
+	mov	%r8, 16(rp)
+L(L11):	ADDSUB	%rax, %r10
+	jc	L(c2)
+L(rc2):	mov	24(up), %r11
+	ADDSUB	%r9, %r12
+	lea	32(up), up
+	mov	24(vp), %r9
+	lea	32(vp), vp
+	setc	R8(%rax)
+	mov	%r10, 24(rp)
+L(L10):	ADDSUB	%rbx, %r12
+	jc	L(c3)
+L(rc3):	lea	32(rp), rp
+	sub	$4, n
+	ja	L(top)
+
+L(end):	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	mov	%r12, (rp)
+	ADDSUB	%rax, %r11
+	jnc	L(1)
+	mov	$1, R8(%rbx)
+L(1):	mov	%r11, 8(rp)
+
+L(ret):	mov	R32(%rbx), R32(%rax)
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
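The loop above works around the Pentium 4's slow adc/sbb: each limb addition
saves its carry with setc and folds the previous carry in with a plain add,
branching to a one-instruction fix-up (L(c0)..L(c3)) only when that add itself
ripples.  For reference, this is what mpn_add_n computes, with the carry
handling spelled out in C (a semantic sketch, not the pipelined implementation):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + vp[]; returns the carry out (0 or 1). */
    uint64_t add_n_ref(uint64_t *rp, const uint64_t *up,
                       const uint64_t *vp, size_t n)
    {
        uint64_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t s = up[i] + cy;   /* cy is 0 or 1 */
            uint64_t c = s < cy;       /* carry out of the carry-in add */
            uint64_t r = s + vp[i];
            c |= r < s;                /* carry out of the limb add */
            rp[i] = r;
            cy = c;
        }
        return cy;
    }

The four feed-in paths (n mod 4) only enter the 4-way unrolled loop at the
right offset; the per-limb arithmetic is exactly the loop body above.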
diff --git a/third_party/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm b/third_party/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm
new file mode 100644
index 0000000..66937d3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/aorslsh1_n.asm
@@ -0,0 +1,50 @@
+dnl  AMD64 mpn_addlsh1_n, mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1),
+dnl  optimised for Pentium 4.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 31)			C 31, not 63, since we use 32-bit ops
+
+ifdef(`OPERATION_addlsh1_n', `
+  define(ADDSUB,	add)
+  define(func,		mpn_addlsh1_n)')
+ifdef(`OPERATION_sublsh1_n', `
+  define(ADDSUB,	sub)
+  define(func,		mpn_sublsh1_n)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+include_mpn(`x86_64/pentium4/aorslshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/aorslsh2_n.asm b/third_party/gmp/mpn/x86_64/pentium4/aorslsh2_n.asm
new file mode 100644
index 0000000..001f0ac
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/aorslsh2_n.asm
@@ -0,0 +1,50 @@
+dnl  AMD64 mpn_addlsh2_n, mpn_sublsh2_n -- rp[] = up[] +- (vp[] << 2),
+dnl  optimised for Pentium 4.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 30)			C 30, not 62, since we use 32-bit ops
+
+ifdef(`OPERATION_addlsh2_n', `
+  define(ADDSUB,	add)
+  define(func,		mpn_addlsh2_n)')
+ifdef(`OPERATION_sublsh2_n', `
+  define(ADDSUB,	sub)
+  define(func,		mpn_sublsh2_n)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n)
+include_mpn(`x86_64/pentium4/aorslshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/aorslshC_n.asm b/third_party/gmp/mpn/x86_64/pentium4/aorslshC_n.asm
new file mode 100644
index 0000000..d03c6a3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/aorslshC_n.asm
@@ -0,0 +1,203 @@
+dnl  AMD64 mpn_addlshC_n, mpn_sublshC_n -- rp[] = up[] +- (vp[] << C), where
+dnl  C is 1, 2, 3.  Optimized for Pentium 4.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+C	     cycles/limb
+C AMD K8,K9	 3.8
+C AMD K10	 3.8
+C Intel P4	 5.8
+C Intel core2	 4.75
+C Intel corei	 4.75
+C Intel atom	 ?
+C VIA nano	 4.75
+
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+define(M, eval(m4_lshift(1,LSH)))
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%r12
+	push	%rbp
+
+	mov	(vp), %r9
+	shl	$LSH, %r9
+	mov	4(vp), R32(%rbp)
+
+	xor	R32(%rbx), R32(%rbx)
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jne	L(n00)		C n = 0, 4, 8, ...
+
+	mov	(up), %r8
+	mov	8(up), %r10
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r8
+	mov	8(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	setc	R8(%rax)
+	mov	12(vp), R32(%rbp)
+	lea	-16(rp), rp
+	jmp	L(L00)
+
+L(n00):	cmp	$2, R32(%rax)
+	jnc	L(n01)		C n = 1, 5, 9, ...
+	mov	(up), %r11
+	lea	-8(rp), rp
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	dec	n
+	jz	L(1)		C jump for n = 1
+	mov	8(up), %r8
+	mov	8(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	mov	12(vp), R32(%rbp)
+	lea	8(up), up
+	lea	8(vp), vp
+	jmp	L(L01)
+
+L(n01):	jne	L(n10)		C n = 2, 6, 10, ...
+	mov	(up), %r12
+	mov	8(up), %r11
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r12
+	mov	8(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	setc	R8(%rax)
+	mov	12(vp), R32(%rbp)
+	lea	16(up), up
+	lea	16(vp), vp
+	jmp	L(L10)
+
+L(n10):	mov	(up), %r10
+	mov	8(up), %r12
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r10
+	mov	8(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	setc	R8(%rbx)
+	mov	12(vp), R32(%rbp)
+	lea	-24(rp), rp
+	lea	-8(up), up
+	lea	-8(vp), vp
+	jmp	L(L11)
+
+L(c0):	mov	$1, R8(%rbx)
+	jmp	L(rc0)
+L(c1):	mov	$1, R8(%rax)
+	jmp	L(rc1)
+L(c2):	mov	$1, R8(%rbx)
+	jmp	L(rc2)
+
+	ALIGN(16)
+L(top):	mov	(up), %r8	C not on critical path
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r11	C not on critical path
+	mov	(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	setc	R8(%rbx)	C save carry out
+	mov	4(vp), R32(%rbp)
+	mov	%r12, (rp)
+	ADDSUB	%rax, %r11	C apply previous carry out
+	jc	L(c0)		C jump if ripple
+L(rc0):
+L(L01):	mov	8(up), %r10
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r8
+	mov	8(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	setc	R8(%rax)
+	mov	12(vp), R32(%rbp)
+	mov	%r11, 8(rp)
+	ADDSUB	%rbx, %r8
+	jc	L(c1)
+L(rc1):
+L(L00):	mov	16(up), %r12
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r10
+	mov	16(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	setc	R8(%rbx)
+	mov	20(vp), R32(%rbp)
+	mov	%r8, 16(rp)
+	ADDSUB	%rax, %r10
+	jc	L(c2)
+L(rc2):
+L(L11):	mov	24(up), %r11
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r12
+	mov	24(vp), %r9
+	lea	(%rbp,%r9,M), %r9
+	lea	32(up), up
+	lea	32(vp), vp
+	setc	R8(%rax)
+	mov	-4(vp), R32(%rbp)
+	mov	%r10, 24(rp)
+	ADDSUB	%rbx, %r12
+	jc	L(c3)
+L(rc3):	lea	32(rp), rp
+L(L10):	sub	$4, n
+	ja	L(top)
+
+L(end):
+	shr	$RSH, R32(%rbp)
+	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	mov	%r12, (rp)
+	ADDSUB	%rax, %r11
+	jnc	L(1)
+	mov	$1, R8(%rbx)
+L(1):	mov	%r11, 8(rp)
+	lea	(%rbx,%rbp), R32(%rax)
+	pop	%rbp
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+L(c3):	mov	$1, R8(%rax)
+	jmp	L(rc3)
+EPILOGUE()
+ASM_END()
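This common body computes rp[] = up[] +- (vp[] << LSH) in a single pass.  The
top LSH bits of each vp limb are fetched with a 32-bit load of its upper half
(hence RSH is 31 or 30 rather than 63 or 62) and merged into the next shifted
limb with an lea.  The add variant's semantics as a hedged C sketch
(illustrative name; the real entry points are the mpn_addlshN_n wrappers):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + (vp[] << k) for k in {1,2,3}; returns the carry out,
       i.e. the bits shifted off the top of vp plus the addition carry. */
    uint64_t addlshk_n(uint64_t *rp, const uint64_t *up,
                       const uint64_t *vp, size_t n, unsigned k)
    {
        uint64_t sout = 0, cy = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t v = (vp[i] << k) | sout;   /* shifted limb */
            sout = vp[i] >> (64 - k);           /* bits pushed out the top */
            uint64_t s = up[i] + v;
            uint64_t c = s < v;
            uint64_t r = s + cy;                /* cy is 0 or 1 */
            cy = c + (r < s);
            rp[i] = r;
        }
        return sout + cy;                       /* at most 2^k */
    }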
diff --git a/third_party/gmp/mpn/x86_64/pentium4/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/pentium4/aorsmul_1.asm
new file mode 100644
index 0000000..e5dbb34
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_addmul_1 and mpn_submul_1 optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+include_mpn(`x86_64/bd1/aorsmul_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/gmp-mparam.h b/third_party/gmp/mpn/x86_64/pentium4/gmp-mparam.h
new file mode 100644
index 0000000..9c79310
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/gmp-mparam.h
@@ -0,0 +1,257 @@
+/* Pentium 4-64 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* These routines exist for all x86_64 chips, but they are slower on Pentium4
+   than separate add/sub and shift.  Make sure they are not really used.  */
+#undef HAVE_NATIVE_mpn_rsblsh1_n
+#undef HAVE_NATIVE_mpn_rsblsh2_n
+#undef HAVE_NATIVE_mpn_addlsh_n
+#undef HAVE_NATIVE_mpn_rsblsh_n
+
+/* 3400 MHz Pentium4 Nocona / 1024 Kibyte L2 cache */
+/* FFT tuning limit = 107,095,964 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        32
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     11
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              2
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              12
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define DIV_1_VS_MUL_1_PERCENT             228
+
+#define MUL_TOOM22_THRESHOLD                12
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               130
+#define MUL_TOOM6H_THRESHOLD               173
+#define MUL_TOOM8H_THRESHOLD               430
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      89
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     112
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 18
+#define SQR_TOOM3_THRESHOLD                113
+#define SQR_TOOM4_THRESHOLD                202
+#define SQR_TOOM6_THRESHOLD                238
+#define SQR_TOOM8_THRESHOLD                430
+
+#define MULMID_TOOM42_THRESHOLD             20
+
+#define MULMOD_BNM1_THRESHOLD                9
+#define SQRMOD_BNM1_THRESHOLD               11
+
+#define MUL_FFT_MODF_THRESHOLD             236  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    236, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     19, 7}, {     10, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     13, 8}, {      7, 7}, {     17, 8}, \
+    {      9, 7}, {     21, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 9}, {      7, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27,10}, {      7, 9}, \
+    {     15, 8}, {     33, 9}, {     19, 8}, {     39, 9}, \
+    {     23, 8}, {     47, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     51,11}, {     15,10}, \
+    {     31, 9}, {     67,10}, {     39, 9}, {     83,10}, \
+    {     47, 9}, {     95,10}, {     55,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255,10}, {     71, 9}, \
+    {    143, 8}, {    287,10}, {     79,11}, {     47,10}, \
+    {     95, 9}, {    191,12}, {     31,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,11}, \
+    {     79,10}, {    159, 9}, {    319,10}, {    175,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    223,12}, \
+    {     63,11}, {    127,10}, {    255,11}, {    143,10}, \
+    {    287,11}, {    159,10}, {    319,11}, {    175,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    223,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    287,10}, {    575,12}, {    159,11}, {    351,12}, \
+    {    191,11}, {    383,12}, {    223,11}, {    447,13}, \
+    {    127,12}, {    255,11}, {    511,12}, {    287,11}, \
+    {    575,10}, {   1151,12}, {    351,13}, {    191,12}, \
+    {    415,11}, {    831,10}, {   1663,12}, {    447,14}, \
+    {    127,13}, {    255,12}, {    511,11}, {   1023,12}, \
+    {    543,11}, {   1087,10}, {   2175,12}, {    575,11}, \
+    {   1151,13}, {    319,12}, {    639,11}, {   1279,12}, \
+    {    671,11}, {   1343,12}, {    703,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,11}, {   1663,13}, \
+    {    447,14}, {    255,13}, {    511,12}, {   1023,11}, \
+    {   2047,12}, {   1087,11}, {   2175,13}, {    575,12}, \
+    {   1151,11}, {   2303,12}, {   1215,11}, {   2431,10}, \
+    {   4863,13}, {    639,12}, {   1279,11}, {   2559,12}, \
+    {   1343,13}, {    703,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1663,15}, {    255,14}, \
+    {    511,13}, {   1023,12}, {   2047,13}, {   1087,12}, \
+    {   2175,13}, {   1151,12}, {   2303,13}, {   1215,12}, \
+    {   2431,11}, {   4863,14}, {    639,13}, {   1279,12}, \
+    {   2559,13}, {   1343,12}, {   2687,13}, {   1407,12}, \
+    {   2815,13}, {   1471,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1791,12}, {   3583,13}, {   1919,12}, \
+    {   3839,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2303,12}, {   4607,13}, {   2431,12}, \
+    {   4863,14}, {   1279,13}, {   2687,14}, {   1407,13}, \
+    {   2815,15}, {    767,14}, {   1791,13}, {   3583,14}, \
+    {   1919,13}, {   3839,16}, {    511,15}, {   1023,14}, \
+    {   2175,13}, {   4351,14}, {   2303,13}, {   4607,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,15}, {   1535,14}, {   3199,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4351,15}, {   2303,14}, {   4863,15}, {   2815,14}, \
+    {   5887,16}, {   1535,15}, {   3071,14}, {   6143,15}, \
+    {  32768,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 229
+#define MUL_FFT_THRESHOLD                 2752
+
+#define SQR_FFT_MODF_THRESHOLD             240  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    240, 5}, {     11, 6}, {      6, 5}, {     13, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     23, 7}, {     12, 6}, {     25, 7}, {     13, 8}, \
+    {      7, 7}, {     17, 8}, {      9, 7}, {     21, 8}, \
+    {     11, 7}, {     24, 8}, {     13, 9}, {      7, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27,10}, {      7, 9}, {     15, 8}, {     33, 9}, \
+    {     19, 8}, {     39, 9}, {     27,10}, {     15, 9}, \
+    {     39,10}, {     23, 9}, {     47,11}, {     15,10}, \
+    {     31, 9}, {     63,10}, {     39, 9}, {     79,10}, \
+    {     55,11}, {     31,10}, {     63, 9}, {    127, 8}, \
+    {    255,10}, {     71, 9}, {    143, 8}, {    287,10}, \
+    {     79,11}, {     47,10}, {     95, 9}, {    191,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    143, 9}, {    287,11}, {     79,10}, {    159, 9}, \
+    {    319,10}, {    175, 9}, {    351,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207, 9}, {    415,10}, \
+    {    223,12}, {     63,11}, {    127,10}, {    255,11}, \
+    {    143,10}, {    287,11}, {    159,10}, {    319,11}, \
+    {    175,10}, {    351,12}, {     95,11}, {    191,10}, \
+    {    383,11}, {    207,10}, {    415,11}, {    223,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    287,10}, {    575,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    767,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    511,12}, {    287,11}, {    575,10}, \
+    {   1151,12}, {    319,11}, {    639,12}, {    351,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    447,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,13}, {    319,12}, {    639,11}, \
+    {   1279,12}, {    671,11}, {   1343,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,13}, {    447,14}, \
+    {    255,13}, {    511,12}, {   1023,11}, {   2047,12}, \
+    {   1087,13}, {    575,12}, {   1151,11}, {   2303,12}, \
+    {   1215,11}, {   2431,10}, {   4863,13}, {    639,12}, \
+    {   1279,11}, {   2559,12}, {   1343,11}, {   2687,14}, \
+    {    383,13}, {    767,12}, {   1535,13}, {    831,12}, \
+    {   1663,15}, {    255,14}, {    511,13}, {   1023,12}, \
+    {   2047,13}, {   1087,12}, {   2175,13}, {   1151,12}, \
+    {   2303,13}, {   1215,12}, {   2431,11}, {   4863,14}, \
+    {    639,13}, {   1279,12}, {   2559,13}, {   1343,12}, \
+    {   2687,13}, {   1407,12}, {   2815,13}, {   1471,14}, \
+    {    767,13}, {   1663,14}, {    895,13}, {   1791,12}, \
+    {   3583,13}, {   1919,12}, {   3839,15}, {    511,14}, \
+    {   1023,13}, {   2175,14}, {   1151,13}, {   2303,12}, \
+    {   4607,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2943,15}, {    767,14}, \
+    {   1663,13}, {   3327,14}, {   1791,13}, {   3583,14}, \
+    {   1919,13}, {   3839,16}, {    511,15}, {   1023,14}, \
+    {   2175,13}, {   4351,14}, {   2303,13}, {   4607,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2815,13}, \
+    {   5631,14}, {   2943,13}, {   5887,15}, {   1535,14}, \
+    {   3327,15}, {   1791,14}, {   3839,13}, {   7679,16}, \
+    {   1023,15}, {   2047,14}, {   4351,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3071,14}, {   6143,15}, {  32768,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 235
+#define SQR_FFT_THRESHOLD                 2368
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  45
+#define MULLO_MUL_N_THRESHOLD             5397
+#define SQRLO_BASECASE_THRESHOLD             6
+#define SQRLO_DC_THRESHOLD                  46
+#define SQRLO_SQR_THRESHOLD               4658
+
+#define DC_DIV_QR_THRESHOLD                 36
+#define DC_DIVAPPR_Q_THRESHOLD              95
+#define DC_BDIV_QR_THRESHOLD                35
+#define DC_BDIV_Q_THRESHOLD                 47
+
+#define INV_MULMOD_BNM1_THRESHOLD           22
+#define INV_NEWTON_THRESHOLD               178
+#define INV_APPR_THRESHOLD                 116
+
+#define BINV_NEWTON_THRESHOLD              206
+#define REDC_1_TO_REDC_2_THRESHOLD          24
+#define REDC_2_TO_REDC_N_THRESHOLD          50
+
+#define MU_DIV_QR_THRESHOLD                979
+#define MU_DIVAPPR_Q_THRESHOLD             979
+#define MUPI_DIV_QR_THRESHOLD               97
+#define MU_BDIV_QR_THRESHOLD               762
+#define MU_BDIV_Q_THRESHOLD                942
+
+#define POWM_SEC_TABLE  7,34,114,523,1486
+
+#define GET_STR_DC_THRESHOLD                13
+#define GET_STR_PRECOMPUTE_THRESHOLD        25
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1659
+
+#define FAC_DSC_THRESHOLD                  969
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         29
+#define HGCD2_DIV1_METHOD                    3  /* 2.03% faster than 5 */
+#define HGCD_THRESHOLD                      92
+#define HGCD_APPR_THRESHOLD                 95
+#define HGCD_REDUCE_THRESHOLD             1815
+#define GCD_DC_THRESHOLD                   195
+#define GCDEXT_DC_THRESHOLD                233
+#define JACOBI_BASE_METHOD                   4  /* 17.06% faster than 1 */
+
+/* Tuneup completed successfully, took 297016 seconds */
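Each threshold above is a crossover point measured by GMP's tuneup program:
below it the simpler algorithm wins, above it the asymptotically faster one
does.  Roughly how the multiplication thresholds gate algorithm selection, as
a sketch with hypothetical helper names (the values 12 and 81 are taken from
MUL_TOOM22_THRESHOLD and MUL_TOOM33_THRESHOLD above; GMP's real dispatch also
handles unbalanced operands and higher Toom variants):

    #include <stdint.h>
    #include <stddef.h>

    void mul_basecase(uint64_t *, const uint64_t *, const uint64_t *, size_t);
    void mul_toom22(uint64_t *, const uint64_t *, const uint64_t *, size_t);
    void mul_toom33(uint64_t *, const uint64_t *, const uint64_t *, size_t);

    void mul_n(uint64_t *rp, const uint64_t *ap, const uint64_t *bp, size_t n)
    {
        if (n < 12)          /* MUL_TOOM22_THRESHOLD */
            mul_basecase(rp, ap, bp, n);   /* schoolbook, O(n^2) */
        else if (n < 81)     /* MUL_TOOM33_THRESHOLD */
            mul_toom22(rp, ap, bp, n);     /* Karatsuba */
        else
            mul_toom33(rp, ap, bp, n);     /* Toom-3, and so on upward */
    }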
diff --git a/third_party/gmp/mpn/x86_64/pentium4/lshift.asm b/third_party/gmp/mpn/x86_64/pentium4/lshift.asm
new file mode 100644
index 0000000..4037be4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/lshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshift optimised for Pentium 4.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/lshiftc.asm b/third_party/gmp/mpn/x86_64/pentium4/lshiftc.asm
new file mode 100644
index 0000000..52856c1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshiftc optimised for Pentium 4.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/mod_34lsub1.asm b/third_party/gmp/mpn/x86_64/pentium4/mod_34lsub1.asm
new file mode 100644
index 0000000..f34b3f0
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/mod_34lsub1.asm
@@ -0,0 +1,167 @@
+dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 1.0
+C AMD K10	 1.12
+C Intel P4	 3.25
+C Intel core2	 1.5
+C Intel corei	 1.5
+C Intel atom	 2.5
+C VIA nano	 1.75
+
+
+C INPUT PARAMETERS
+define(`ap',	%rdi)
+define(`n',	%rsi)
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
+C    sbb to placate Pentium4.
+C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
+C    without the dual loop exits.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	FUNC_ENTRY(2)
+
+	mov	$0x0000FFFFFFFFFFFF, %r11
+
+	sub	$2, %rsi
+	ja	L(gt2)
+
+	mov	(ap), %rax
+	nop
+	jb	L(1)
+
+	mov	8(ap), %rsi
+	mov	%rax, %rdx
+	shr	$48, %rax		C src[0] high
+
+	and	%r11, %rdx		C src[0] low
+	add	%rdx, %rax
+	mov	R32(%rsi), R32(%rdx)
+
+	shr	$32, %rsi		C src[1] high
+	add	%rsi, %rax
+
+	shl	$16, %rdx		C src[1] low
+	add	%rdx, %rax
+
+L(1):	FUNC_EXIT()
+	ret
+
+
+	ALIGN(16)
+L(gt2):	xor	R32(%rax), R32(%rax)
+	xor	R32(%rcx), R32(%rcx)
+	xor	R32(%rdx), R32(%rdx)
+	xor	%r8, %r8
+	xor	%r9, %r9
+	xor	%r10, %r10
+
+L(top):	add	(ap), %rax
+	adc	$0, %r10
+	add	8(ap), %rcx
+	adc	$0, %r8
+	add	16(ap), %rdx
+	adc	$0, %r9
+
+	sub	$3, %rsi
+	jng	L(end)
+
+	add	24(ap), %rax
+	adc	$0, %r10
+	add	32(ap), %rcx
+	adc	$0, %r8
+	add	40(ap), %rdx
+	lea	48(ap), ap
+	adc	$0, %r9
+
+	sub	$3, %rsi
+	jg	L(top)
+
+
+	add	$-24, ap
+L(end):	add	%r9, %rax
+	adc	%r10, %rcx
+	adc	%r8, %rdx
+
+	inc	%rsi
+	mov	$0x1, R32(%r10)
+	js	L(combine)
+
+	mov	$0x10000, R32(%r10)
+	adc	24(ap), %rax
+	dec	%rsi
+	js	L(combine)
+
+	adc	32(ap), %rcx
+	mov	$0x100000000, %r10
+
+L(combine):
+	sbb	%rsi, %rsi		C carry
+	mov	%rax, %rdi		C 0mod3
+	shr	$48, %rax		C 0mod3 high
+
+	and	%r10, %rsi		C carry masked
+	and	%r11, %rdi		C 0mod3 low
+	mov	R32(%rcx), R32(%r10)	C 1mod3
+
+	add	%rsi, %rax		C apply carry
+	shr	$32, %rcx		C 1mod3 high
+
+	add	%rdi, %rax		C apply 0mod3 low
+	movzwl	%dx, R32(%rdi)		C 2mod3
+	shl	$16, %r10		C 1mod3 low
+
+	add	%rcx, %rax		C apply 1mod3 high
+	shr	$16, %rdx		C 2mod3 high
+
+	add	%r10, %rax		C apply 1mod3 low
+	shl	$32, %rdi		C 2mod3 low
+
+	add	%rdx, %rax		C apply 2mod3 high
+	add	%rdi, %rax		C apply 2mod3 low
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
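The function folds its input modulo 2^48-1; 48 is 3/4 of the 64-bit limb size,
hence the name.  Since 2^48 = 1 and therefore 2^64 = 2^16 (mod 2^48-1), limb i
carries weight 2^(16*i mod 48), which repeats with period 3; the three
accumulator sets in the unrolled loop above (0mod3, 1mod3, 2mod3) exploit
exactly that.  A portable sketch of the idea (not GMP's code; assumes a
compiler with unsigned __int128 and a modest n):

    #include <stdint.h>
    #include <stddef.h>

    /* Return a value congruent to {up,n} mod 2^48-1. */
    uint64_t mod48m1_residue(const uint64_t *up, size_t n)
    {
        unsigned __int128 acc = 0;
        unsigned shift = 0;                  /* 16*i mod 48: 0, 16, 32, ... */
        for (size_t i = 0; i < n; i++) {
            acc += (unsigned __int128) up[i] << shift;
            shift = (shift + 16) % 48;
        }
        uint64_t r = 0;
        for (; acc != 0; acc >>= 48)         /* fold 48-bit chunks together */
            r += (uint64_t) acc & 0xFFFFFFFFFFFFULL;
        while (r >> 48)
            r = (r & 0xFFFFFFFFFFFFULL) + (r >> 48);
        return r;
    }

Like the assembly, this returns a value congruent to the input rather than a
canonical remainder: 2^48-1 itself may come back instead of 0.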
diff --git a/third_party/gmp/mpn/x86_64/pentium4/mul_1.asm b/third_party/gmp/mpn/x86_64/pentium4/mul_1.asm
new file mode 100644
index 0000000..70de670
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/mul_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mul_1 optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c)
+include_mpn(`x86_64/bd1/mul_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/mul_2.asm b/third_party/gmp/mpn/x86_64/pentium4/mul_2.asm
new file mode 100644
index 0000000..a0f7302
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/mul_2.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mul_2 optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_2)
+include_mpn(`x86_64/bd1/mul_2.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/mul_basecase.asm b/third_party/gmp/mpn/x86_64/pentium4/mul_basecase.asm
new file mode 100644
index 0000000..fb16029
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/mul_basecase.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mul_basecase optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86_64/core2/mul_basecase.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/pentium4/mullo_basecase.asm
new file mode 100644
index 0000000..b9e08a8
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/mullo_basecase.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mullo_basecase optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mullo_basecase)
+include_mpn(`x86_64/core2/mullo_basecase.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/popcount.asm b/third_party/gmp/mpn/x86_64/pentium4/popcount.asm
new file mode 100644
index 0000000..7014b39
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/popcount.asm
@@ -0,0 +1,35 @@
+dnl  x86-64 mpn_popcount optimized for Pentium 4.
+
+dnl  Copyright 2007 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/redc_1.asm b/third_party/gmp/mpn/x86_64/pentium4/redc_1.asm
new file mode 100644
index 0000000..00e380d
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/redc_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_redc_1 optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_redc_1)
+include_mpn(`x86_64/bt1/redc_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/pentium4/rsh1aors_n.asm b/third_party/gmp/mpn/x86_64/pentium4/rsh1aors_n.asm
new file mode 100644
index 0000000..5528ce4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/rsh1aors_n.asm
@@ -0,0 +1,334 @@
+dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 4.13
+C AMD K10	 4.13
+C Intel P4	 5.70
+C Intel core2	 4.75
+C Intel corei	 5
+C Intel atom	 8.75
+C VIA nano	 5.25
+
+C TODO
+C  * Try to make this smaller; 746 bytes seems excessive for this second-class
+C    function.  Less software pipelining would help, and since we now probably
+C    pipeline somewhat too deeply, it might not affect performance much.
+C  * A separate small-n loop might speed things as well as make things smaller.
+C    That loop should be selected before pushing registers.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cy',	`%r8')
+
+ifdef(`OPERATION_rsh1add_n', `
+	define(ADDSUB,	      add)
+	define(func,	      mpn_rsh1add_n)
+	define(func_nc,	      mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+	define(ADDSUB,	      sub)
+	define(func,	      mpn_rsh1sub_n)
+	define(func_nc,	      mpn_rsh1sub_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ASM_START()
+	TEXT
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	xor	%r8, %r8
+IFDOS(`	jmp	L(ent)		')
+EPILOGUE()
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+L(ent):	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(vp), %r9
+	mov	(up), %r15
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jne	L(n00)
+
+	mov	R32(%r8), R32(%rbx)	C n = 0, 4, 8, ...
+	mov	8(up), %r10
+	ADDSUB	%r9, %r15
+	mov	8(vp), %r9
+	setc	R8(%rax)
+	ADDSUB	%rbx, %r15		C return bit
+	jnc	1f
+	mov	$1, R8(%rax)
+1:	mov	16(up), %r12
+	ADDSUB	%r9, %r10
+	mov	16(vp), %r9
+	setc	R8(%rbx)
+	mov	%r15, %r13
+	ADDSUB	%rax, %r10
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	mov	24(up), %r11
+	ADDSUB	%r9, %r12
+	lea	32(up), up
+	mov	24(vp), %r9
+	lea	32(vp), vp
+	setc	R8(%rax)
+	mov	%r10, %r14
+	shl	$63, %r10
+	shr	%r13
+	jmp	L(L00)
+
+L(n00):	cmp	$2, R32(%rax)
+	jnc	L(n01)
+	xor	R32(%rbx), R32(%rbx)	C n = 1, 5, 9, ...
+	lea	-24(rp), rp
+	mov	R32(%r8), R32(%rax)
+	dec	n
+	jnz	L(gt1)
+	ADDSUB	%r9, %r15
+	setc	R8(%rbx)
+	ADDSUB	%rax, %r15
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	mov	%r15, %r14
+	shl	$63, %rbx
+	shr	%r14
+	jmp	L(cj1)
+L(gt1):	mov	8(up), %r8
+	ADDSUB	%r9, %r15
+	mov	8(vp), %r9
+	setc	R8(%rbx)
+	ADDSUB	%rax, %r15
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	mov	16(up), %r10
+	ADDSUB	%r9, %r8
+	mov	16(vp), %r9
+	setc	R8(%rax)
+	mov	%r15, %r14
+	ADDSUB	%rbx, %r8
+	jnc	1f
+	mov	$1, R8(%rax)
+1:	mov	24(up), %r12
+	ADDSUB	%r9, %r10
+	mov	24(vp), %r9
+	setc	R8(%rbx)
+	mov	%r8, %r13
+	shl	$63, %r8
+	shr	%r14
+	lea	8(up), up
+	lea	8(vp), vp
+	jmp	L(L01)
+
+L(n01):	jne	L(n10)
+	lea	-16(rp), rp		C n = 2, 6, 10, ...
+	mov	R32(%r8), R32(%rbx)
+	mov	8(up), %r11
+	ADDSUB	%r9, %r15
+	mov	8(vp), %r9
+	setc	R8(%rax)
+	ADDSUB	%rbx, %r15
+	jnc	1f
+	mov	$1, R8(%rax)
+1:	sub	$2, n
+	jnz	L(gt2)
+	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	mov	%r15, %r13
+	ADDSUB	%rax, %r11
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	mov	%r11, %r14
+	shl	$63, %r11
+	shr	%r13
+	jmp	L(cj2)
+L(gt2):	mov	16(up), %r8
+	ADDSUB	%r9, %r11
+	mov	16(vp), %r9
+	setc	R8(%rbx)
+	mov	%r15, %r13
+	ADDSUB	%rax, %r11
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	mov	24(up), %r10
+	ADDSUB	%r9, %r8
+	mov	24(vp), %r9
+	setc	R8(%rax)
+	mov	%r11, %r14
+	shl	$63, %r11
+	shr	%r13
+	lea	16(up), up
+	lea	16(vp), vp
+	jmp	L(L10)
+
+L(n10):	xor	R32(%rbx), R32(%rbx)	C n = 3, 7, 11, ...
+	lea	-8(rp), rp
+	mov	R32(%r8), R32(%rax)
+	mov	8(up), %r12
+	ADDSUB	%r9, %r15
+	mov	8(vp), %r9
+	setc	R8(%rbx)
+	ADDSUB	%rax, %r15
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	mov	16(up), %r11
+	ADDSUB	%r9, %r12
+	mov	16(vp), %r9
+	setc	R8(%rax)
+	mov	%r15, %r14
+	ADDSUB	%rbx, %r12
+	jnc	1f
+	mov	$1, R8(%rax)
+1:	sub	$3, n
+	jnz	L(gt3)
+	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	mov	%r12, %r13
+	shl	$63, %r12
+	shr	%r14
+	jmp	L(cj3)
+L(gt3):	mov	24(up), %r8
+	ADDSUB	%r9, %r11
+	mov	24(vp), %r9
+	setc	R8(%rbx)
+	mov	%r12, %r13
+	shl	$63, %r12
+	shr	%r14
+	lea	24(up), up
+	lea	24(vp), vp
+	jmp	L(L11)
+
+L(c0):	mov	$1, R8(%rbx)
+	jmp	L(rc0)
+L(c1):	mov	$1, R8(%rax)
+	jmp	L(rc1)
+L(c2):	mov	$1, R8(%rbx)
+	jmp	L(rc2)
+
+	ALIGN(16)
+L(top):	mov	(up), %r8	C not on critical path
+	or	%r13, %r10
+	ADDSUB	%r9, %r11	C not on critical path
+	mov	(vp), %r9	C not on critical path
+	setc	R8(%rbx)	C save carry out
+	mov	%r12, %r13	C new for later
+	shl	$63, %r12	C shift new right
+	shr	%r14		C shift old left
+	mov	%r10, (rp)
+L(L11):	ADDSUB	%rax, %r11	C apply previous carry out
+	jc	L(c0)		C jump if ripple
+L(rc0):	mov	8(up), %r10
+	or	%r14, %r12
+	ADDSUB	%r9, %r8
+	mov	8(vp), %r9
+	setc	R8(%rax)
+	mov	%r11, %r14
+	shl	$63, %r11
+	shr	%r13
+	mov	%r12, 8(rp)
+L(L10):	ADDSUB	%rbx, %r8
+	jc	L(c1)
+L(rc1):	mov	16(up), %r12
+	or	%r13, %r11
+	ADDSUB	%r9, %r10
+	mov	16(vp), %r9
+	setc	R8(%rbx)
+	mov	%r8, %r13
+	shl	$63, %r8
+	shr	%r14
+	mov	%r11, 16(rp)
+L(L01):	ADDSUB	%rax, %r10
+	jc	L(c2)
+L(rc2):	mov	24(up), %r11
+	or	%r14, %r8
+	ADDSUB	%r9, %r12
+	lea	32(up), up
+	mov	24(vp), %r9
+	lea	32(vp), vp
+	setc	R8(%rax)
+	mov	%r10, %r14
+	shl	$63, %r10
+	shr	%r13
+	mov	%r8, 24(rp)
+	lea	32(rp), rp
+L(L00):	ADDSUB	%rbx, %r12
+	jc	L(c3)
+L(rc3):	sub	$4, n
+	ja	L(top)
+
+L(end):	or	%r13, %r10
+	ADDSUB	%r9, %r11
+	setc	R8(%rbx)
+	mov	%r12, %r13
+	shl	$63, %r12
+	shr	%r14
+	mov	%r10, (rp)
+L(cj3):	ADDSUB	%rax, %r11
+	jnc	1f
+	mov	$1, R8(%rbx)
+1:	or	%r14, %r12
+	mov	%r11, %r14
+	shl	$63, %r11
+	shr	%r13
+	mov	%r12, 8(rp)
+L(cj2):	or	%r13, %r11
+	shl	$63, %rbx
+	shr	%r14
+	mov	%r11, 16(rp)
+L(cj1):	or	%r14, %rbx
+	mov	%rbx, 24(rp)
+
+	mov	R32(%r15), R32(%rax)
+	and	$1, R32(%rax)
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+L(c3):	mov	$1, R8(%rax)
+	jmp	L(rc3)
+EPILOGUE()
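mpn_rsh1add_n and mpn_rsh1sub_n compute rp[] = (up[] +- vp[]) >> 1 and return
the bit shifted out at the bottom.  The assembly fuses the add/sub, the one-bit
shift (the shl $63 / shr pairs) and the carry bookkeeping into one deeply
pipelined pass, which is what the size TODO above is about.  Semantics only,
as a two-pass C sketch (hypothetical name):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = (up[] + vp[]) >> 1 for n >= 1; returns the bit shifted out. */
    uint64_t rsh1add_ref(uint64_t *rp, const uint64_t *up,
                         const uint64_t *vp, size_t n)
    {
        uint64_t cy = 0;                    /* becomes bit 64*n of the sum */
        for (size_t i = 0; i < n; i++) {
            uint64_t s = up[i] + cy;
            uint64_t c = s < cy;
            uint64_t t = s + vp[i];
            c |= t < s;
            rp[i] = t;
            cy = c;
        }
        uint64_t ret = rp[0] & 1;
        for (size_t i = 0; i + 1 < n; i++)  /* shift the n+1 limb sum right */
            rp[i] = (rp[i] >> 1) | (rp[i + 1] << 63);
        rp[n - 1] = (rp[n - 1] >> 1) | (cy << 63);
        return ret;
    }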
diff --git a/third_party/gmp/mpn/x86_64/pentium4/rshift.asm b/third_party/gmp/mpn/x86_64/pentium4/rshift.asm
new file mode 100644
index 0000000..b7c1ee2
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/rshift.asm
@@ -0,0 +1,169 @@
+dnl  x86-64 mpn_rshift optimized for Pentium 4.
+
+dnl  Copyright 2003, 2005, 2007, 2008, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 2.5
+C AMD K10	 ?
+C Intel P4	 3.29
+C Intel core2	 2.1 (fluctuates, presumably cache related)
+C Intel corei	 ?
+C Intel atom	14.3
+C VIA nano	 ?
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+define(`cnt',`%cl')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_rshift)
+	FUNC_ENTRY(4)
+	mov	(up), %rax
+	movd	R32(%rcx), %mm4
+	neg	R32(%rcx)			C put lsh count in cl
+	and	$63, R32(%rcx)
+	movd	R32(%rcx), %mm5
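+C  mm4 holds cnt (for psrlq); mm5 holds 64-cnt (for psllq of the next limb)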
+
+	lea	-8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	lea	1(n), R32(%r8)
+	neg	n
+
+	shl	R8(%rcx), %rax		C function return value
+
+	and	$3, R32(%r8)
+	je	L(rol)			C jump for n = 3, 7, 11, ...
+
+	dec	R32(%r8)
+	jne	L(1)
+C	n = 4, 8, 12, ...
+	movq	8(up,n,8), %mm2
+	psrlq	%mm4, %mm2
+	movq	16(up,n,8), %mm0
+	psllq	%mm5, %mm0
+	por	%mm0, %mm2
+	movq	%mm2, 8(rp,n,8)
+	inc	n
+	jmp	L(rol)
+
+L(1):	dec	R32(%r8)
+	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
+C	n = 2, 6, 10, 14, ...
+	movq	8(up,n,8), %mm2
+	psrlq	%mm4, %mm2
+	movq	16(up,n,8), %mm0
+	psllq	%mm5, %mm0
+	por	%mm0, %mm2
+	movq	%mm2, 8(rp,n,8)
+	inc	n
+L(1x):
+	cmp	$-1, n
+	je	L(ast)
+	movq	8(up,n,8), %mm2
+	psrlq	%mm4, %mm2
+	movq	16(up,n,8), %mm3
+	psrlq	%mm4, %mm3
+	movq	16(up,n,8), %mm0
+	movq	24(up,n,8), %mm1
+	psllq	%mm5, %mm0
+	por	%mm0, %mm2
+	psllq	%mm5, %mm1
+	por	%mm1, %mm3
+	movq	%mm2, 8(rp,n,8)
+	movq	%mm3, 16(rp,n,8)
+	add	$2, n
+
+L(rol):	movq	8(up,n,8), %mm2
+	psrlq	%mm4, %mm2
+	movq	16(up,n,8), %mm3
+	psrlq	%mm4, %mm3
+
+	add	$4, n			C				      4
+	jb	L(end)			C				      2
+	ALIGN(32)
+L(top):
+	C finish stuff from lsh block
+	movq	-16(up,n,8), %mm0
+	movq	-8(up,n,8), %mm1
+	psllq	%mm5, %mm0
+	por	%mm0, %mm2
+	psllq	%mm5, %mm1
+	movq	(up,n,8), %mm0
+	por	%mm1, %mm3
+	movq	8(up,n,8), %mm1
+	movq	%mm2, -24(rp,n,8)
+	movq	%mm3, -16(rp,n,8)
+	C start two new rsh
+	psllq	%mm5, %mm0
+	psllq	%mm5, %mm1
+
+	C finish stuff from rsh block
+	movq	-8(up,n,8), %mm2
+	movq	(up,n,8), %mm3
+	psrlq	%mm4, %mm2
+	por	%mm2, %mm0
+	psrlq	%mm4, %mm3
+	movq	8(up,n,8), %mm2
+	por	%mm3, %mm1
+	movq	16(up,n,8), %mm3
+	movq	%mm0, -8(rp,n,8)
+	movq	%mm1, (rp,n,8)
+	C start two new lsh
+	add	$4, n
+	psrlq	%mm4, %mm2
+	psrlq	%mm4, %mm3
+
+	jae	L(top)			C				      2
+L(end):
+	movq	-8(up), %mm0
+	psllq	%mm5, %mm0
+	por	%mm0, %mm2
+	movq	(up), %mm1
+	psllq	%mm5, %mm1
+	por	%mm1, %mm3
+	movq	%mm2, -16(rp)
+	movq	%mm3, -8(rp)
+
+L(ast):	movq	(up), %mm2
+	psrlq	%mm4, %mm2
+	movq	%mm2, (rp)
+	emms
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/pentium4/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/pentium4/sec_tabselect.asm
new file mode 100644
index 0000000..e436034
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/sec_tabselect.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sec_tabselect.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sec_tabselect)
+include_mpn(`x86_64/fastsse/sec_tabselect.asm')
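+
+C  For reference: this file is a thin wrapper.  MULFUNC_PROLOGUE lists the
+C  mpn entry points the file provides, and include_mpn pulls in a shared
+C  implementation that suits this microarchitecture; several of the files
+C  below follow the same pattern.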
diff --git a/third_party/gmp/mpn/x86_64/pentium4/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/pentium4/sqr_basecase.asm
new file mode 100644
index 0000000..9725287
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/pentium4/sqr_basecase.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sqr_basecase optimised for Intel Nocona.
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86_64/core2/sqr_basecase.asm')
diff --git a/third_party/gmp/mpn/x86_64/popham.asm b/third_party/gmp/mpn/x86_64/popham.asm
new file mode 100644
index 0000000..3a29b2e
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/popham.asm
@@ -0,0 +1,163 @@
+dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
+
+dnl  Copyright 2004, 2005, 2007, 2010-2012, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C		     popcount	      hamdist
+C		    cycles/limb	    cycles/limb
+C AMD K8,K9		 6		 7
+C AMD K10		 6		 7
+C Intel P4		12		14.3
+C Intel core2		 7		 8
+C Intel corei		 ?		 7.3
+C Intel atom		16.5		17.5
+C VIA nano		 8.75		10.4
+
+C TODO
+C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
+C    hamdist for K8/K9.
+
+
+ifdef(`OPERATION_popcount',`
+  define(`func',`mpn_popcount')
+  define(`up',		`%rdi')
+  define(`n',		`%rsi')
+  define(`h55555555',	`%r10')
+  define(`h33333333',	`%r11')
+  define(`h0f0f0f0f',	`%rcx')
+  define(`h01010101',	`%rdx')
+  define(`POP',		`$1')
+  define(`HAM',		`dnl')
+')
+ifdef(`OPERATION_hamdist',`
+  define(`func',`mpn_hamdist')
+  define(`up',		`%rdi')
+  define(`vp',		`%rsi')
+  define(`n',		`%rdx')
+  define(`h55555555',	`%r10')
+  define(`h33333333',	`%r11')
+  define(`h0f0f0f0f',	`%rcx')
+  define(`h01010101',	`%r12')
+  define(`POP',		`dnl')
+  define(`HAM',		`$1')
+')
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+ POP(`	FUNC_ENTRY(2)		')
+ HAM(`	FUNC_ENTRY(3)		')
+	push	%rbx
+	mov	$0x5555555555555555, h55555555
+	push	%rbp
+	mov	$0x3333333333333333, h33333333
+ HAM(`	push	%r12		')
+	lea	(up,n,8), up
+	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
+ HAM(`	lea	(vp,n,8), vp	')
+	neg	n
+	mov	$0x0101010101010101, h01010101
+	xor	R32(%rax), R32(%rax)
+	test	$1, R8(n)
+	jz	L(top)
+
+	mov	(up,n,8), %r8
+ HAM(`	xor	(vp,n,8), %r8	')
+
+	mov	%r8, %r9
+	shr	%r8
+	and	h55555555, %r8
+	sub	%r8, %r9
+
+	mov	%r9, %r8
+	shr	$2, %r9
+	and	h33333333, %r8
+	and	h33333333, %r9
+	add	%r8, %r9		C 16 4-bit fields (0..4)
+
+	dec	n
+	jmp	L(mid)
+
+	ALIGN(16)
+L(top):	mov	(up,n,8), %r8
+	mov	8(up,n,8), %rbx
+ HAM(`	xor	(vp,n,8), %r8	')
+ HAM(`	xor	8(vp,n,8), %rbx	')
+
+	mov	%r8, %r9
+	mov	%rbx, %rbp
+	shr	%r8
+	shr	%rbx
+	and	h55555555, %r8
+	and	h55555555, %rbx
+	sub	%r8, %r9
+	sub	%rbx, %rbp
+
+	mov	%r9, %r8
+	mov	%rbp, %rbx
+	shr	$2, %r9
+	shr	$2, %rbp
+	and	h33333333, %r8
+	and	h33333333, %r9
+	and	h33333333, %rbx
+	and	h33333333, %rbp
+	add	%r8, %r9		C 16 4-bit fields (0..4)
+	add	%rbx, %rbp		C 16 4-bit fields (0..4)
+
+	add	%rbp, %r9		C 16 4-bit fields (0..8)
+L(mid):	mov	%r9, %r8
+	shr	$4, %r9
+	and	h0f0f0f0f, %r8
+	and	h0f0f0f0f, %r9
+	add	%r8, %r9		C 8 8-bit fields (0..16)
+
+	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
+	shr	$56, %r9
+
+	add	%r9, %rax		C add to total
+	add	$2, n
+	jnc	L(top)
+
+L(end):
+ HAM(`	pop	%r12		')
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
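+
+C  For reference (illustrative C, assuming <stdint.h>; not used by the
+C  build): the limb reduction above is the classic SWAR popcount.
+C
+C    uint64_t popcount_limb (uint64_t x)    /* for hamdist, x = u ^ v */
+C    {
+C      x -= (x >> 1) & 0x5555555555555555ULL;         /* 32 2-bit fields */
+C      x = (x & 0x3333333333333333ULL)
+C          + ((x >> 2) & 0x3333333333333333ULL);      /* 16 4-bit fields */
+C      x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;    /* 8 8-bit fields  */
+C      return (x * 0x0101010101010101ULL) >> 56;      /* sum in top byte */
+C    }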
diff --git a/third_party/gmp/mpn/x86_64/rsh1aors_n.asm b/third_party/gmp/mpn/x86_64/rsh1aors_n.asm
new file mode 100644
index 0000000..a3e9cc5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/rsh1aors_n.asm
@@ -0,0 +1,189 @@
+dnl  AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
+dnl  AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1
+
+dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.14	(mpn_add_n + mpn_rshift need 4.125)
+C AMD K10	 2.14	(mpn_add_n + mpn_rshift need 4.125)
+C Intel P4	12.75
+C Intel core2	 3.75
+C Intel NHM	 4.4
+C Intel SBR	 ?
+C Intel atom	 ?
+C VIA nano	 3.25
+
+C TODO
+C  * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n',  `%rcx')
+
+ifdef(`OPERATION_rsh1add_n', `
+	define(ADDSUB,	      add)
+	define(ADCSBB,	      adc)
+	define(func_n,	      mpn_rsh1add_n)
+	define(func_nc,	      mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+	define(ADDSUB,	      sub)
+	define(ADCSBB,	      sbb)
+	define(func_n,	      mpn_rsh1sub_n)
+	define(func_nc,	      mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func_nc)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbx
+
+	xor	R32(%rax), R32(%rax)
+	neg	%r8			C set C flag from parameter
+	mov	(up), %rbx
+	ADCSBB	(vp), %rbx
+	jmp	L(ent)
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(func_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	xor	R32(%rax), R32(%rax)
+	mov	(up), %rbx
+	ADDSUB	(vp), %rbx
+L(ent):
+	rcr	%rbx			C rotate, save acy
+	adc	R32(%rax), R32(%rax)	C return value
+
+	mov	R32(n), R32(%r11)
+	and	$3, R32(%r11)
+
+	cmp	$1, R32(%r11)
+	je	L(do)			C jump if n = 1 5 9 ...
+
+L(n1):	cmp	$2, R32(%r11)
+	jne	L(n2)			C jump unless n = 2 6 10 ...
+	add	%rbx, %rbx		C rotate carry limb, restore acy
+	mov	8(up), %r10
+	ADCSBB	8(vp), %r10
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	rcr	%r10
+	rcr	%rbx
+	mov	%rbx, -8(rp)
+	jmp	L(cj1)
+
+L(n2):	cmp	$3, R32(%r11)
+	jne	L(n3)			C jump unless n = 3 7 11 ...
+	add	%rbx, %rbx		C rotate carry limb, restore acy
+	mov	8(up), %r9
+	mov	16(up), %r10
+	ADCSBB	8(vp), %r9
+	ADCSBB	16(vp), %r10
+	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	rcr	%r10
+	rcr	%r9
+	rcr	%rbx
+	mov	%rbx, -16(rp)
+	jmp	L(cj2)
+
+L(n3):	dec	n			C come here for n = 4 8 12 ...
+	add	%rbx, %rbx		C rotate carry limb, restore acy
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	mov	24(up), %r10
+	ADCSBB	24(vp), %r10
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	rcr	%r10
+	rcr	%r9
+	rcr	%r8
+	rcr	%rbx
+	mov	%rbx, -24(rp)
+	mov	%r8, -16(rp)
+L(cj2):	mov	%r9, -8(rp)
+L(cj1):	mov	%r10, %rbx
+
+L(do):
+	shr	$2, n			C				4
+	je	L(end)			C				2
+	ALIGN(16)
+L(top):	add	%rbx, %rbx		C rotate carry limb, restore acy
+
+	mov	8(up), %r8
+	mov	16(up), %r9
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	mov	24(up), %r10
+	mov	32(up), %r11
+	ADCSBB	24(vp), %r10
+	ADCSBB	32(vp), %r11
+
+	lea	32(up), up
+	lea	32(vp), vp
+
+	rcr	%r11			C rotate, save acy
+	rcr	%r10
+	rcr	%r9
+	rcr	%r8
+
+	rcr	%rbx
+	mov	%rbx, (rp)
+	mov	%r8, 8(rp)
+	mov	%r9, 16(rp)
+	mov	%r10, 24(rp)
+	mov	%r11, %rbx
+
+	lea	32(rp), rp
+	dec	n
+	jne	L(top)
+
+L(end):	mov	%rbx, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
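+
+C  For reference (illustrative C, assuming <stdint.h>; not used by the
+C  build): the rcr chains above perform a full add followed by a one-bit
+C  right shift.  The add variant corresponds to:
+C
+C    uint64_t rsh1add_ref (uint64_t *rp, const uint64_t *up,
+C                          const uint64_t *vp, size_t n)
+C    {
+C      uint64_t prev = up[0] + vp[0];
+C      uint64_t cy = prev < up[0];           /* carry out of limb 0 */
+C      uint64_t ret = prev & 1;              /* low bit shifted out */
+C      for (size_t i = 1; i < n; i++)
+C        {
+C          uint64_t t = up[i] + vp[i];
+C          uint64_t c = t < up[i];
+C          t += cy;
+C          c += (cy != 0) & (t == 0);        /* carry-in rippled */
+C          rp[i - 1] = (prev >> 1) | (t << 63);
+C          prev = t;
+C          cy = c;
+C        }
+C      rp[n - 1] = (prev >> 1) | (cy << 63); /* final carry to top bit */
+C      return ret;
+C    }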
diff --git a/third_party/gmp/mpn/x86_64/rshift.asm b/third_party/gmp/mpn/x86_64/rshift.asm
new file mode 100644
index 0000000..3f344f1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/rshift.asm
@@ -0,0 +1,176 @@
+dnl  AMD64 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 2.375
+C AMD K10	 2.375
+C Intel P4	 8
+C Intel core2	 2.11
+C Intel corei	 ?
+C Intel atom	 5.75
+C VIA nano	 3.5
+
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`n',	`%rdx')
+define(`cnt',	`%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_rshift)
+	FUNC_ENTRY(4)
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	(up), %rax
+	shl	R8(%rcx), %rax		C function return value
+	neg	R32(%rcx)		C put rsh count in cl
+
+	lea	1(n), R32(%r8)
+
+	lea	-8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	neg	n
+
+	and	$3, R32(%r8)
+	je	L(rlx)			C jump for n = 3, 7, 11, ...
+
+	dec	R32(%r8)
+	jne	L(1)
+C	n = 4, 8, 12, ...
+	mov	8(up,n,8), %r10
+	shr	R8(%rcx), %r10
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	16(up,n,8), %r8
+	shl	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	%r10, 8(rp,n,8)
+	inc	n
+	jmp	L(rll)
+
+L(1):	dec	R32(%r8)
+	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
+C	n = 2, 6, 10, 14, ...
+	mov	8(up,n,8), %r10
+	shr	R8(%rcx), %r10
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	16(up,n,8), %r8
+	shl	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	%r10, 8(rp,n,8)
+	inc	n
+	neg	R32(%rcx)		C put rsh count in cl
+L(1x):
+	cmp	$-1, n
+	je	L(ast)
+	mov	8(up,n,8), %r10
+	shr	R8(%rcx), %r10
+	mov	16(up,n,8), %r11
+	shr	R8(%rcx), %r11
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	16(up,n,8), %r8
+	mov	24(up,n,8), %r9
+	shl	R8(%rcx), %r8
+	or	%r8, %r10
+	shl	R8(%rcx), %r9
+	or	%r9, %r11
+	mov	%r10, 8(rp,n,8)
+	mov	%r11, 16(rp,n,8)
+	add	$2, n
+
+L(rll):	neg	R32(%rcx)		C put rsh count in cl
+L(rlx):	mov	8(up,n,8), %r10
+	shr	R8(%rcx), %r10
+	mov	16(up,n,8), %r11
+	shr	R8(%rcx), %r11
+
+	add	$4, n			C				      4
+	jb	L(end)			C				      2
+	ALIGN(16)
+L(top):
+	C finish stuff from lsh block
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	-16(up,n,8), %r8
+	mov	-8(up,n,8), %r9
+	shl	R8(%rcx), %r8
+	or	%r8, %r10
+	shl	R8(%rcx), %r9
+	or	%r9, %r11
+	mov	%r10, -24(rp,n,8)
+	mov	%r11, -16(rp,n,8)
+	C start two new rsh
+	mov	(up,n,8), %r8
+	mov	8(up,n,8), %r9
+	shl	R8(%rcx), %r8
+	shl	R8(%rcx), %r9
+
+	C finish stuff from rsh block
+	neg	R32(%rcx)		C put rsh count in cl
+	mov	-8(up,n,8), %r10
+	mov	0(up,n,8), %r11
+	shr	R8(%rcx), %r10
+	or	%r10, %r8
+	shr	R8(%rcx), %r11
+	or	%r11, %r9
+	mov	%r8, -8(rp,n,8)
+	mov	%r9, 0(rp,n,8)
+	C start two new lsh
+	mov	8(up,n,8), %r10
+	mov	16(up,n,8), %r11
+	shr	R8(%rcx), %r10
+	shr	R8(%rcx), %r11
+
+	add	$4, n
+	jae	L(top)			C				      2
+L(end):
+	neg	R32(%rcx)		C put lsh count in cl
+	mov	-8(up), %r8
+	shl	R8(%rcx), %r8
+	or	%r8, %r10
+	mov	(up), %r9
+	shl	R8(%rcx), %r9
+	or	%r9, %r11
+	mov	%r10, -16(rp)
+	mov	%r11, -8(rp)
+
+	neg	R32(%rcx)		C put rsh count in cl
+L(ast):	mov	(up), %r10
+	shr	R8(%rcx), %r10
+	mov	%r10, (rp)
+	FUNC_EXIT()
+	ret
+EPILOGUE()
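+
+C  For reference: 64-bit x86 shifts mask the count in %cl to 6 bits, so each
+C  "neg R32(%rcx)" above toggles cl between cnt and 64-cnt.  Every result
+C  limb is spliced from two source limbs (illustrative, 0 < cnt < 64):
+C
+C    rp[i] = (up[i] >> cnt) | (up[i+1] << (64 - cnt));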
diff --git a/third_party/gmp/mpn/x86_64/sec_tabselect.asm b/third_party/gmp/mpn/x86_64/sec_tabselect.asm
new file mode 100644
index 0000000..e8aed26
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/sec_tabselect.asm
@@ -0,0 +1,176 @@
+dnl  AMD64 mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb          good for cpu
+C AMD K8,K9	 1.5			Y
+C AMD K10	 1.4
+C AMD bd1	 2.64
+C AMD bobcat	 2.15			Y
+C Intel P4	 4
+C Intel core2	 1.38
+C Intel NHM	 1.75
+C Intel SBR	 1.25
+C Intel atom	 2.5			Y
+C VIA nano	 1.75			Y
+
+C NOTES
+C  * This has not been tuned for any specific processor.  Its speed should not
+C    be too bad, though.
+C  * Using SSE2/AVX2 could result in many-fold speedup.
+C  * The main loop handles 4 limbs per iteration; the tail code below
+C    covers n mod 4 = 1, 2, 3 as well.
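+C  * For reference (illustrative C, assuming <stdint.h>; not used by the
+C    build): "sub $1, %rbx ; sbb %rax, %rax" in the loops below yields an
+C    all-ones mask exactly when the running index reaches `which':
+C
+C      void sec_tabselect_ref (uint64_t *rp, const uint64_t *tab,
+C                              size_t n, size_t nents, size_t which)
+C      {
+C        for (size_t i = 0; i < n; i++)
+C          rp[i] = 0;
+C        for (size_t k = 0; k < nents; k++)
+C          {
+C            uint64_t mask = -(uint64_t) (k == which);
+C            for (size_t i = 0; i < n; i++)        /* touch every entry */
+C              rp[i] |= tab[k * n + i] & mask;
+C          }
+C      }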
+
+C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
+define(`rp',     `%rdi')
+define(`tp',     `%rsi')
+define(`n',      `%rdx')
+define(`nents',  `%rcx')
+define(`which',  `%r8')
+
+define(`i',      `%rbp')
+define(`j',      `%r9')
+
+C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
+C          nents  n   rp   tab   i   which j    *    *    *    *    *    *
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	n, j
+	add	$-4, j
+	js	L(outer_end)
+
+L(outer_top):
+	mov	nents, i
+	push	tp
+	xor	R32(%r12), R32(%r12)
+	xor	R32(%r13), R32(%r13)
+	xor	R32(%r14), R32(%r14)
+	xor	R32(%r15), R32(%r15)
+	mov	which, %rbx
+
+	ALIGN(16)
+L(top):	sub	$1, %rbx
+	sbb	%rax, %rax
+	mov	0(tp), %r10
+	mov	8(tp), %r11
+	and	%rax, %r10
+	and	%rax, %r11
+	or	%r10, %r12
+	or	%r11, %r13
+	mov	16(tp), %r10
+	mov	24(tp), %r11
+	and	%rax, %r10
+	and	%rax, %r11
+	or	%r10, %r14
+	or	%r11, %r15
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(top)
+
+	mov	%r12, 0(rp)
+	mov	%r13, 8(rp)
+	mov	%r14, 16(rp)
+	mov	%r15, 24(rp)
+	pop	tp
+	lea	32(tp), tp
+	lea	32(rp), rp
+	add	$-4, j
+	jns	L(outer_top)
+L(outer_end):
+
+	test	$2, R8(n)
+	jz	L(b0x)
+L(b1x):	mov	nents, i
+	push	tp
+	xor	R32(%r12), R32(%r12)
+	xor	R32(%r13), R32(%r13)
+	mov	which, %rbx
+	ALIGN(16)
+L(tp2):	sub	$1, %rbx
+	sbb	%rax, %rax
+	mov	0(tp), %r10
+	mov	8(tp), %r11
+	and	%rax, %r10
+	and	%rax, %r11
+	or	%r10, %r12
+	or	%r11, %r13
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(tp2)
+	mov	%r12, 0(rp)
+	mov	%r13, 8(rp)
+	pop	tp
+	lea	16(tp), tp
+	lea	16(rp), rp
+
+L(b0x):	test	$1, R8(n)
+	jz	L(b00)
+L(b01):	mov	nents, i
+	xor	R32(%r12), R32(%r12)
+	mov	which, %rbx
+	ALIGN(16)
+L(tp1):	sub	$1, %rbx
+	sbb	%rax, %rax
+	mov	0(tp), %r10
+	and	%rax, %r10
+	or	%r10, %r12
+	lea	(tp,n,8), tp
+	add	$-1, i
+	jne	L(tp1)
+	mov	%r12, 0(rp)
+
+L(b00):	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm
new file mode 100644
index 0000000..98c26cf
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/aorrlsh1_n.asm
@@ -0,0 +1,50 @@
+dnl  X86-64 mpn_addlsh1_n/mpn_rsblsh1_n optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+define(RSH, 63)
+
+ifdef(`OPERATION_addlsh1_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func,	mpn_addlsh1_n)')
+ifdef(`OPERATION_rsblsh1_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func,	mpn_rsblsh1_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
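+
+C  For reference: LSH and RSH parameterize the shared aorrlshC_n.asm code,
+C  which forms vp[] << LSH limb by limb (RSH = 64 - LSH splices adjacent
+C  limbs) and then adds up[] to it or reverse-subtracts up[] from it.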
diff --git a/third_party/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm b/third_party/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm
new file mode 100644
index 0000000..2a83217
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/aorrlsh2_n.asm
@@ -0,0 +1,50 @@
+dnl  X86-64 mpn_addlsh2_n/mpn_rsblsh2_n optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+define(RSH, 62)
+
+ifdef(`OPERATION_addlsh2_n', `
+	define(ADDSUB,	add)
+	define(ADCSBB,	adc)
+	define(func,	mpn_addlsh2_n)')
+ifdef(`OPERATION_rsblsh2_n', `
+	define(ADDSUB,	sub)
+	define(ADCSBB,	sbb)
+	define(func,	mpn_rsblsh2_n)')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+include_mpn(`x86_64/aorrlshC_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/aors_n.asm b/third_party/gmp/mpn/x86_64/silvermont/aors_n.asm
new file mode 100644
index 0000000..dce3d75
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/aors_n.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+include_mpn(`x86_64/coreisbr/aors_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/silvermont/aorsmul_1.asm
new file mode 100644
index 0000000..ead0d76
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/aorsmul_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_addmul_1/mpn_submul_1 optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+include_mpn(`x86_64/core2/aorsmul_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/gmp-mparam.h b/third_party/gmp/mpn/x86_64/silvermont/gmp-mparam.h
new file mode 100644
index 0000000..f8cb0f4
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/gmp-mparam.h
@@ -0,0 +1,252 @@
+/* Intel Silvermont gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 2400 MHz Intel Atom C2758 Silvermont/Rangeley */
+/* FFT tuning limit = 468153400 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        55
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD         0  /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD        MP_SIZE_T_MAX  /* never */
+
+#define DIV_1_VS_MUL_1_PERCENT             168
+
+#define MUL_TOOM22_THRESHOLD                19
+#define MUL_TOOM33_THRESHOLD                66
+#define MUL_TOOM44_THRESHOLD               152
+#define MUL_TOOM6H_THRESHOLD               222
+#define MUL_TOOM8H_THRESHOLD               333
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     105
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     105
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     113
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD      88
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 24
+#define SQR_TOOM3_THRESHOLD                 97
+#define SQR_TOOM4_THRESHOLD                232
+#define SQR_TOOM6_THRESHOLD                286
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             24
+
+#define MULMOD_BNM1_THRESHOLD               13
+#define SQRMOD_BNM1_THRESHOLD               15
+
+#define MUL_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    340, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     17, 7}, {      9, 6}, {     20, 7}, {     11, 6}, \
+    {     23, 7}, {     17, 8}, {      9, 7}, {     21, 8}, \
+    {     11, 7}, {     23, 8}, {     13, 7}, {     27, 8}, \
+    {     15, 7}, {     31, 8}, {     21, 9}, {     11, 8}, \
+    {     27, 9}, {     15, 8}, {     33, 9}, {     19, 8}, \
+    {     39, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     47, 9}, {     95,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    127, 9}, {    255,10}, \
+    {    135,11}, {     79, 9}, {    319,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    207, 9}, {    415,11}, \
+    {    111,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,10}, {    271,11}, {    143,10}, {    287, 9}, \
+    {    575,10}, {    303,11}, {    159,10}, {    319,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    319,10}, {    639,11}, {    351,10}, \
+    {    703, 9}, {   1407,12}, {    191,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    479,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    575,10}, \
+    {   1151,12}, {    319,11}, {    639,12}, {    351,11}, \
+    {    703,10}, {   1407,13}, {    191,12}, {    415,11}, \
+    {    831,10}, {   1663,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    543,11}, {   1087,10}, {   2175,12}, \
+    {    575,11}, {   1151,13}, {    319,12}, {    639,11}, \
+    {   1279,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    831,11}, {   1663,13}, {    447,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1087,11}, {   2175,13}, \
+    {    575,12}, {   1215,11}, {   2431,10}, {   4863,13}, \
+    {    639,12}, {   1279,13}, {    703,12}, {   1407,14}, \
+    {    383,13}, {    831,12}, {   1663,13}, {    959,15}, \
+    {    255,14}, {    511,13}, {   1087,12}, {   2175,13}, \
+    {   1215,12}, {   2431,11}, {   4863,14}, {    639,13}, \
+    {   1407,12}, {   2815,13}, {   1471,12}, {   2943,11}, \
+    {   5887,14}, {    767,13}, {   1663,14}, {    895,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2559,14}, {   1407,13}, {   2943,12}, {   5887,15}, \
+    {    767,14}, {   1663,13}, {   3455,12}, {   6911,14}, \
+    {   1919,16}, {    511,15}, {   1023,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,12}, \
+    {  11775,15}, {   1535,14}, {   3455,13}, {   6911,15}, \
+    {   1791,14}, {   3839,13}, {   7679,16}, {   1023,15}, \
+    {   2047,14}, {   4223,15}, {   2303,14}, {   4863,15}, \
+    {   2815,14}, {   5887,13}, {  11775,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,17}, {   2047,16}, \
+    {   4607,15}, {   9215,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 225
+#define MUL_FFT_THRESHOLD                 3712
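+
+/* For reference (illustrative reading, not tuneup output): each {m,k} pair
+   above means "from roughly m limbs upward, use an FFT split into 2^k
+   pieces"; the table is scanned at runtime to pick k for a given size.  */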
+
+#define SQR_FFT_MODF_THRESHOLD             308  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    308, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     17, 8}, \
+    {      9, 7}, {     21, 8}, {     11, 7}, {     23, 8}, \
+    {     13, 7}, {     27, 8}, {     15, 7}, {     31, 8}, \
+    {     21, 9}, {     11, 8}, {     27, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     47,11}, {     15,10}, {     31, 9}, \
+    {     63,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     79,11}, {     47,10}, \
+    {     95,12}, {     31,11}, {     63,10}, {    127, 9}, \
+    {    255, 8}, {    511, 9}, {    271, 8}, {    543,11}, \
+    {     79,10}, {    159, 9}, {    319, 8}, {    639,10}, \
+    {    175,11}, {     95,10}, {    191, 9}, {    383,10}, \
+    {    207, 9}, {    415,12}, {     63,11}, {    127,10}, \
+    {    255, 9}, {    511,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    319, 9}, {    639,11}, {    175,10}, {    351,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    767,11}, {    415,10}, {    831,12}, \
+    {    223,11}, {    479,12}, {    255,11}, {    543,12}, \
+    {    287,11}, {    575,12}, {    319,11}, {    639,12}, \
+    {    351,11}, {    703,10}, {   1407,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    479,13}, {    255,12}, {    543,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    639,11}, {   1279,12}, {    703,11}, \
+    {   1407,13}, {    383,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1279,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    831,12}, {   1663,13}, \
+    {    959,15}, {    255,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1215,12}, {   2431,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1407,12}, {   2815,13}, \
+    {   1471,12}, {   2943,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2943,15}, \
+    {    767,14}, {   1663,13}, {   3455,12}, {   6911,14}, \
+    {   1791,13}, {   3583,16}, {    511,15}, {   1023,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2943,13}, \
+    {   5887,12}, {  11775,15}, {   1535,14}, {   3455,13}, \
+    {   6911,15}, {   1791,14}, {   3839,13}, {   7679,16}, \
+    {   1023,15}, {   2047,14}, {   4223,15}, {   2303,14}, \
+    {   4863,15}, {   2815,14}, {   5887,13}, {  11775,16}, \
+    {   1535,15}, {   3071,14}, {   6143,15}, {   3327,14}, \
+    {   6911,15}, {   3839,14}, {   7679,17}, {   1023,16}, \
+    {   2047,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,14}, {  15359,17}, {   2047,16}, {   4607,15}, \
+    {   9983,16}, {   5631,15}, {  11775,17}, {   3071,16}, \
+    {  65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+    {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 232
+#define SQR_FFT_THRESHOLD                 2752
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  55
+#define MULLO_MUL_N_THRESHOLD             6633
+#define SQRLO_BASECASE_THRESHOLD             9
+#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD               5397
+
+#define DC_DIV_QR_THRESHOLD                 33
+#define DC_DIVAPPR_Q_THRESHOLD             222
+#define DC_BDIV_QR_THRESHOLD                31
+#define DC_BDIV_Q_THRESHOLD                147
+
+#define INV_MULMOD_BNM1_THRESHOLD           37
+#define INV_NEWTON_THRESHOLD               222
+#define INV_APPR_THRESHOLD                 222
+
+#define BINV_NEWTON_THRESHOLD              212
+#define REDC_1_TO_REDC_2_THRESHOLD          55
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               81
+#define MU_BDIV_QR_THRESHOLD               942
+#define MU_BDIV_Q_THRESHOLD               1043
+
+#define POWM_SEC_TABLE  1,34,102,588,1730
+
+#define GET_STR_DC_THRESHOLD                17
+#define GET_STR_PRECOMPUTE_THRESHOLD        30
+#define SET_STR_DC_THRESHOLD               381
+#define SET_STR_PRECOMPUTE_THRESHOLD      1659
+
+#define FAC_DSC_THRESHOLD                  351
+#define FAC_ODD_THRESHOLD                   27
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    3  /* 3.06% faster than 1 */
+#define HGCD_THRESHOLD                     120
+#define HGCD_APPR_THRESHOLD                153
+#define HGCD_REDUCE_THRESHOLD             2121
+#define GCD_DC_THRESHOLD                   416
+#define GCDEXT_DC_THRESHOLD                309
+#define JACOBI_BASE_METHOD                   1  /* 2.28% faster than 3 */
+
+/* Tuneup completed successfully, took 938046 seconds */
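+
+/* For reference (illustrative, assuming gmp-impl.h's BELOW_THRESHOLD): mpn
+   code gates algorithm choice on these constants along the lines of
+
+     if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))
+       mpn_mul_basecase (prodp, up, n, vp, n);
+     else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))
+       mpn_toom22_mul (prodp, up, n, vp, n, scratch);
+
+   so the tuned values above set the crossover points directly.  */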
diff --git a/third_party/gmp/mpn/x86_64/silvermont/hamdist.asm b/third_party/gmp/mpn/x86_64/silvermont/hamdist.asm
new file mode 100644
index 0000000..848ed01
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/hamdist.asm
@@ -0,0 +1,38 @@
+dnl  x86-64 mpn_hamdist.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/coreinhm/hamdist.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/lshift.asm b/third_party/gmp/mpn/x86_64/silvermont/lshift.asm
new file mode 100644
index 0000000..acd3180
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/lshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshift optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/lshiftc.asm b/third_party/gmp/mpn/x86_64/silvermont/lshiftc.asm
new file mode 100644
index 0000000..3a68bb5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshiftc optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/mul_1.asm b/third_party/gmp/mpn/x86_64/silvermont/mul_1.asm
new file mode 100644
index 0000000..c1e1c94
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/mul_1.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mul_1 optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_1 mpn_mul_1c)
+include_mpn(`x86_64/bd1/mul_1.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/mul_basecase.asm b/third_party/gmp/mpn/x86_64/silvermont/mul_basecase.asm
new file mode 100644
index 0000000..6228c48
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/mul_basecase.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mul_basecase optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86_64/k8/mul_basecase.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/silvermont/mullo_basecase.asm
new file mode 100644
index 0000000..0244f8a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/mullo_basecase.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_mullo_basecase optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_mullo_basecase)
+include_mpn(`x86_64/k8/mullo_basecase.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/popcount.asm b/third_party/gmp/mpn/x86_64/silvermont/popcount.asm
new file mode 100644
index 0000000..73eb7b5
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/popcount.asm
@@ -0,0 +1,38 @@
+dnl  x86-64 mpn_popcount.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/coreinhm/popcount.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/rshift.asm b/third_party/gmp/mpn/x86_64/silvermont/rshift.asm
new file mode 100644
index 0000000..b84371c
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/rshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_rshift optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/silvermont/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/silvermont/sqr_basecase.asm
new file mode 100644
index 0000000..afccf93
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/silvermont/sqr_basecase.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sqr_basecase optimised for Intel Silvermont.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86_64/k8/sqr_basecase.asm')
diff --git a/third_party/gmp/mpn/x86_64/skylake/gmp-mparam.h b/third_party/gmp/mpn/x86_64/skylake/gmp-mparam.h
new file mode 100644
index 0000000..a899ea1
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/skylake/gmp-mparam.h
@@ -0,0 +1,246 @@
+/* Skylake gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3600-4000 MHz Intel Xeon E3-1270v5 Skylake */
+/* FFT tuning limit = 465,990,371 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        32
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              41
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           20
+
+#define DIV_1_VS_MUL_1_PERCENT             473
+
+#define MUL_TOOM22_THRESHOLD                26
+#define MUL_TOOM33_THRESHOLD                73
+#define MUL_TOOM44_THRESHOLD               208
+#define MUL_TOOM6H_THRESHOLD               300
+#define MUL_TOOM8H_THRESHOLD               406
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     137
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     151
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     106
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                426
+#define SQR_TOOM8_THRESHOLD                547
+
+#define MULMID_TOOM42_THRESHOLD             46
+
+#define MULMOD_BNM1_THRESHOLD               15
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    404, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     28, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 9}, {     11, 8}, {     23, 7}, {     47, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     39, 8}, {     79, 9}, {     43,10}, \
+    {     23, 9}, {     55,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     99,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {    103,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    167,11}, {     95,10}, \
+    {    191, 9}, {    383,10}, {    199,11}, {    111,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,10}, \
+    {    271, 9}, {    543,11}, {    143,10}, {    287, 9}, \
+    {    575,11}, {    159,12}, {     95,11}, {    191,10}, \
+    {    383,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,10}, \
+    {    575,11}, {    303,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,12}, {    287,11}, {    607,12}, \
+    {    319,11}, {    671,12}, {    351,11}, {    703,13}, \
+    {    191,12}, {    383,11}, {    767,12}, {    415,11}, \
+    {    831,12}, {    479,14}, {    127,13}, {    255,12}, \
+    {    543,11}, {   1087,12}, {    607,13}, {    319,12}, \
+    {    671,11}, {   1343,12}, {    703,13}, {    383,12}, \
+    {    831,13}, {    447,12}, {    959,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1151,13}, {    639,12}, \
+    {   1343,13}, {    703,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1727,13}, {    959,14}, \
+    {    511,13}, {   1087,12}, {   2175,13}, {   1151,14}, \
+    {    639,13}, {   1343,12}, {   2687,13}, {   1407,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1663,14}, \
+    {    895,13}, {   1791,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2815,15}, \
+    {    767,14}, {   1535,13}, {   3199,14}, {   1663,13}, \
+    {   3455,12}, {   6911,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2175,13}, {   4351,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,16}, {   1023,15}, {   2047,14}, \
+    {   4223,15}, {   2303,14}, {   4863,15}, {   2559,14}, \
+    {   5247,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4351,14}, {   8703,15}, \
+    {   4863,16}, {   2559,15}, {   5887,14}, {  11775,16}, \
+    {   3071,15}, {   6911,16}, {   3583,15}, {   7679,14}, \
+    {  15359,15}, {   7935,17}, {   2047,16}, {   4095,15}, \
+    {   8703,16}, {   4607,15}, {   9983,14}, {  19967,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 227
+#define MUL_FFT_THRESHOLD                 6272
+
+#define SQR_FFT_MODF_THRESHOLD             400  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    400, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     29, 7}, {     15, 6}, {     31, 7}, {     28, 8}, \
+    {     15, 7}, {     32, 8}, {     17, 7}, {     35, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 9}, {     15, 8}, \
+    {     33, 9}, {     19, 8}, {     39, 9}, {     23, 8}, \
+    {     47, 9}, {     27,10}, {     15, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     55,11}, \
+    {     31,10}, {     79,11}, {     47,10}, {     95,12}, \
+    {     31,11}, {     63,10}, {    135,11}, {     79,10}, \
+    {    159, 9}, {    319,11}, {     95,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,10}, {    319,12}, {     95,10}, {    383,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    303,10}, {    607,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,11}, {    367,10}, {    735,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    543,12}, \
+    {    287,11}, {    607,12}, {    319,11}, {    671,12}, \
+    {    351,11}, {    735,12}, {    383,11}, {    799,12}, \
+    {    415,11}, {    831,12}, {    479,14}, {    127,13}, \
+    {    255,12}, {    511,11}, {   1023,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    959,13}, {    511,12}, {   1023,13}, \
+    {    575,12}, {   1151,13}, {    639,12}, {   1279,13}, \
+    {    703,12}, {   1407,14}, {    383,13}, {    767,12}, \
+    {   1535,13}, {    831,12}, {   1727,13}, {    895,12}, \
+    {   1791,13}, {    959,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1151,14}, {    639,13}, {   1343,12}, \
+    {   2687,13}, {   1407,14}, {    767,13}, {   1599,12}, \
+    {   3199,13}, {   1663,14}, {    895,13}, {   1791,15}, \
+    {    511,14}, {   1023,13}, {   2175,14}, {   1151,13}, \
+    {   2431,12}, {   4863,14}, {   1279,13}, {   2687,14}, \
+    {   1407,15}, {    767,14}, {   1535,13}, {   3199,14}, \
+    {   1663,13}, {   3455,14}, {   1791,16}, {    511,15}, \
+    {   1023,14}, {   2431,13}, {   4863,15}, {   1279,14}, \
+    {   2943,13}, {   5887,15}, {   1535,14}, {   3455,15}, \
+    {   1791,14}, {   3839,16}, {   1023,15}, {   2047,14}, \
+    {   4223,15}, {   2303,14}, {   4863,15}, {   2559,14}, \
+    {   5119,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3071,14}, {   6143,15}, {   3327,14}, {   6911,15}, \
+    {   3839,17}, {   1023,16}, {   2047,15}, {   4863,16}, \
+    {   2559,15}, {   5887,14}, {  11775,16}, {   3071,15}, \
+    {   6911,16}, {   3583,15}, {   7679,14}, {  15359,17}, \
+    {   2047,16}, {   4095,15}, {   8191,16}, {   4607,15}, \
+    {   9983,14}, {  19967,16}, {   5631,15}, {  11775,17}, \
+    {   3071,16}, {  65536,17}, { 131072,18}, { 262144,19}, \
+    { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \
+    {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 205
+#define SQR_FFT_THRESHOLD                 4224
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  79
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                 109
+#define SQRLO_SQR_THRESHOLD               8207
+
+#define DC_DIV_QR_THRESHOLD                 55
+#define DC_DIVAPPR_Q_THRESHOLD             179
+#define DC_BDIV_QR_THRESHOLD                82
+#define DC_BDIV_Q_THRESHOLD                166
+
+#define INV_MULMOD_BNM1_THRESHOLD           50
+#define INV_NEWTON_THRESHOLD               170
+#define INV_APPR_THRESHOLD                 171
+
+#define BINV_NEWTON_THRESHOLD              294
+#define REDC_1_TO_REDC_2_THRESHOLD          33
+#define REDC_2_TO_REDC_N_THRESHOLD          59
+
+#define MU_DIV_QR_THRESHOLD               1528
+#define MU_DIVAPPR_Q_THRESHOLD            1589
+#define MUPI_DIV_QR_THRESHOLD               62
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1597
+
+#define POWM_SEC_TABLE  2,8,191,452,904
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               898
+#define SET_STR_PRECOMPUTE_THRESHOLD      1670
+
+#define FAC_DSC_THRESHOLD                  474
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         16
+#define HGCD2_DIV1_METHOD                    5  /* 3.85% faster than 3 */
+#define HGCD_THRESHOLD                      64
+#define HGCD_APPR_THRESHOLD                 60
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   618
+#define GCDEXT_DC_THRESHOLD                321
+#define JACOBI_BASE_METHOD                   1  /* 12.01% faster than 4 */
+
+/* Tuneup completed successfully, took 213784 seconds */
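
Each threshold above is an operand size in limbs at which tuneup measured the next algorithm overtaking the previous one, and each {size,k} pair in the FFT tables says "from this many limbs on, use a 2^k-way transform split". A hedged sketch of how such cutoffs are consumed (the function and string names here are illustrative, not GMP's internal entry points):

    #include <stddef.h>

    #define MUL_TOOM22_THRESHOLD 26   /* Skylake values from the table above */
    #define MUL_TOOM33_THRESHOLD 73

    /* Illustrative dispatcher: the real selection logic in GMP's
       multiply path has more tiers (Toom-4, Toom-6.5, Toom-8.5, FFT). */
    static const char *mul_algorithm_for(size_t n)
    {
        if (n < MUL_TOOM22_THRESHOLD)
            return "basecase";        /* O(n^2) schoolbook wins for small n */
        if (n < MUL_TOOM33_THRESHOLD)
            return "toom22";          /* Karatsuba-style 2-way split */
        return "toom33 or beyond";
    }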
diff --git a/third_party/gmp/mpn/x86_64/sqr_diag_addlsh1.asm b/third_party/gmp/mpn/x86_64/sqr_diag_addlsh1.asm
new file mode 100644
index 0000000..f486125
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/sqr_diag_addlsh1.asm
@@ -0,0 +1,116 @@
+dnl  AMD64 mpn_sqr_diag_addlsh1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.5
+C AMD K10	 2.5
+C AMD bull	 3.6
+C AMD pile	 3.6
+C AMD steam	 ?
+C AMD bobcat	 4
+C AMD jaguar	 ?
+C Intel P4	 11.5
+C Intel core	 4
+C Intel NHM	 3.6
+C Intel SBR	 3.15
+C Intel IBR	 3.0
+C Intel HWL	 2.6
+C Intel BWL	 ?
+C Intel atom	14
+C VIA nano	 3.5
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+define(`rp',     `%rdi')
+define(`tp',     `%rsi')
+define(`up_arg', `%rdx')
+define(`n',      `%rcx')
+
+define(`up',     `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	FUNC_ENTRY(4)
+	push	%rbx
+
+	dec	n
+	shl	n
+
+	mov	(up_arg), %rax
+
+	lea	(rp,n,8), rp
+	lea	(tp,n,8), tp
+	lea	(up_arg,n,4), up
+	neg	n
+
+	mul	%rax
+	mov	%rax, (rp,n,8)
+
+	xor	R32(%rbx), R32(%rbx)
+	jmp	L(mid)
+
+	ALIGN(16)
+L(top):	add	%r10, %r8
+	adc	%rax, %r9
+	mov	%r8, -8(rp,n,8)
+	mov	%r9, (rp,n,8)
+L(mid):	mov	8(up,n,4), %rax
+	mov	(tp,n,8), %r8
+	mov	8(tp,n,8), %r9
+	adc	%r8, %r8
+	adc	%r9, %r9
+	lea	(%rdx,%rbx), %r10
+	setc	R8(%rbx)
+	mul	%rax
+	add	$2, n
+	js	L(top)
+
+L(end):	add	%r10, %r8
+	adc	%rax, %r9
+	mov	%r8, I(-8(rp),-8(rp,n,8))
+	mov	%r9, I((rp),(rp,n,8))
+	adc	%rbx, %rdx
+	mov	%rdx, I(8(rp),8(rp,n,8))
+
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
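
This routine is the closing pass of schoolbook squaring: every off-diagonal product u[i]*u[j] (i < j) has weight 2 in the square, so the precomputed triangle tp[] is doubled and folded into the diagonal squares in a single carry chain. A hedged C reference of that computation, inferred from the routine's role in sqr_basecase (assumes n >= 2, 64-bit limbs, and the __uint128_t GCC/Clang extension):

    #include <stdint.h>
    #include <stddef.h>

    static void ref_sqr_diag_addlsh1(uint64_t *rp, const uint64_t *tp,
                                     const uint64_t *up, size_t n)
    {
        unsigned __int128 acc;
        uint64_t cy = 0;

        /* Diagonal: rp[2i], rp[2i+1] = up[i]^2. */
        for (size_t i = 0; i < n; i++) {
            acc = (unsigned __int128)up[i] * up[i];
            rp[2 * i]     = (uint64_t)acc;
            rp[2 * i + 1] = (uint64_t)(acc >> 64);
        }
        /* Fold in the doubled cross products: rp[1..2n-2] += 2*tp[0..2n-3]. */
        for (size_t i = 0; i < 2 * n - 2; i++) {
            acc = (unsigned __int128)rp[i + 1]
                  + ((unsigned __int128)tp[i] << 1) + cy;
            rp[i + 1] = (uint64_t)acc;
            cy = (uint64_t)(acc >> 64);       /* 0, 1 or 2 */
        }
        rp[2 * n - 1] += cy;   /* absorbed, since the square fits 2n limbs */
    }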
diff --git a/third_party/gmp/mpn/x86_64/sublsh1_n.asm b/third_party/gmp/mpn/x86_64/sublsh1_n.asm
new file mode 100644
index 0000000..c6d829f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/sublsh1_n.asm
@@ -0,0 +1,160 @@
+dnl  AMD64 mpn_sublsh1_n -- rp[] = up[] - (vp[] << 1)
+
+dnl  Copyright 2003, 2005-2007, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C AMD K8,K9	 2.2
+C AMD K10	 2.2
+C Intel P4	12.75
+C Intel core2	 3.45
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 3.25
+
+C Sometimes speed degenerates, supposedly because some operand
+C alignments cause cache conflicts.
+
+C The speed is limited by decoding/issue bandwidth.  There are 26 instructions
+C in the loop, which corresponds to 26/3/4 = 2.167 c/l.
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`vp',`%rdx')
+define(`n', `%rcx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sublsh1_n)
+	FUNC_ENTRY(4)
+	push	%rbx
+	push	%rbp
+
+	mov	(vp), %r8
+	mov	R32(n), R32(%rax)
+	lea	(rp,n,8), rp
+	lea	(up,n,8), up
+	lea	(vp,n,8), vp
+	neg	n
+	xor	R32(%rbp), R32(%rbp)
+	and	$3, R32(%rax)
+	je	L(b00)
+	cmp	$2, R32(%rax)
+	jc	L(b01)
+	je	L(b10)
+
+L(b11):	add	%r8, %r8
+	mov	8(vp,n,8), %r9
+	adc	%r9, %r9
+	mov	16(vp,n,8), %r10
+	adc	%r10, %r10
+	sbb	R32(%rax), R32(%rax)	C save scy
+	mov	(up,n,8), %rbp
+	mov	8(up,n,8), %rbx
+	sub	%r8, %rbp
+	sbb	%r9, %rbx
+	mov	%rbp, (rp,n,8)
+	mov	%rbx, 8(rp,n,8)
+	mov	16(up,n,8), %rbp
+	sbb	%r10, %rbp
+	mov	%rbp, 16(rp,n,8)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	add	$3, n
+	jmp	L(ent)
+
+L(b10):	add	%r8, %r8
+	mov	8(vp,n,8), %r9
+	adc	%r9, %r9
+	sbb	R32(%rax), R32(%rax)	C save scy
+	mov	(up,n,8), %rbp
+	mov	8(up,n,8), %rbx
+	sub	%r8, %rbp
+	sbb	%r9, %rbx
+	mov	%rbp, (rp,n,8)
+	mov	%rbx, 8(rp,n,8)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	add	$2, n
+	jmp	L(ent)
+
+L(b01):	add	%r8, %r8
+	sbb	R32(%rax), R32(%rax)	C save scy
+	mov	(up,n,8), %rbp
+	sub	%r8, %rbp
+	mov	%rbp, (rp,n,8)
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	inc	n
+L(ent):	jns	L(end)
+
+	ALIGN(16)
+L(top):	add	R32(%rax), R32(%rax)	C restore scy
+
+	mov	(vp,n,8), %r8
+L(b00):	adc	%r8, %r8
+	mov	8(vp,n,8), %r9
+	adc	%r9, %r9
+	mov	16(vp,n,8), %r10
+	adc	%r10, %r10
+	mov	24(vp,n,8), %r11
+	adc	%r11, %r11
+
+	sbb	R32(%rax), R32(%rax)	C save scy
+	add	R32(%rbp), R32(%rbp)	C restore acy
+
+	mov	(up,n,8), %rbp
+	mov	8(up,n,8), %rbx
+	sbb	%r8, %rbp
+	sbb	%r9, %rbx
+	mov	%rbp, (rp,n,8)
+	mov	%rbx, 8(rp,n,8)
+	mov	16(up,n,8), %rbp
+	mov	24(up,n,8), %rbx
+	sbb	%r10, %rbp
+	sbb	%r11, %rbx
+	mov	%rbp, 16(rp,n,8)
+	mov	%rbx, 24(rp,n,8)
+
+	sbb	R32(%rbp), R32(%rbp)	C save acy
+	add	$4, n
+	js	L(top)
+
+L(end):	add	R32(%rbp), R32(%rax)
+	neg	R32(%rax)
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
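
For reference, the same operation in portable C, with the return convention the wind-down above produces (scy + acy, a value in 0..2, so that up - 2*vp = rp - r*B^n). The __uint128_t type is a GCC/Clang extension, used only to make the borrow explicit:

    #include <stdint.h>
    #include <stddef.h>

    static uint64_t ref_sublsh1_n(uint64_t *rp, const uint64_t *up,
                                  const uint64_t *vp, size_t n)
    {
        uint64_t shift_cy = 0;   /* bit shifted out of vp so far (scy) */
        uint64_t borrow = 0;     /* running subtraction borrow (acy)   */
        for (size_t i = 0; i < n; i++) {
            uint64_t twice_v = (vp[i] << 1) | shift_cy;
            shift_cy = vp[i] >> 63;
            unsigned __int128 d =
                (unsigned __int128)up[i] - twice_v - borrow;
            rp[i] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;  /* high word is 0 or all-ones */
        }
        return shift_cy + borrow;
    }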
diff --git a/third_party/gmp/mpn/x86_64/x86_64-defs.m4 b/third_party/gmp/mpn/x86_64/x86_64-defs.m4
new file mode 100644
index 0000000..64e3729
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/x86_64-defs.m4
@@ -0,0 +1,475 @@
+divert(-1)
+
+dnl  m4 macros for amd64 assembler.
+
+dnl  Copyright 1999-2005, 2008, 2009, 2011-2013, 2017 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+dnl  Usage: CPUVEC_FUNCS_LIST
+dnl
+dnl  A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
+dnl  order they appear in that structure.
+
+define(CPUVEC_FUNCS_LIST,
+``add_n',
+`addlsh1_n',
+`addlsh2_n',
+`addmul_1',
+`addmul_2',
+`bdiv_dbm1c',
+`cnd_add_n',
+`cnd_sub_n',
+`com',
+`copyd',
+`copyi',
+`divexact_1',
+`divrem_1',
+`gcd_11',
+`lshift',
+`lshiftc',
+`mod_1',
+`mod_1_1p',
+`mod_1_1p_cps',
+`mod_1s_2p',
+`mod_1s_2p_cps',
+`mod_1s_4p',
+`mod_1s_4p_cps',
+`mod_34lsub1',
+`modexact_1c_odd',
+`mul_1',
+`mul_basecase',
+`mullo_basecase',
+`preinv_divrem_1',
+`preinv_mod_1',
+`redc_1',
+`redc_2',
+`rshift',
+`sqr_basecase',
+`sub_n',
+`sublsh1_n',
+`submul_1'')
+
+
+dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
+dnl
+dnl  In the amd64 code we use explicit TEXT and ALIGN() calls in the code,
+dnl  since different alignments are wanted in various circumstances.  So for
+dnl  instance,
+dnl
+dnl                  TEXT
+dnl                  ALIGN(16)
+dnl          PROLOGUE(mpn_add_n)
+dnl                  ...
+dnl          EPILOGUE()
+
+define(`PROLOGUE_cpu',
+m4_assert_numargs(1)
+`	GLOBL	$1
+	TYPE($1,`function')
+$1:
+')
+
+
+dnl  Usage: ASSERT([cond][,instructions])
+dnl
+dnl  If WANT_ASSERT is 1, output the given instructions and expect the given
+dnl  flags condition to then be satisfied.  For example,
+dnl
+dnl         ASSERT(ne, `cmpq %rax, %rbx')
+dnl
+dnl  The instructions can be omitted to just assert a flags condition with
+dnl  no extra calculation.  For example,
+dnl
+dnl         ASSERT(nc)
+dnl
+dnl  When `instructions' is not empty, a pushfq/popfq is added for
+dnl  convenience to preserve the flags, but the instructions themselves must
+dnl  preserve any registers that matter.
+dnl
+dnl  The condition can be omitted to just output the given instructions when
+dnl  assertion checking is wanted.  In this case the pushf/popf is omitted.
+dnl  For example,
+dnl
+dnl         ASSERT(, `movq %rax, VAR_KEEPVAL')
+
+define(ASSERT,
+m4_assert_numargs_range(1,2)
+m4_assert_defined(`WANT_ASSERT')
+`ifelse(WANT_ASSERT,1,
+`ifelse(`$1',,
+`	$2',
+`ifelse(`$2',,,
+`	pushfq')
+	$2
+	`j$1'	L(ASSERT_ok`'ASSERT_counter)
+	ud2	C assertion failed
+L(ASSERT_ok`'ASSERT_counter):
+ifelse(`$2',,,`	popfq')
+define(`ASSERT_counter',incr(ASSERT_counter))')')')
+
+define(ASSERT_counter,1)
+
+dnl LEA - load effective address
+dnl
+dnl FIXME: We should never create a GOT entry and therefore use the simpler 2nd
+dnl variant always. We need to understand what happens for not-yet-hidden
+dnl symbols first.
+dnl
+define(`LEA',`dnl
+ifdef(`PIC',
+	`mov	$1@GOTPCREL(%rip), $2'
+,
+	`lea	$1(%rip), $2')
+')
+
+
+define(`DEF_OBJECT',
+m4_assert_numargs_range(2,3)
+`	ifelse($#,3,`$3',`RODATA')
+	ALIGN($2)
+$1:
+')
+
+define(`END_OBJECT',
+m4_assert_numargs(1)
+`	SIZE(`$1',.-`$1')')
+
+
+define(`R32',
+	`ifelse($1,`%rax',`%eax',
+		$1,`%rbx',`%ebx',
+		$1,`%rcx',`%ecx',
+		$1,`%rdx',`%edx',
+		$1,`%rsi',`%esi',
+		$1,`%rdi',`%edi',
+		$1,`%rbp',`%ebp',
+		$1,`%r8',`%r8d',
+		$1,`%r9',`%r9d',
+		$1,`%r10',`%r10d',
+		$1,`%r11',`%r11d',
+		$1,`%r12',`%r12d',
+		$1,`%r13',`%r13d',
+		$1,`%r14',`%r14d',
+		$1,`%r15',`%r15d')')
+define(`R8',
+	`ifelse($1,`%rax',`%al',
+		$1,`%rbx',`%bl',
+		$1,`%rcx',`%cl',
+		$1,`%rdx',`%dl',
+		$1,`%rsi',`%sil',
+		$1,`%rdi',`%dil',
+		$1,`%rbp',`%bpl',
+		$1,`%r8',`%r8b',
+		$1,`%r9',`%r9b',
+		$1,`%r10',`%r10b',
+		$1,`%r11',`%r11b',
+		$1,`%r12',`%r12b',
+		$1,`%r13',`%r13b',
+		$1,`%r14',`%r14b',
+		$1,`%r15',`%r15b')')
+
+
+dnl  Usage: CALL(funcname)
+dnl
+
+define(`CALL',`dnl
+ifdef(`PIC',
+	`call	GSYM_PREFIX`'$1@PLT'
+,
+	`call	GSYM_PREFIX`'$1'
+)')
+
+define(`TCALL',`dnl
+ifdef(`PIC',
+	`jmp	GSYM_PREFIX`'$1@PLT'
+,
+	`jmp	GSYM_PREFIX`'$1'
+)')
+
+
+define(`JUMPTABSECT', `.section	.data.rel.ro.local,"a",@progbits')
+
+
+dnl  Usage: JMPENT(targlabel,tablabel)
+
+define(`JMPENT',`dnl
+ifdef(`PIC',
+	`.long	$1-$2'dnl
+,
+	`.quad	$1'dnl
+)')
+
+
+dnl  These macros are defined just for DOS64, where they provide calling
+dnl  sequence glue code.
+
+define(`FUNC_ENTRY',`')
+define(`FUNC_EXIT',`')
+
+
+dnl  Target ABI macros.
+
+define(`IFDOS',   `')
+define(`IFSTD',   `$1')
+define(`IFELF',   `$1')
+
+
+dnl  Usage: PROTECT(symbol)
+dnl
+dnl  Used for private GMP symbols that should never be overridden by users.
+dnl  This can save reloc entries and improve shlib sharing as well as
+dnl  application startup times.
+
+define(`PROTECT',  `.hidden $1')
+
+
+dnl  Usage: x86_lookup(target, key,value, key,value, ...)
+dnl
+dnl  Look for `target' among the `key' parameters.
+dnl
+dnl  x86_lookup expands to the corresponding `value', or generates an error
+dnl  if `target' isn't found.
+
+define(x86_lookup,
+m4_assert_numargs_range(1,999)
+`ifelse(eval($#<3),1,
+`m4_error(`unrecognised part of x86 instruction: $1
+')',
+`ifelse(`$1',`$2', `$3',
+`x86_lookup(`$1',shift(shift(shift($@))))')')')
+
+
+dnl  Usage: x86_opcode_regxmm(reg)
+dnl
+dnl  Validate the given xmm register, and return its number, 0 to 15.
+
+define(x86_opcode_regxmm,
+m4_assert_numargs(1)
+`x86_lookup(`$1',x86_opcode_regxmm_list)')
+
+define(x86_opcode_regxmm_list,
+``%xmm0',0,
+`%xmm1',1,
+`%xmm2',2,
+`%xmm3',3,
+`%xmm4',4,
+`%xmm5',5,
+`%xmm6',6,
+`%xmm7',7,
+`%xmm8',8,
+`%xmm9',9,
+`%xmm10',10,
+`%xmm11',11,
+`%xmm12',12,
+`%xmm13',13,
+`%xmm14',14,
+`%xmm15',15')
+
+dnl  Usage: palignr($imm,%srcreg,%dstreg)
+dnl
+dnl  Emit a palignr instruction, using a .byte sequence, since obsolete but
+dnl  still distributed versions of gas don't know SSSE3 instructions.
+
+define(`palignr',
+m4_assert_numargs(3)
+`.byte	0x66,dnl
+ifelse(eval(x86_opcode_regxmm($3) >= 8 || x86_opcode_regxmm($2) >= 8),1,
+       `eval(0x40+x86_opcode_regxmm($3)/8*4+x86_opcode_regxmm($2)/8),')dnl
+0x0f,0x3a,0x0f,dnl
+eval(0xc0+x86_opcode_regxmm($3)%8*8+x86_opcode_regxmm($2)%8),dnl
+substr($1,1)')
+
+
+dnl  Usage
+dnl
+dnl    regnum(op)   raw operand index (so slightly misnamed)
+dnl    regnumh(op)  high bit of register operand number
+dnl    ix(op)       0 for reg operand, 1 for plain pointer operand.
+dnl
+
+define(`regnum',`x86_lookup(`$1',oplist)')
+define(`regnumh',`eval(regnum($1)/8 & 1)')
+define(`ix',`eval(regnum($1)/16)')
+define(`oplist',
+``%rax',   0, `%rcx',   1, `%rdx',   2,  `%rbx',   3,
+ `%rsp',   4, `%rbp',   5, `%rsi',   6,  `%rdi',   7,
+ `%r8',    8, `%r9',    9, `%r10',  10,  `%r11',  11,
+ `%r12',  12, `%r13',  13, `%r14',  14,  `%r15',  15,
+ `(%rax)',16, `(%rcx)',17, `(%rdx)',18,  `(%rbx)',19,
+ `(%rsp)',20, `(%rbp)',21, `(%rsi)',22,  `(%rdi)',23,
+ `(%r8)', 24, `(%r9)', 25, `(%r10)',26,  `(%r11)',27,
+ `(%r12)',28, `(%r13)',29, `(%r14)',30,  `(%r15)',31')
+
+dnl  Usage (by mulx, shlx, shrx)
+dnl
+dnl     reg1,reg2,reg3,opc1,opc2
+dnl
+dnl  or
+dnl
+dnl     (reg1),reg2,reg3,opc1,opc2
+dnl
+dnl  where reg1 is any register but rsp,rbp,r12,r13,
+dnl
+dnl  or
+dnl
+dnl     off,(reg1),reg2,reg3,opc1,opc2
+dnl
+dnl  where reg1 is any register but rsp,r12.
+dnl
+dnl  The exceptions are due to special coding needed for some registers; rsp
+dnl  and r12 need an extra byte 0x24 at the end while rbp and r13 lack the
+dnl  offset-less form.
+dnl
+dnl  Other addressing forms are not handled.  Invalid forms are not properly
+dnl  detected.  Offsets that don't fit one byte are not handled correctly.
+
+define(`c4_helper',`dnl
+.byte	0xc4`'dnl
+ifelse(`$#',5,`dnl
+,eval(0xe2^32*regnumh($1)^128*regnumh($3))`'dnl
+,eval(0x$4-8*regnum($2))`'dnl
+,0x$5`'dnl
+,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($3))-0xc0*ix($1))`'dnl
+',`$#',6,`dnl
+,eval(0xe2^32*regnumh($2)^128*regnumh($4))`'dnl
+,eval(0x$5-8*regnum($3))`'dnl
+,0x$6`'dnl
+,eval(0x40+(7 & regnum($2))+8*(7 & regnum($4)))`'dnl
+,eval(($1 + 256) % 256)`'dnl
+')')
+
+
+dnl  Usage
+dnl
+dnl     mulx(reg1,reg2,reg3)
+dnl
+dnl  or
+dnl
+dnl     mulx((reg1),reg2,reg3)
+dnl
+dnl  where reg1 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl     mulx(off,(reg1),reg2,reg3)
+dnl
+dnl  where reg1 is any register but rsp,r12.
+
+define(`mulx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($1,$2,$3,fb,f6)',`dnl         format 1,2
+c4_helper($1,$2,$3,$4,fb,f6)'dnl	format 3
+)')
+
+
+dnl  Usage
+dnl
+dnl     shlx(reg1,reg2,reg3)
+dnl     shrx(reg1,reg2,reg3)
+dnl
+dnl  or
+dnl
+dnl     shlx(reg1,(reg2),reg3)
+dnl     shrx(reg1,(reg2),reg3)
+dnl
+dnl  where reg2 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl     shlx(reg1,off,(reg2),reg3)
+dnl     shrx(reg1,off,(reg2),reg3)
+dnl
+dnl  where reg2 is any register but rsp,r12.
+
+define(`shlx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($2,$1,$3,f9,f7)',`dnl         format 1,2
+c4_helper($1,$3,$2,$4,f9,f7)'dnl        format 3
+)')
+
+define(`shrx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($2,$1,$3,fb,f7)',`dnl         format 1,2
+c4_helper($1,$3,$2,$4,fb,f7)'dnl        format 3
+)')
+
+define(`sarx',`dnl
+ifelse(`$#',3,`dnl
+c4_helper($2,$1,$3,fa,f7)',`dnl         format 1,2
+c4_helper($1,$3,$2,$4,fa,f7)'dnl        format 3
+)')
+
+
+dnl  Usage
+dnl
+dnl     adcx(reg1,reg2)
+dnl     adox(reg1,reg2)
+dnl
+dnl  or
+dnl
+dnl     adcx((reg1),reg2)
+dnl     adox((reg1),reg2)
+dnl
+dnl  where reg1 is any register but rsp,rbp,r12,r13, or
+dnl
+dnl     adcx(off,(reg1),reg2)
+dnl     adox(off,(reg1),reg2)
+dnl
+dnl  where reg1 is any register but rsp,r12.
+dnl
+dnl  The exceptions are due to special coding needed for some registers; rsp
+dnl  and r12 need an extra byte 0x24 at the end while rbp and r13 lack the
+dnl  offset-less form.
+dnl
+dnl  Other addressing forms are not handled.  Invalid forms are not properly
+dnl  detected.  Offsets that don't fit one byte are not handled correctly.
+
+define(`adx_helper',`dnl
+,eval(0x48+regnumh($1)+4*regnumh($2))`'dnl
+,0x0f`'dnl
+,0x38`'dnl
+,0xf6`'dnl
+')
+
+define(`adx',`dnl
+ifelse(`$#',2,`dnl
+adx_helper($1,$2)dnl
+,eval(0xc0+(7 & regnum($1))+8*(7 & regnum($2))-0xc0*ix($1))`'dnl
+',`$#',3,`dnl
+adx_helper($2,$3)dnl
+,eval(0x40+(7 & regnum($2))+8*(7 & regnum($3)))`'dnl
+,eval(($1 + 256) % 256)`'dnl
+')')
+
+define(`adcx',`dnl
+.byte	0x66`'dnl
+adx($@)')
+
+define(`adox',`dnl
+.byte	0xf3`'dnl
+adx($@)')
+
+divert`'dnl
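
The mulx/shlx/shrx/adcx/adox macros hand-assemble their instructions as .byte sequences for the same reason palignr does: assemblers that predate BMI2/ADX reject the mnemonics. The byte arithmetic in c4_helper is easiest to check in ordinary code; here is a hedged C mirror of its five-byte register-register form for mulx (the function name and the plain 0..15 register numbering are ours, matching the oplist table):

    #include <stdint.h>
    #include <stdio.h>

    /* Emit "mulx src, lo, hi": VEX.C4, map 0F38, opcode 0xf6, pp=F2, W=1. */
    static void encode_mulx_rrr(uint8_t out[5], unsigned src,
                                unsigned lo, unsigned hi)
    {
        out[0] = 0xc4;                               /* 3-byte VEX escape   */
        out[1] = 0xe2 ^ (32 * (src >> 3))            /* ~B: high bit of rm  */
                      ^ (128 * (hi >> 3));           /* ~R: high bit of reg */
        out[2] = 0xfb - 8 * lo;                      /* W=1, vvvv=~lo, pp=F2 */
        out[3] = 0xf6;                               /* mulx opcode */
        out[4] = 0xc0 + (src & 7) + 8 * (hi & 7);    /* modrm: reg=hi, rm=src */
    }

    int main(void)
    {
        uint8_t b[5];
        encode_mulx_rrr(b, 8, 3, 0);                 /* mulx %r8, %rbx, %rax */
        for (int i = 0; i < 5; i++)
            printf("0x%02x%c", b[i], i < 4 ? ',' : '\n');
        return 0;
    }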
diff --git a/third_party/gmp/mpn/x86_64/zen/aorrlsh1_n.asm b/third_party/gmp/mpn/x86_64/zen/aorrlsh1_n.asm
new file mode 100644
index 0000000..803fa30
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/aorrlsh1_n.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_addlsh1_n, mpn_addlsh1_nc, mpn_rsblsh1_n, mpn_rsblsh1_nc.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+include_mpn(`x86_64/atom/aorrlsh1_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen/aorrlsh_n.asm b/third_party/gmp/mpn/x86_64/zen/aorrlsh_n.asm
new file mode 100644
index 0000000..e049b2f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/aorrlsh_n.asm
@@ -0,0 +1,226 @@
+dnl  AMD64 mpn_addlsh_n, mpn_rsblsh_n.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		     cycles/limb
+C AMD K8,K9		n/a
+C AMD K10		n/a
+C AMD bd1		n/a
+C AMD bd2		n/a
+C AMD bd3		n/a
+C AMD bd4		 2.31
+C AMD zen		 1.69
+C AMD bt1		n/a
+C AMD bt2		n/a
+C Intel P4		n/a
+C Intel PNR		n/a
+C Intel NHM		n/a
+C Intel SBR		n/a
+C Intel IBR		n/a
+C Intel HWL		 2.08
+C Intel BWL		 1.78
+C Intel SKL		 1.78
+C Intel atom		n/a
+C Intel SLM		n/a
+C VIA nano		n/a
+
+C TODO
+C  * The loop sustains 4 insns/cycle on zen.
+C  * Perhaps avoid using jrcxz by using dec n + jnz.
+
+define(`rp',	`%rdi')
+define(`up',	`%rsi')
+define(`vp',	`%rdx')
+define(`n',	`%rcx')
+define(`cnt',	`%r8')
+
+define(`tnc',	`%r9')
+
+ifdef(`OPERATION_addlsh_n',`
+  define(ADCSBB,       `adc')
+  define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+  define(ADCSBB,       `sbb')
+  define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	mov	(vp), %r10
+
+	mov	R32(n), R32(%rax)
+	shr	$3, n
+	xor	R32(tnc), R32(tnc)
+	sub	cnt, tnc
+	and	$7, R32(%rax)
+
+	lea	L(tab)(%rip), %r11
+ifdef(`PIC',`
+	movslq	(%r11,%rax,4), %rax
+	add	%r11, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r11,%rax,8)
+')
+
+L(0):	lea	32(up), up
+	lea	32(vp), vp
+	lea	32(rp), rp
+	xor	R32(%r11), R32(%r11)
+	jmp	L(e0)
+
+L(7):	mov	%r10, %r11
+	lea	24(up), up
+	lea	24(vp), vp
+	lea	24(rp), rp
+	xor	R32(%r10), R32(%r10)
+	jmp	L(e7)
+
+L(6):	lea	16(up), up
+	lea	16(vp), vp
+	lea	16(rp), rp
+	xor	R32(%r11), R32(%r11)
+	jmp	L(e6)
+
+L(5):	mov	%r10, %r11
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	xor	R32(%r10), R32(%r10)
+	jmp	L(e5)
+
+L(end):	ADCSBB	24(up), %rax
+	mov	%rax, -40(rp)
+	shrx(	tnc, %r11, %rax)
+	ADCSBB	n, %rax
+	FUNC_EXIT()
+	ret
+
+	ALIGN(32)
+L(top):	jrcxz	L(end)
+	mov	-32(vp), %r10
+	ADCSBB	24(up), %rax
+	lea	64(up), up
+	shrx(	tnc, %r11, %r11)
+	mov	%rax, -40(rp)
+L(e0):	dec	n
+	shlx(	cnt, %r10, %rax)
+	lea	(%r11,%rax), %rax
+	mov	-24(vp), %r11
+	ADCSBB	-32(up), %rax
+	shrx(	tnc, %r10, %r10)
+	mov	%rax, -32(rp)
+L(e7):	shlx(	cnt, %r11, %rax)
+	lea	(%r10,%rax), %rax
+	mov	-16(vp), %r10
+	ADCSBB	-24(up), %rax
+	shrx(	tnc, %r11, %r11)
+	mov	%rax, -24(rp)
+L(e6):	shlx(	cnt, %r10, %rax)
+	lea	(%r11,%rax), %rax
+	mov	-8(vp), %r11
+	ADCSBB	-16(up), %rax
+	shrx(	tnc, %r10, %r10)
+	mov	%rax, -16(rp)
+L(e5):	shlx(	cnt, %r11, %rax)
+	lea	(%r10,%rax), %rax
+	mov	(vp), %r10
+	ADCSBB	-8(up), %rax
+	shrx(	tnc, %r11, %r11)
+	mov	%rax, -8(rp)
+L(e4):	shlx(	cnt, %r10, %rax)
+	lea	(%r11,%rax), %rax
+	mov	8(vp), %r11
+	ADCSBB	(up), %rax
+	shrx(	tnc, %r10, %r10)
+	mov	%rax, (rp)
+L(e3):	shlx(	cnt, %r11, %rax)
+	lea	(%r10,%rax), %rax
+	mov	16(vp), %r10
+	ADCSBB	8(up), %rax
+	shrx(	tnc, %r11, %r11)
+	mov	%rax, 8(rp)
+L(e2):	shlx(	cnt, %r10, %rax)
+	lea	(%r11,%rax), %rax
+	mov	24(vp), %r11
+	ADCSBB	16(up), %rax
+	lea	64(vp), vp
+	shrx(	tnc, %r10, %r10)
+	mov	%rax, 16(rp)
+	lea	64(rp), rp
+L(e1):	shlx(	cnt, %r11, %rax)
+	lea	(%r10,%rax), %rax
+	jmp	L(top)
+
+L(4):	xor	R32(%r11), R32(%r11)
+	jmp	L(e4)
+
+L(3):	mov	%r10, %r11
+	lea	-8(up), up
+	lea	-8(vp), vp
+	lea	-8(rp), rp
+	xor	R32(%r10), R32(%r10)
+	jmp	L(e3)
+
+L(2):	lea	-16(up), up
+	lea	-16(vp), vp
+	lea	-16(rp), rp
+	xor	R32(%r11), R32(%r11)
+	jmp	L(e2)
+
+L(1):	mov	%r10, %r11
+	lea	-24(up), up
+	lea	40(vp), vp
+	lea	40(rp), rp
+	xor	R32(%r10), R32(%r10)
+	jmp	L(e1)
+EPILOGUE()
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(4), L(tab))
+	JMPENT(	L(5), L(tab))
+	JMPENT(	L(6), L(tab))
+	JMPENT(	L(7), L(tab))
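
A portable C reference for the add variant (mpn_rsblsh_n is the same shape with (vp << cnt) - up and a borrow out), assuming 64-bit limbs and 0 < cnt < 64:

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] = up[] + (vp[] << cnt) mod B^n; returns the carry-out, i.e. the
       top bits shifted out of vp plus the final addition carry. */
    static uint64_t ref_addlsh_n(uint64_t *rp, const uint64_t *up,
                                 const uint64_t *vp, size_t n, unsigned cnt)
    {
        uint64_t hi = 0;   /* bits of vp carried limb to limb */
        uint64_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            uint64_t v = (vp[i] << cnt) | hi;
            hi = vp[i] >> (64 - cnt);
            uint64_t s = up[i] + v;
            uint64_t c1 = s < v;        /* carry from up + v */
            rp[i] = s + cy;
            cy = c1 + (rp[i] < s);      /* at most one of the two is set */
        }
        return hi + cy;
    }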
diff --git a/third_party/gmp/mpn/x86_64/zen/aorsmul_1.asm b/third_party/gmp/mpn/x86_64/zen/aorsmul_1.asm
new file mode 100644
index 0000000..89795e3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/aorsmul_1.asm
@@ -0,0 +1,165 @@
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1 for CPUs with mulx.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 4.3
+C AMD zen	 2
+C AMD bt1	 -
+C AMD bt2	 -
+C Intel P4	 -
+C Intel PNR	 -
+C Intel NHM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 -
+C Intel SLM	 -
+C VIA nano	 -
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0_param',`%rcx')   C r9
+
+define(`n',       `%rcx')
+define(`v0',      `%rdx')
+
+ifdef(`OPERATION_addmul_1',`
+      define(`ADDSUB',        `add')
+      define(`ADCSBB',        `adc')
+      define(`func',  `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+      define(`ADDSUB',        `sub')
+      define(`ADCSBB',        `sbb')
+      define(`func',  `mpn_submul_1')
+')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+	mov	(up), %r8
+
+	push	%rbx
+	push	%r12
+	push	%r13
+
+	lea	(up,n_param,8), up
+	lea	-32(rp,n_param,8), rp
+	mov	R32(n_param), R32(%rax)
+	xchg	v0_param, v0		C FIXME: is this insn fast?
+
+	neg	n
+
+	and	$3, R8(%rax)
+	jz	L(b0)
+	cmp	$2, R8(%rax)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	mulx(	%r8, %rbx, %rax)
+	sub	$-1, n
+	jz	L(wd1)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	test	R32(%rax), R32(%rax)		C clear cy
+	jmp	L(lo1)
+
+L(b0):	mulx(	%r8, %r9, %r8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	xor	R32(%rax), R32(%rax)
+	jmp	L(lo0)
+
+L(b3):	mulx(	%r8, %r11, %r10)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x08	C mulx 8(up,n,8), %r13, %r12
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10	C mulx 16(up,n,8), %rbx, %rax
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	sub	$-3, n
+	jz	L(wd3)
+	test	R32(%rax), R32(%rax)		C clear cy
+	jmp	L(lo3)
+
+L(b2):	mulx(	%r8, %r13, %r12)
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08	C mulx 8(up,n,8), %rbx, %rax
+	add	%r12, %rbx
+	adc	$0, %rax
+	sub	$-2, n
+	jz	L(wd2)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	test	R32(%rax), R32(%rax)		C clear cy
+	jmp	L(lo2)
+
+L(top):	ADDSUB	%r9, (rp,n,8)
+L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	ADCSBB	%r11, 8(rp,n,8)
+L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	ADCSBB	%r13, 16(rp,n,8)
+L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	ADCSBB	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax		C rax = carry limb
+	add	$4, n
+	js	L(top)
+
+L(end):	ADDSUB	%r9, (rp)
+L(wd3):	ADCSBB	%r11, 8(rp)
+L(wd2):	ADCSBB	%r13, 16(rp)
+L(wd1):	ADCSBB	%rbx, 24(rp)
+	adc	n, %rax
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
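
The loop retires four limbs per iteration, with mulx keeping the multiply out of the flags so the add/sub carry chain survives. Limb by limb, the contract is the classic one; a hedged C reference (64-bit limbs, __uint128_t extension; submul_1 is identical with subtraction and a borrow limb):

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] += up[] * v0 over n limbs; returns the carry limb. */
    static uint64_t ref_addmul_1(uint64_t *rp, const uint64_t *up,
                                 size_t n, uint64_t v0)
    {
        uint64_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 t =
                (unsigned __int128)up[i] * v0 + rp[i] + cy;
            rp[i] = (uint64_t)t;
            cy = (uint64_t)(t >> 64);   /* cannot overflow: max is B^2 - 1 */
        }
        return cy;
    }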
diff --git a/third_party/gmp/mpn/x86_64/zen/com.asm b/third_party/gmp/mpn/x86_64/zen/com.asm
new file mode 100644
index 0000000..b34f841
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/com.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_com optimised for AMD Zen.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen/copyd.asm b/third_party/gmp/mpn/x86_64/zen/copyd.asm
new file mode 100644
index 0000000..63ed237
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/copyd.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyd optimised for AMD Zen.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyd)
+include_mpn(`x86_64/fastsse/copyd.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen/copyi.asm b/third_party/gmp/mpn/x86_64/zen/copyi.asm
new file mode 100644
index 0000000..1aafaaa
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/copyi.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_copyi optimised for AMD Zen.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_copyi)
+include_mpn(`x86_64/fastsse/copyi.asm')
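
copyi and copyd differ only in traversal direction, which is what makes each safe under one kind of overlap: copyi (increasing) tolerates rp <= up, copyd (decreasing) tolerates rp >= up, together covering what memmove handles in one call. A minimal C sketch of the pair (assuming 64-bit limbs):

    #include <stdint.h>
    #include <stddef.h>

    static void ref_copyi(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = 0; i < n; i++)   /* low limb first */
            rp[i] = up[i];
    }

    static void ref_copyd(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = n; i-- > 0; )    /* high limb first */
            rp[i] = up[i];
    }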
diff --git a/third_party/gmp/mpn/x86_64/zen/gcd_11.asm b/third_party/gmp/mpn/x86_64/zen/gcd_11.asm
new file mode 100644
index 0000000..0ffb6ca
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/gcd_11.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_11.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_11)
+include_mpn(`x86_64/bd2/gcd_11.asm')
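
gcd_11 computes the GCD of two single limbs, and the bd2 code selected here is a ctz-driven binary GCD. A hedged C sketch of that shape (both inputs odd is the usual precondition at this level; __builtin_ctzll is a GCC/Clang builtin):

    #include <stdint.h>

    static uint64_t ref_gcd_11(uint64_t u, uint64_t v)
    {
        while (u != v) {
            if (u > v) {
                uint64_t t = u - v;           /* even: u, v both odd */
                u = t >> __builtin_ctzll(t);  /* strip factors of two */
            } else {
                uint64_t t = v - u;
                v = t >> __builtin_ctzll(t);
            }
        }
        return u;
    }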
diff --git a/third_party/gmp/mpn/x86_64/zen/gcd_22.asm b/third_party/gmp/mpn/x86_64/zen/gcd_22.asm
new file mode 100644
index 0000000..5dfd9e3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/gcd_22.asm
@@ -0,0 +1,37 @@
+dnl  AMD64 mpn_gcd_22.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl ABI_SUPPORT(DOS64)	C returns mp_double_limb_t in memory
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_gcd_22)
+include_mpn(`x86_64/coreihwl/gcd_22.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen/gmp-mparam.h b/third_party/gmp/mpn/x86_64/zen/gmp-mparam.h
new file mode 100644
index 0000000..05a12b3
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/gmp-mparam.h
@@ -0,0 +1,280 @@
+/* AMD Zen gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3700-4300 MHz Pinnacle Ridge */
+/* FFT tuning limit = 468,514,360 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        18
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              32
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define DIV_1_VS_MUL_1_PERCENT             338
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD               107
+#define MUL_TOOM44_THRESHOLD               190
+#define MUL_TOOM6H_THRESHOLD               230
+#define MUL_TOOM8H_THRESHOLD               272
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     110
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     106
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     117
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     136
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 32
+#define SQR_TOOM3_THRESHOLD                114
+#define SQR_TOOM4_THRESHOLD                422
+#define SQR_TOOM6_THRESHOLD                  0  /* always */
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             40
+
+#define MULMOD_BNM1_THRESHOLD               12
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             540  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    540, 5}, {     22, 6}, {     12, 5}, {     25, 6}, \
+    {     25, 7}, {     13, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     25, 8}, \
+    {     13, 7}, {     29, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     43, 9}, {     11, 8}, {     29, 9}, \
+    {     15, 8}, {     35, 9}, {     19, 8}, {     43, 9}, \
+    {     23, 8}, {     49, 9}, {     27,10}, {     15, 9}, \
+    {     31, 8}, {     63, 9}, {     43,10}, {     23, 9}, \
+    {     55,11}, {     15,10}, {     31, 9}, {     67,10}, \
+    {     39, 9}, {     83,10}, {     47, 9}, {     99,10}, \
+    {     55,11}, {     31,10}, {     79,11}, {     47,10}, \
+    {    103,12}, {     31,11}, {     63,10}, {    135,11}, \
+    {     79,10}, {    167,11}, {     95,10}, {    191,12}, \
+    {     63,11}, {    159,12}, {     95,11}, {    191,13}, \
+    {     63,12}, {    127,11}, {    255,10}, {    511,11}, \
+    {    271,10}, {    543,11}, {    287,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    335,10}, {    671, 9}, \
+    {   1343,11}, {    351,12}, {    191,11}, {    383,10}, \
+    {    767,11}, {    415,10}, {    831,12}, {    223,11}, \
+    {    447,13}, {    127,12}, {    255,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    639,10}, \
+    {   1279,11}, {    671,10}, {   1343, 9}, {   2687,12}, \
+    {    351,11}, {    703,13}, {    191,12}, {    383,11}, \
+    {    767,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447,14}, {    127,13}, {    255,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    607,11}, {   1215,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    671,11}, {   1343,10}, \
+    {   2687,12}, {    703,11}, {   1407,13}, {    383,12}, \
+    {    799,11}, {   1599,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    927,11}, \
+    {   1855,12}, {    959,11}, {   1919,10}, {   3839,13}, \
+    {    511,12}, {   1087,11}, {   2175,13}, {    575,12}, \
+    {   1215,11}, {   2431,13}, {    639,12}, {   1343,11}, \
+    {   2687,13}, {    703,12}, {   1407,14}, {    383,13}, \
+    {    767,12}, {   1599,13}, {    831,12}, {   1727,11}, \
+    {   3455,13}, {    895,12}, {   1855,13}, {    959,12}, \
+    {   1919,11}, {   3839,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1215,12}, {   2431,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,12}, {   2943,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1727,12}, \
+    {   3455,14}, {    895,13}, {   1855,12}, {   3711,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2815,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,12}, \
+    {   6911,14}, {   1791,13}, {   3583,14}, {   1919,16}, \
+    {    511,15}, {   1023,14}, {   2175,13}, {   4479,14}, \
+    {   2431,13}, {   4863,15}, {   1279,14}, {   2687,13}, \
+    {   5375,14}, {   2943,13}, {   5887,15}, {   1535,14}, \
+    {   3455,13}, {   6911,15}, {   1791,14}, {   3839,13}, \
+    {   7679,16}, {   1023,15}, {   2047,14}, {   4479,15}, \
+    {   2303,14}, {   4991,15}, {   2559,14}, {   5247,15}, \
+    {   2815,14}, {   5887,16}, {   1535,15}, {   3327,14}, \
+    {   6911,15}, {   3839,14}, {   7679,17}, {   1023,16}, \
+    {   2047,15}, {   4095,14}, {   8191,15}, {   4351,14}, \
+    {   8959,15}, {   4863,16}, {   2559,15}, {   5375,14}, \
+    {  11007,15}, {   5887,14}, {  11775,16}, {   3071,15}, \
+    {   6911,16}, {   3583,15}, {   7167,14}, {  14335,15}, \
+    {   7679,14}, {  15359,15}, {   7935,14}, {  15871,17}, \
+    {   2047,16}, {   4095,15}, {   8959,16}, {   4607,15}, \
+    {   9215,14}, {  18431,15}, {   9727,14}, {  19455,15}, \
+    {   9983,14}, {  19967,16}, {   5119,15}, {  11007,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 271
+#define MUL_FFT_THRESHOLD                 6272
+
+#define SQR_FFT_MODF_THRESHOLD             404  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    404, 5}, {     13, 4}, {     27, 5}, {     21, 6}, \
+    {     11, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     14, 5}, {     29, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     25, 8}, \
+    {     13, 7}, {     29, 8}, {     15, 7}, {     33, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     29, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     43,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     67,10}, {     39, 9}, {     79,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    159,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,11}, \
+    {    143,10}, {    287, 9}, {    575,11}, {    159,12}, \
+    {     95,11}, {    191,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,10}, {    575,11}, {    303,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    335,10}, {    671, 9}, \
+    {   1343,11}, {    351,10}, {    703,11}, {    367,10}, \
+    {    735,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    399,10}, {    799,11}, {    415,10}, {    831,12}, \
+    {    223,11}, {    447,10}, {    895,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    639,10}, \
+    {   1279,11}, {    671,10}, {   1343,12}, {    351,11}, \
+    {    703,10}, {   1407,11}, {    735,10}, {   1471,13}, \
+    {    191,12}, {    383,11}, {    767,10}, {   1535,11}, \
+    {    799,12}, {    415,11}, {    831,10}, {   1663,12}, \
+    {    447,11}, {    895,14}, {    127,13}, {    255,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,12}, \
+    {    575,11}, {   1151,12}, {    607,11}, {   1215,13}, \
+    {    319,12}, {    639,11}, {   1279,12}, {    671,11}, \
+    {   1343,12}, {    703,11}, {   1407,12}, {    735,11}, \
+    {   1471,13}, {    383,12}, {    767,11}, {   1535,12}, \
+    {    799,11}, {   1599,12}, {    831,11}, {   1663,13}, \
+    {    447,12}, {    895,11}, {   1791,12}, {    959,14}, \
+    {    255,13}, {    511,12}, {   1023,11}, {   2047,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,13}, {    703,12}, \
+    {   1471,11}, {   2943,14}, {    383,13}, {    767,12}, \
+    {   1599,13}, {    831,12}, {   1727,11}, {   3455,13}, \
+    {    895,12}, {   1855,13}, {    959,15}, {    255,14}, \
+    {    511,13}, {   1023,12}, {   2047,13}, {   1087,12}, \
+    {   2175,13}, {   1215,12}, {   2431,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,12}, {   2943,14}, \
+    {    767,13}, {   1599,12}, {   3199,13}, {   1727,12}, \
+    {   3455,14}, {    895,13}, {   1855,12}, {   3711,13}, \
+    {   1919,15}, {    511,14}, {   1023,13}, {   2175,14}, \
+    {   1151,13}, {   2431,12}, {   4863,14}, {   1279,13}, \
+    {   2687,14}, {   1407,13}, {   2943,15}, {    767,14}, \
+    {   1535,13}, {   3199,14}, {   1663,13}, {   3455,12}, \
+    {   6911,14}, {   1791,13}, {   3583,14}, {   1919,16}, \
+    {    511,15}, {   1023,14}, {   2047,13}, {   4095,14}, \
+    {   2175,13}, {   4479,12}, {   8959,14}, {   2431,13}, \
+    {   4863,15}, {   1279,14}, {   2943,13}, {   5887,12}, \
+    {  11775,15}, {   1535,14}, {   3455,13}, {   6911,15}, \
+    {   1791,14}, {   3839,13}, {   7679,14}, {   3967,16}, \
+    {   1023,15}, {   2047,14}, {   4479,15}, {   2303,14}, \
+    {   4991,15}, {   2559,14}, {   5247,15}, {   2815,14}, \
+    {   5887,13}, {  11775,16}, {   1535,15}, {   3071,14}, \
+    {   6143,15}, {   3327,14}, {   6911,15}, {   3839,14}, \
+    {   7679,17}, {   1023,16}, {   2047,15}, {   4095,14}, \
+    {   8191,15}, {   4351,14}, {   8959,15}, {   4863,14}, \
+    {   9727,16}, {   2559,15}, {   5887,14}, {  11775,16}, \
+    {   3071,15}, {   6911,16}, {   3583,15}, {   7167,14}, \
+    {  14335,15}, {   7679,14}, {  15359,15}, {   7935,14}, \
+    {  15871,17}, {   2047,16}, {   4095,15}, {   8959,16}, \
+    {   4607,15}, {   9215,14}, {  18431,15}, {   9727,14}, \
+    {  19455,15}, {   9983,14}, {  19967,16}, {   5119,15}, \
+    {  10239,16}, {   5631,15}, {  11775,17}, {   3071,16}, \
+    {   6655,15}, {  13311,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 302
+#define SQR_FFT_THRESHOLD                 4224
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  69
+#define MULLO_MUL_N_THRESHOLD            11278
+#define SQRLO_BASECASE_THRESHOLD            12
+#define SQRLO_DC_THRESHOLD                  82
+#define SQRLO_SQR_THRESHOLD               8207
+
+#define DC_DIV_QR_THRESHOLD                 76
+#define DC_DIVAPPR_Q_THRESHOLD             232
+#define DC_BDIV_QR_THRESHOLD                76
+#define DC_BDIV_Q_THRESHOLD                104
+
+#define INV_MULMOD_BNM1_THRESHOLD           37
+#define INV_NEWTON_THRESHOLD               274
+#define INV_APPR_THRESHOLD                 230
+
+#define BINV_NEWTON_THRESHOLD              372
+#define REDC_1_TO_REDC_N_THRESHOLD          68
+
+#define MU_DIV_QR_THRESHOLD               1499
+#define MU_DIVAPPR_Q_THRESHOLD            1718
+#define MUPI_DIV_QR_THRESHOLD              108
+#define MU_BDIV_QR_THRESHOLD              1470
+#define MU_BDIV_Q_THRESHOLD               1787
+
+#define POWM_SEC_TABLE  3,22,81,494
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        20
+#define SET_STR_DC_THRESHOLD               486
+#define SET_STR_PRECOMPUTE_THRESHOLD      1264
+
+#define FAC_DSC_THRESHOLD                  187
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         23
+#define HGCD2_DIV1_METHOD                    1  /* 9.20% faster than 3 */
+#define HGCD_THRESHOLD                     109
+#define HGCD_APPR_THRESHOLD                104
+#define HGCD_REDUCE_THRESHOLD             3014
+#define GCD_DC_THRESHOLD                   566
+#define GCDEXT_DC_THRESHOLD                382
+#define JACOBI_BASE_METHOD                   1  /* 15.55% faster than 3 */
+
+/* Tuneup completed successfully, took 281243 seconds */
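
The MUL_FFT_TABLE3 and SQR_FFT_TABLE3 arrays above drive GMP's choice of the FFT split parameter k for a given operand size. A minimal C sketch of the lookup, modelled on mpn_fft_best_k() in mpn/generic/mul_fft.c and illustrative only: each entry's raw n is scaled by the k of the entry before it, which is why the raw n values bounce around even though the effective thresholds increase monotonically.

#include <stddef.h>

struct fft_table_nk { unsigned n; unsigned k; };

/* Scan until the first entry whose effective threshold reaches size and
   return the k that was in force just below it.  Entry i's threshold is
   tab[i].n shifted by the previous entry's k; assumes 64-bit long, as on
   x86_64.  The huge terminal entries guarantee the loop stops. */
static int fft_best_k(long size, const struct fft_table_nk *tab)
{
    int last_k = tab->k;
    for (tab++; ; tab++) {
        long thres = (long)tab->n << last_k;
        if (size <= thres)
            break;
        last_k = tab->k;
    }
    return last_k;
}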
diff --git a/third_party/gmp/mpn/x86_64/zen/hamdist.asm b/third_party/gmp/mpn/x86_64/zen/hamdist.asm
new file mode 100644
index 0000000..48dcf61
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/hamdist.asm
@@ -0,0 +1,38 @@
+dnl  AMD64 mpn_hamdist -- Hamming distance.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86_64/coreinhm/hamdist.asm')
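
The Zen build point simply reuses the Nehalem inner loop via include_mpn. The operation itself is the population count of the XOR of the two limb vectors; a minimal reference sketch of that contract (the helper name is ours, not GMP's):

#include <stdint.h>
#include <stddef.h>

/* Hamming distance of two n-limb operands: number of differing bits.
   __builtin_popcountll stands in for the popcnt instruction the real
   loop issues; n >= 1 limbs. */
static uint64_t ref_hamdist(const uint64_t *ap, const uint64_t *bp, size_t n)
{
    uint64_t cnt = 0;
    for (size_t i = 0; i < n; i++)
        cnt += (uint64_t)__builtin_popcountll(ap[i] ^ bp[i]);
    return cnt;
}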
diff --git a/third_party/gmp/mpn/x86_64/zen/lshift.asm b/third_party/gmp/mpn/x86_64/zen/lshift.asm
new file mode 100644
index 0000000..4dce319
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/lshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshift optimised for AMD Zen.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86_64/fastsse/lshift-movdqu2.asm')
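
Another thin wrapper, this time over the SSE movdqu2 left-shift loop. The contract, sketched in C below: shift the n-limb operand left by cnt bits (0 < cnt < 64), store n result limbs, and return the bits shifted out of the top. Walking from the most significant limb down is what makes an overlapping rp >= up legal; rshift.asm further down is the bottom-up mirror image.

#include <stdint.h>
#include <stddef.h>

/* Reference sketch of the limb left-shift contract (names are ours). */
static uint64_t ref_lshift(uint64_t *rp, const uint64_t *up, size_t n,
                           unsigned cnt)
{
    uint64_t high = up[n - 1];
    uint64_t ret = high >> (64 - cnt);  /* bits pushed out of the top */
    for (size_t i = n - 1; i > 0; i--) {
        uint64_t low = up[i - 1];
        rp[i] = (high << cnt) | (low >> (64 - cnt));
        high = low;
    }
    rp[0] = high << cnt;
    return ret;
}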
diff --git a/third_party/gmp/mpn/x86_64/zen/lshiftc.asm b/third_party/gmp/mpn/x86_64/zen/lshiftc.asm
new file mode 100644
index 0000000..d52b194
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/lshiftc.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_lshiftc optimised for AMD Zen.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_lshiftc)
+include_mpn(`x86_64/fastsse/lshiftc-movdqu2.asm')
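
mpn_lshiftc performs the same shift but stores each result limb one's-complemented; as we read the generic code, the returned out-shifted bits are not complemented. In terms of the ref_lshift sketch given after lshift.asm:

/* Shift left by cnt bits, then complement every stored limb.  Builds on
   the ref_lshift sketch above; same operand constraints apply. */
static uint64_t ref_lshiftc(uint64_t *rp, const uint64_t *up, size_t n,
                            unsigned cnt)
{
    uint64_t ret = ref_lshift(rp, up, n, cnt);
    for (size_t i = 0; i < n; i++)
        rp[i] = ~rp[i];
    return ret;
}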
diff --git a/third_party/gmp/mpn/x86_64/zen/mul_1.asm b/third_party/gmp/mpn/x86_64/zen/mul_1.asm
new file mode 100644
index 0000000..6a083ac
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/mul_1.asm
@@ -0,0 +1,161 @@
+dnl  AMD64 mpn_mul_1 for CPUs with mulx.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 -
+C AMD K10	 -
+C AMD bd1	 -
+C AMD bd2	 -
+C AMD bd3	 -
+C AMD bd4	 4.4
+C AMD zen	 2
+C AMD bobcat	 -
+C AMD jaguar	 -
+C Intel P4	 -
+C Intel PNR	 -
+C Intel NHM	 -
+C Intel SBR	 -
+C Intel IBR	 -
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 -
+C Intel SLM      -
+C VIA nano	 -
+
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`v0_param',`%rcx')   C r9
+
+define(`n',       `%rcx')
+define(`v0',      `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+	FUNC_ENTRY(4)
+IFDOS(` mov	56(%rsp), %r8	')
+	jmp	L(ent)
+EPILOGUE()
+	ALIGN(16)
+PROLOGUE(mpn_mul_1)
+	FUNC_ENTRY(4)
+	xor	R32(%r8), R32(%r8)	C carry-in limb
+L(ent):	mov	(up), %r9
+
+	push	%rbx
+	push	%r12
+	push	%r13
+
+	lea	(up,n_param,8), up
+	lea	-32(rp,n_param,8), rp
+	mov	R32(n_param), R32(%rax)
+	xchg	v0_param, v0		C FIXME: is this insn fast?
+
+	neg	n
+
+	and	$3, R8(%rax)
+	jz	L(b0)
+	cmp	$2, R8(%rax)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	mov	%r8, %r12
+	mulx(	%r9, %rbx, %rax)
+	sub	$-1, n
+	jz	L(wd1)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	add	%r12, %rbx
+	jmp	L(lo1)
+
+L(b3):	mulx(	%r9, %r11, %r10)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x08	C mulx 8(up,n,8), %r13, %r12
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x10	C mulx 16(up,n,8), %rbx, %rax
+	sub	$-3, n
+	jz	L(wd3)
+	add	%r8, %r11
+	jmp	L(lo3)
+
+L(b2):	mov	%r8, %r10		C carry-in limb
+	mulx(	%r9, %r13, %r12)
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x08	C mulx 8(up,n,8), %rbx, %rax
+	sub	$-2, n
+	jz	L(wd2)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	add	%r10, %r13
+	jmp	L(lo2)
+
+L(b0):	mov	%r8, %rax		C carry-in limb
+	mulx(	%r9, %r9, %r8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	add	%rax, %r9
+	jmp	L(lo0)
+
+L(top):	jrcxz	L(end)
+	adc	%r8, %r11
+	mov	%r9, (rp,n,8)
+L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r10, %r13
+	mov	%r11, 8(rp,n,8)
+L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r12, %rbx
+	mov	%r13, 16(rp,n,8)
+L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rax, %r9
+	mov	%rbx, 24(rp,n,8)
+L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	lea	4(n), n
+	jmp	L(top)
+
+L(end):	mov	%r9, (rp)
+L(wd3):	adc	%r8, %r11
+	mov	%r11, 8(rp)
+L(wd2):	adc	%r10, %r13
+	mov	%r13, 16(rp)
+L(wd1):	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%rbx, 24(rp)
+
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
+ASM_END()
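
The loop above is unrolled 4x around mulx, with the L(b0)..L(b3) blocks feeding in according to n mod 4 and mpn_mul_1c sharing the tail through an explicit carry-in limb. The contract itself is single-limb multiplication; a sketch with __int128 (GCC/Clang) standing in for mulx:

#include <stdint.h>
#include <stddef.h>

/* rp[] = up[] * v0 + cin over n limbs; the limb spilling out of the top
   is returned.  mpn_mul_1 is this with cin == 0. */
static uint64_t ref_mul_1c(uint64_t *rp, const uint64_t *up, size_t n,
                           uint64_t v0, uint64_t cin)
{
    for (size_t i = 0; i < n; i++) {
        unsigned __int128 p = (unsigned __int128)up[i] * v0 + cin;
        rp[i] = (uint64_t)p;           /* low half of the product */
        cin = (uint64_t)(p >> 64);     /* high half carries onward */
    }
    return cin;
}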
diff --git a/third_party/gmp/mpn/x86_64/zen/mul_basecase.asm b/third_party/gmp/mpn/x86_64/zen/mul_basecase.asm
new file mode 100644
index 0000000..affa3b6
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/mul_basecase.asm
@@ -0,0 +1,455 @@
+dnl  AMD64 mpn_mul_basecase optimised for AMD Zen.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Try 2x unrolling instead of current 4x, at least for mul_1.  Else consider
+C    shallower sw pipelining of mul_1/addmul_1 loops, allowing 4 or 6 instead
+C    of 8 product registers.
+C  * Split up mul_1 into 4 loops in order to fall into the addmul_1 loops
+C    without branch tree.
+C  * Improve the overlapped software pipelining.  The mulx in the osp block now
+C    suffers from write/read conflicts, in particular the 1 mod 4 case.  Also,
+C    mul_1 could osp into addmul_1.
+C  * Let vn_param be vn to save a copy.
+C  * Re-allocate to benefit more from 32-bit encoding.
+C  * Poor performance for e.g. n = 12, 16.
+
+define(`rp',       `%rdi')
+define(`up',       `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx')
+define(`vn_param', `%r8')
+
+define(`un',       `%r14')
+define(`vp',       `%rbp')
+define(`v0',       `%rdx')
+define(`n',        `%rcx')
+define(`vn',       `%r15')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	cmp	$2, un_param
+	ja	L(gen)
+	mov	(vp_param), %rdx
+	mulx(	(up), %rax, %r9)	C 0 1
+	je	L(s2x)
+
+L(s11):	mov	%rax, (rp)
+	mov	%r9, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(s2x):	cmp	$2, vn_param
+	mulx(	8,(up), %r8, %r10)	C 1 2
+	je	L(s22)
+
+L(s21):	add	%r8, %r9
+	adc	$0, %r10
+	mov	%rax, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	FUNC_EXIT()
+	ret
+
+L(s22):	add	%r8, %r9		C 1
+	adc	$0, %r10		C 2
+	mov	8(vp_param), %rdx
+	mov	%rax, (rp)
+	mulx(	(up), %r8, %r11)	C 1 2
+	mulx(	8,(up), %rax, %rdx)	C 2 3
+	add	%r11, %rax		C 2
+	adc	$0, %rdx		C 3
+	add	%r8, %r9		C 1
+	adc	%rax, %r10		C 2
+	adc	$0, %rdx		C 3
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+
+L(gen):	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	un_param, un
+	mov	vp_param, vp
+	mov	vn_param, vn
+
+	mov	(up), %r9
+	mov	(vp), v0
+
+	lea	(up,un,8), up
+	lea	-32(rp,un,8), rp
+
+	neg	un
+	mov	un, n
+	test	$1, R8(un)
+	jz	L(mx0)
+L(mx1):	test	$2, R8(un)
+	jz	L(mb3)
+
+L(mb1):	mulx(	%r9, %rbx, %rax)
+	inc	n
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08	C mulx 8(up,un,8), %r9, %r8
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10	C mulx 16(up,un,8), %r11, %r10
+	jmp	L(mlo1)
+
+L(mb3):	mulx(	%r9, %r11, %r10)
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08	C mulx 8(up,un,8), %r13, %r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %rbx, %rax
+	sub	$-3, n
+	jz	L(mwd3)
+	test	R32(%rdx), R32(%rdx)
+	jmp	L(mlo3)
+
+L(mx0):	test	$2, R8(un)
+	jz	L(mb0)
+
+L(mb2):	mulx(	%r9, %r13, %r12)
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08	C mulx 8(up,un,8), %rbx, %rax
+	lea	2(n), n
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %r9, %r8
+	jmp	L(mlo2)
+
+L(mb0):	mulx(	%r9, %r9, %r8)
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08	C mulx 8(up,un,8), %r11, %r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10	C mulx 16(up,un,8), %r13, %r12
+	jmp	L(mlo0)
+
+L(mtop):jrcxz	L(mend)
+	adc	%r8, %r11
+	mov	%r9, (rp,n,8)
+L(mlo3):.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r10, %r13
+	mov	%r11, 8(rp,n,8)
+L(mlo2):.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r12, %rbx
+	mov	%r13, 16(rp,n,8)
+L(mlo1):.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rax, %r9
+	mov	%rbx, 24(rp,n,8)
+L(mlo0):.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	lea	4(n), n
+	jmp	L(mtop)
+
+L(mend):mov	%r9, (rp)
+	adc	%r8, %r11
+L(mwd3):mov	%r11, 8(rp)
+	adc	%r10, %r13
+	mov	%r13, 16(rp)
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%rbx, 24(rp)
+	mov	%rax, 32(rp)
+	add	$8, vp
+	dec	vn
+	jz	L(end)
+
+C The rest of the file is 4 osp (overlapped software pipelining) loops around addmul_1
+
+	test	$1, R8(un)
+	jnz	L(0x1)
+
+L(0x0):	test	$2, R8(un)
+	jnz	L(oloop2_entry)
+
+L(oloop0_entry):
+	C initial feed-in block
+	mov	(vp), %rdx
+	add	$8, vp
+	mov	un, n
+	add	$8, rp
+	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6		C mulx (up,un,8), %r9, %r8
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08	C mulx 8(up,un,8), %r11, %r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10	C mulx 16(up,un,8), %r13, %r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18	C mulx 24(up,un,8), %rbx, %rax
+	add	%r8, %r11
+	jmp	L(lo0)
+
+L(oloop0):
+	C overlapped software pipelining block
+	mov	(vp), %rdx			C new
+	add	$8, vp
+	add	%r9, (rp)			C prev
+	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6		C mulx (%rsi,%r14,8),%r9,%r8
+	adc	%r11, 8(rp)			C prev
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08	C mulx 0x8(%rsi,%r14,8),%r11,%r10
+	adc	%r13, 16(rp)			C prev
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10	C mulx 0x10(%rsi,%r14,8),%r13,%r12
+	adc	%rbx, 24(rp)			C prev
+	mov	un, n
+	adc	$0, %rax			C prev
+	mov	%rax, 32(rp)			C prev
+	add	$8, rp
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x18	C mulx 0x18(%rsi,%r14,8),%rbx,%rax
+	add	%r8, %r11			C new
+	jmp	L(lo0)
+
+	ALIGN(16)
+L(tp0):	add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+L(lo0):	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(tp0)
+
+	dec	vn
+	jne	L(oloop0)
+
+	jmp	L(final_wind_down)
+
+L(oloop2_entry):
+	mov	(vp), %rdx
+	add	$8, vp
+	lea	2(un), n
+	add	$8, rp
+	.byte	0xc4,0x22,0x93,0xf6,0x24,0xf6		C mulx (up,un,8), %r13, %r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08	C mulx 8(up,un,8), %rbx, %rax
+	add	%r12, %rbx
+	adc	$0, %rax
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	add	%r13, 16(rp,n,8)
+	jmp	L(lo2)
+
+L(oloop2):
+	mov	(vp), %rdx
+	add	$8, vp
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	.byte	0xc4,0x22,0x93,0xf6,0x24,0xf6		C mulx (up,un,8), %r13, %r12
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08	C mulx 8(up,un,8), %rbx, %rax
+	lea	2(un), n
+	add	$8, rp
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %r9, %r8
+	add	%r12, %rbx
+	adc	$0, %rax
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x18	C mulx 0x18(%rsi,%r14,8),%r11,%r10
+	add	%r13, 16(rp,n,8)
+	jmp	L(lo2)
+
+	ALIGN(16)
+L(tp2):	add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+L(lo2):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(tp2)
+
+	dec	vn
+	jne	L(oloop2)
+
+	jmp	L(final_wind_down)
+
+L(0x1):	test	$2, R8(un)
+	jz	L(oloop3_entry)
+
+L(oloop1_entry):
+	mov	(vp), %rdx
+	add	$8, vp
+	lea	1(un), n
+	add	$8, rp
+	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6		C mulx (up,un,8), %rbx, %rax
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08	C mulx 8(up,un,8), %r9, %r8
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10	C mulx 16(up,un,8), %r11, %r10
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	add	%rbx, 24(rp,n,8)
+	jmp	L(lo1)
+
+L(oloop1):
+	mov	(vp), %rdx
+	add	$8, vp
+	add	%r9, (rp)
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08	C mulx 8(up,un,8), %r9, %r8
+	adc	%r11, 8(rp)
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10	C mulx 16(up,un,8), %r11, %r10
+	adc	%r13, 16(rp)
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x18	C mulx 0x18(%rsi,%r14,8),%r13,%r12
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6		C mulx (up,un,8), %rbx, %rax
+	lea	1(un), n
+	add	$8, rp
+	add	%rbx, 24(rp,n,8)
+	jmp	L(lo1)
+
+	ALIGN(16)
+L(tp1):	add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+L(lo1):	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(tp1)
+
+	dec	vn
+	jne	L(oloop1)
+
+	jmp	L(final_wind_down)
+
+L(oloop3_entry):
+	mov	(vp), %rdx
+	add	$8, vp
+	lea	3(un), n
+	add	$8, rp
+	.byte	0xc4,0x22,0xa3,0xf6,0x14,0xf6		C mulx (up,un,8), %r11, %r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08	C mulx 8(up,un,8), %r13, %r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %rbx, %rax
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	test	n, n
+	jz	L(wd3)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	add	%r11, 8(rp,n,8)
+	jmp	L(lo3)
+
+L(oloop3):
+	mov	(vp), %rdx
+	add	$8, vp
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	.byte	0xc4,0x22,0xa3,0xf6,0x14,0xf6		C mulx (up,un,8), %r11, %r10
+	adc	%r13, 16(rp)
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08	C mulx 8(up,un,8), %r13, %r12
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %rbx, %rax
+	lea	3(un), n
+	add	$8, rp
+	add	%r10, %r13
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	%r11, 8(rp,n,8)
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(tp3):	add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+L(lo3):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(tp3)
+
+	dec	vn
+	jne	L(oloop3)
+
+L(final_wind_down):
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+
+L(end):	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(3):	mov	(vp), %rdx
+	add	$8, vp
+	add	$8, rp
+	.byte	0xc4,0x22,0xa3,0xf6,0x14,0xf6		C mulx (up,un,8), %r11, %r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08	C mulx 8(up,un,8), %r13, %r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10	C mulx 16(up,un,8), %rbx, %rax
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+L(wd3):	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+	dec	vn
+	jne	L(3)
+	jmp	L(end)
+EPILOGUE()
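
Structurally the routine is schoolbook multiplication: a mul_1 pass for the first v limb, then an addmul_1 pass per remaining v limb, with the o-loops overlapping the software pipeline between consecutive passes. A plain C sketch of the same computation, reusing the ref_mul_1c sketch given after mul_1.asm (assumes rp holds un+vn limbs and does not overlap the inputs):

/* Schoolbook product: row 0 via ref_mul_1c, later rows multiply-add. */
static void ref_mul_basecase(uint64_t *rp, const uint64_t *up, size_t un,
                             const uint64_t *vp, size_t vn)
{
    rp[un] = ref_mul_1c(rp, up, un, vp[0], 0);
    for (size_t j = 1; j < vn; j++) {
        uint64_t cy = 0;
        for (size_t i = 0; i < un; i++) {       /* addmul_1 row */
            unsigned __int128 p = (unsigned __int128)up[i] * vp[j]
                                  + rp[i + j] + cy;
            rp[i + j] = (uint64_t)p;
            cy = (uint64_t)(p >> 64);
        }
        rp[un + j] = cy;
    }
}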
diff --git a/third_party/gmp/mpn/x86_64/zen/mullo_basecase.asm b/third_party/gmp/mpn/x86_64/zen/mullo_basecase.asm
new file mode 100644
index 0000000..2ae729a
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/mullo_basecase.asm
@@ -0,0 +1,299 @@
+dnl  X86-64 mpn_mullo_basecase optimised for AMD Zen.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp',	   `%rdi')
+define(`up',	   `%rsi')
+define(`vp_param', `%rdx')
+define(`n',	   `%rcx')
+
+define(`vp',	`%r11')
+define(`nn',    `%rbp')
+
+C TODO
+C  * Rearrange feed-in jumps for short branch forms.
+C  * Roll out the heavy artillery and 4-way unroll outer loop.  Since feed-in
+C    code implodes, the blow-up will not be more than perhaps 2.5x.
+C  * Micro-optimise critical lead-in code blocks.
+C  * Clean up register use, e.g., r15 vs vp, disuse of nn, etc.
+C  * Write n < 4 code specifically for Zen (current code is for Haswell).
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+	cmp	$4, R32(n)
+	jae	L(big)
+
+	mov	vp_param, vp
+	mov	(up), %rdx
+
+	cmp	$2, R32(n)
+	jae	L(gt1)
+L(n1):	imul	(vp), %rdx
+	mov	%rdx, (rp)
+	FUNC_EXIT()
+	ret
+L(gt1):	ja	L(gt2)
+L(n2):	mov	(vp), %r9
+	mulx(	%r9, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	8(up), %rax
+	imul	%r9, %rax
+	add	%rax, %rdx
+	mov	8(vp), %r9
+	mov	(up), %rcx
+	imul	%r9, %rcx
+	add	%rcx, %rdx
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+L(gt2):
+L(n3):	mov	(vp), %r9
+	mulx(	%r9, %rax, %r10)	C u0 x v0
+	mov	%rax, (rp)
+	mov	8(up), %rdx
+	mulx(	%r9, %rax, %rdx)	C u1 x v0
+	imul	16(up), %r9		C u2 x v0
+	add	%rax, %r10
+	adc	%rdx, %r9
+	mov	8(vp), %r8
+	mov	(up), %rdx
+	mulx(	%r8, %rax, %rdx)	C u0 x v1
+	add	%rax, %r10
+	adc	%rdx, %r9
+	imul	8(up), %r8		C u1 x v1
+	add	%r8, %r9
+	mov	%r10, 8(rp)
+	mov	16(vp), %r10
+	mov	(up), %rax
+	imul	%rax, %r10		C u0 x v2
+	add	%r10, %r9
+	mov	%r9, 16(rp)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(big):	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	(up), %r9
+	lea	-8(up,n,8), up
+	lea	-40(rp,n,8), rp
+
+	mov	$4, R32(%r14)
+	sub	n, %r14
+	mov	-8(vp_param,n,8), %rbp
+	imul	%r9, %rbp
+	lea	8(vp_param), %r15
+	mov	(vp_param), %rdx
+
+	test	$1, R8(%r14)
+	jnz	L(mx0)
+L(mx1):	test	$2, R8(%r14)
+	jz	L(mb3)
+
+L(mb1):	mulx(	%r9, %rbx, %rax)
+	lea	-2(%r14), n
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r9,%r8
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r11,%r10
+	jmp	L(mlo1)
+
+L(mb3):	mulx(	%r9, %r11, %r10)
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r13,%r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%rbx,%rax
+	lea	(%r14), n
+	jrcxz	L(x)
+	jmp	L(mlo3)
+L(x):	jmp	L(mcor)
+
+L(mb2):	mulx(	%r9, %r13, %r12)
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%rbx,%rax
+	lea	-1(%r14), n
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r9,%r8
+	jmp	L(mlo2)
+
+L(mx0):	test	$2, R8(%r14)
+	jz	L(mb2)
+
+L(mb0):	mulx(	%r9, %r9, %r8)
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf0	C mulx -0x10(%rsi,%r14,8),%r11,%r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r13,%r12
+	lea	-3(%r14), n
+	jmp	L(mlo0)
+
+	ALIGN(16)
+L(mtop):jrcxz	L(mend)
+	adc	%r8, %r11
+	mov	%r9, (rp,n,8)
+L(mlo3):.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r10, %r13
+	mov	%r11, 8(rp,n,8)
+L(mlo2):.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r12, %rbx
+	mov	%r13, 16(rp,n,8)
+L(mlo1):.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rax, %r9
+	mov	%rbx, 24(rp,n,8)
+L(mlo0):.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	lea	4(n), n
+	jmp	L(mtop)
+
+L(mend):mov	%r9, (rp)
+	adc	%r8, %r11
+	mov	%r11, 8(rp)
+	adc	%r10, %r13
+	mov	%r13, 16(rp)
+	adc	%r12, %rbx
+	mov	%rbx, 24(rp)
+
+L(outer):
+	mulx(	(up), %r10, %r8)	C FIXME r8 unused (use imul?)
+	adc	%rax, %rbp
+	add	%r10, %rbp
+	mov	(%r15), %rdx
+	add	$8, %r15
+	mov	-24(up,%r14,8), %r8
+	lea	-8(up), up
+
+	test	$1, R8(%r14)
+	jz	L(x0)
+L(x1):	test	$2, R8(%r14)
+	jnz	L(b3)
+
+L(b1):	mulx(	%r8, %rbx, %rax)
+	lea	-1(%r14), n
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (%rsi,%rcx,8),%r9,%r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 0x8(%rsi,%rcx,8),%r11,%r10
+	jmp	L(lo1)
+
+L(x0):	test	$2, R8(%r14)
+	jz	L(b2)
+
+L(b0):	mulx(	%r8, %r9, %r8)
+	lea	-2(%r14), n
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r11,%r10
+	.byte	0xc4,0x22,0x93,0xf6,0x24,0xf6		C mulx (%rsi,%r14,8),%r13,%r12
+	jmp	L(lo0)
+
+L(b3):	mulx(	%r8, %r11, %r10)
+	lea	1(%r14), n
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%r13,%r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6		C mulx (%rsi,%r14,8),%rbx,%rax
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	jrcxz	L(cor)
+	jmp	L(lo3)
+
+L(cor):	add	8(rp), %r11
+	mov	16(rp), %r10
+	mov	24(rp), %r12
+L(mcor):mov	%r11, 8(rp)
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	mulx(	(up), %r10, %r8)	C FIXME r8 unused (use imul?)
+	adc	%rax, %rbp
+	add	%r10, %rbp
+	mov	(%r15), %rdx
+	mov	-24(up), %r8
+	mulx(	%r8, %r9, %r12)
+	mulx(	-16,(up), %r14, %rax)
+	add	%r12, %r14
+	adc	$0, %rax
+	adc	%r9, %r13
+	mov	%r13, 16(rp)
+	adc	%r14, %rbx
+	mulx(	-8,(up), %r10, %r8)	C FIXME r8 unused (use imul?)
+	adc	%rax, %rbp
+	add	%r10, %rbp
+	mov	8(%r15), %rdx
+	mulx(	-24,(up), %r14, %rax)
+	add	%r14, %rbx
+	mov	%rbx, 24(rp)
+	mulx(	-16,(up), %r10, %r8)	C FIXME r8 unused (use imul?)
+	adc	%rax, %rbp
+	add	%r10, %rbp
+	mov	%rbp, 32(rp)
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(b2):	mulx(	%r8, %r13, %r12)
+	lea	(%r14), n
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0xf8	C mulx -0x8(%rsi,%r14,8),%rbx,%rax
+	add	%r12, %rbx
+	adc	$0, %rax
+	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6		C mulx (%rsi,%r14,8),%r9,%r8
+	jmp	L(lo2)
+
+	ALIGN(16)
+L(top):	add	%r9, (rp,n,8)
+L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	js	L(top)
+
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	inc	%r14
+	jmp	L(outer)
+EPILOGUE()
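
mpn_mullo_basecase keeps only the low n limbs of the full 2n-limb product, so each successive row is one limb shorter and carries past limb n-1 are simply dropped. A C sketch of that contract (assumes rp does not overlap up or vp):

#include <stdint.h>
#include <stddef.h>

/* Low-half schoolbook product: rp[0..n-1] of up[] * vp[]. */
static void ref_mullo(uint64_t *rp, const uint64_t *up,
                      const uint64_t *vp, size_t n)
{
    for (size_t i = 0; i < n; i++)
        rp[i] = 0;
    for (size_t j = 0; j < n; j++) {
        uint64_t cy = 0;
        for (size_t i = 0; i + j < n; i++) {    /* truncated row */
            unsigned __int128 p = (unsigned __int128)up[i] * vp[j]
                                  + rp[i + j] + cy;
            rp[i + j] = (uint64_t)p;
            cy = (uint64_t)(p >> 64);
        }
        /* the final cy would land at limb n or above: discarded */
    }
}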
diff --git a/third_party/gmp/mpn/x86_64/zen/popcount.asm b/third_party/gmp/mpn/x86_64/zen/popcount.asm
new file mode 100644
index 0000000..be1613b
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/popcount.asm
@@ -0,0 +1,38 @@
+dnl  AMD64 mpn_popcount -- population count.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/coreinhm/popcount.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen/rshift.asm b/third_party/gmp/mpn/x86_64/zen/rshift.asm
new file mode 100644
index 0000000..0196870
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/rshift.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_rshift optimised for AMD Zen.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86_64/fastsse/rshift-movdqu2.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm b/third_party/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm
new file mode 100644
index 0000000..f6e8f9c
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/sbpi1_bdiv_r.asm
@@ -0,0 +1,507 @@
+dnl  AMD64 mpn_sbpi1_bdiv_r optimised for AMD Zen.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+define(`up',       `%rdi')
+define(`un_param', `%rsi')
+define(`dp_param', `%rdx')
+define(`dn_param', `%rcx')
+define(`dinv',     `%r8')
+
+define(`i',        `%rcx')
+define(`dn',       `%r14')
+
+define(`dp',       `%rsi')
+define(`un',       `%r15')
+
+C TODO
+C  * The o1...o8 loops for special dn counts were naively hand-optimised by
+C    folding the generic loops.  They can probably be tuned.  The speculative
+C    quotient limb generation might not be in the optimal spot.
+C  * Perhaps avoid late-in-loop jumps, e.g., lo0.
+C  * Improve regalloc wrt dn_param/dn and un_param/un to save some moves.
+
+C ABI_SUPPORT(DOS64)
+C ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), dinv	')
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	sub	dn_param, un_param		C outer loop count
+	mov	dn_param, dn		C FIXME: Suppress by reg re-alloc
+	push	dinv				C keep dinv on stack
+	mov	un_param, un		C FIXME: Suppress by reg re-alloc
+	xor	R32(%rbp), R32(%rbp)
+
+	lea	(dp_param,dn_param,8), dp
+
+	mov	(up), %rdx
+	imul	dinv, %rdx			C first quotient limb
+
+	neg	dn
+	lea	-32(up,dn_param,8), up
+
+	test	$1, R8(dn_param)
+	jnz	L(cx1)
+
+L(cx0):	test	$2, R8(dn_param)
+	jnz	L(b2)
+
+
+C =============================================================================
+L(b0):	cmp	$-4, dn
+	jnz	L(gt4)
+
+L(o4):	mulx(	-32,(dp), %r9, %r14)
+	mulx(	-24,(dp), %r11, %r10)
+	mulx(	-16,(dp), %r13, %r12)
+	mulx(	-8,(dp), %rbx, %rax)
+	add	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	(up), %r9
+	adc	8(up), %r11
+	mov	%r8, %rdx			C dinv
+	mov	%r11, 8(up)
+	mulx(	%r11, %rdx, %r12)		C next quotient
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o4)
+	jmp	L(ret)
+
+L(gt4):	cmp	$-8, dn
+	jnz	L(out0)
+
+L(o8):	mulx(	-64,(dp), %r9, %r14)
+	mulx(	-56,(dp), %rcx, %r10)
+	mulx(	-48,(dp), %r13, %r12)
+	mulx(	-40,(dp), %rbx, %rax)
+	add	%r14, %rcx
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	-32(up), %r9
+	mulx(	-32,(dp), %r9, %r14)
+	adc	-24(up), %rcx
+	mov	%rcx, -24(up)
+	mulx(	-24,(dp), %r11, %r10)
+	adc	%r13, -16(up)
+	mulx(	-16,(dp), %r13, %r12)
+	adc	%rbx, -8(up)
+	adc	%rax, %r9
+	mulx(	-8,(dp), %rbx, %rax)
+	adc	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%r8, %rdx			C dinv
+	mulx(	%rcx, %rdx, %r12)		C next quotient
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o8)
+	jmp	L(ret)
+
+L(out0):mov	dn, i
+	.byte	0xc4,0x22,0xb3,0xf6,0x04,0xf6		C mulx (dp,dn,8),%r9,%r8
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x08	C mulx 8(dp,dn,8),%r11,%r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x10	C mulx 16(dp,dn,8),%r13,%r12
+	clc
+	jmp	L(lo0)
+
+	ALIGN(16)
+L(top0):add	%r9, (up,i,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (dp,i,8), %r9, %r8
+	adc	%r11, 8(up,i,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(dp,i,8), %r11, %r10
+	adc	%r13, 16(up,i,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(dp,i,8), %r13, %r12
+	adc	%rbx, 24(up,i,8)
+	adc	%rax, %r9
+L(lo0):	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(dp,i,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, i
+	js	L(top0)
+
+	mov	(%rsp), %rdx			C dinv
+	.byte	0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28	C mulx 40(%rdi,%r14,8),%rdx,%r12
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(out0)
+	jmp	L(ret)
+
+L(cx1):	test	$2, R8(dn_param)
+	jnz	L(b3)
+
+C =============================================================================
+L(b1):	cmp	$-1, dn
+	jnz	L(gt1)
+
+	mov	24(up), %r9
+L(o1):	mulx(	-8,(dp), %rbx, %rdx)
+	add	%r9, %rbx
+	adc	%rbp, %rdx
+	add	32(up), %rdx
+	setc	R8(%rbp)
+	mov	%rdx, %r9
+	mulx(	%r8, %rdx, %r12)		C next quotient
+	lea	8(up), up
+	dec	un
+	jne	L(o1)
+	mov	%r9, 24(up)
+	jmp	L(ret)
+
+L(gt1):	cmp	$-5, dn
+	jnz	L(out1)
+
+L(o5):	mulx(	-40,(dp), %rbx, %rax)
+	mulx(	-32,(dp), %r9, %r14)
+	mulx(	-24,(dp), %r11, %r10)
+	mulx(	-16,(dp), %r13, %r12)
+	add	-8(up), %rbx
+	adc	%rax, %r9
+	mulx(	-8,(dp), %rbx, %rax)
+	adc	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	(up), %r9
+	mov	%r9, (up)
+	mov	%r8, %rdx			C dinv
+	mulx(	%r9, %rdx, %r12)		C next quotient
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o5)
+	jmp	L(ret)
+
+L(out1):lea	1(dn), i
+	.byte	0xc4,0xa2,0xe3,0xf6,0x04,0xf6		C mulx (dp,dn,8),%rbx,%rax
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x08	C mulx 8(dp,dn,8),%r9,%r8
+	.byte	0xc4,0x22,0xa3,0xf6,0x54,0xf6,0x10	C mulx 16(dp,dn,8),%r11,%r10
+	clc
+	jmp	L(lo1)
+
+	ALIGN(16)
+L(top1):add	%r9, (up,i,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (dp,i,8), %r9, %r8
+	adc	%r11, 8(up,i,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(dp,i,8), %r11, %r10
+	adc	%r13, 16(up,i,8)
+L(lo1):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(dp,i,8), %r13, %r12
+	adc	%rbx, 24(up,i,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(dp,i,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, i
+	js	L(top1)
+
+	mov	(%rsp), %rdx			C dinv
+	.byte	0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28	C mulx 40(up,dn,8), %rdx, %r12
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(out1)
+	jmp	L(ret)
+
+C =============================================================================
+L(b2):	cmp	$-2, dn
+	jnz	L(gt2)
+
+	mov	16(up), %r10
+	mov	24(up), %r9
+L(o2):	mulx(	-16,(dp), %r13, %r12)
+	mulx(	-8,(dp), %rbx, %rax)
+	add	%r12, %rbx
+	adc	$0, %rax
+	add	%r10, %r13			C add just to produce carry
+	mov	%r9, %r10
+	adc	%rbx, %r10
+	mov	%r8, %rdx
+	mulx(	%r10, %rdx, %r12)		C next quotient
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	mov	32(up), %r9
+	add	%rax, %r9
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o2)
+	mov	%r10, 16(up)
+	mov	%r9, 24(up)
+	jmp	L(ret)
+
+L(gt2):	cmp	$-6, dn
+	jnz	L(out2)
+
+L(o6):	mulx(	-48,(dp), %r13, %r12)
+	mulx(	-40,(dp), %rcx, %rax)
+	add	%r12, %rcx
+	adc	$0, %rax
+	mulx(	-32,(dp), %r9, %r14)
+	mulx(	-24,(dp), %r11, %r10)
+	add	-16(up), %r13
+	mulx(	-16,(dp), %r13, %r12)
+	adc	-8(up), %rcx
+	mov	%rcx, -8(up)
+	adc	%rax, %r9
+	mulx(	-8,(dp), %rbx, %rax)
+	adc	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%r8, %rdx			C dinv
+	mulx(	%rcx, %rdx, %r12)		C next quotient
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o6)
+	jmp	L(ret)
+
+L(out2):lea	2(dn), i
+	.byte	0xc4,0x22,0x93,0xf6,0x24,0xf6		C mulx (dp,dn,8),%r13,%r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x08	C mulx 8(dp,dn,8),%rbx,%rax
+	add	%r12, %rbx
+	adc	$0, %rax
+	.byte	0xc4,0x22,0xb3,0xf6,0x44,0xf6,0x10	C mulx 16(dp,dn,8),%r9,%r8
+	jmp	L(lo2)
+
+	ALIGN(16)
+L(top2):add	%r9, (up,i,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (dp,i,8), %r9, %r8
+	adc	%r11, 8(up,i,8)
+L(lo2):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(dp,i,8), %r11, %r10
+	adc	%r13, 16(up,i,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(dp,i,8), %r13, %r12
+	adc	%rbx, 24(up,i,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(dp,i,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, i
+	js	L(top2)
+
+	mov	(%rsp), %rdx			C dinv
+	.byte	0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28	C mulx 40(up,dn,8), %rdx, %r12
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(out2)
+	jmp	L(ret)
+
+C =============================================================================
+L(b3):	cmp	$-3, dn
+	jnz	L(gt3)
+
+	mov	8(up), %r14
+	mov	16(up), %r9
+	mov	24(up), %rcx
+L(o3):	mulx(	-24,(dp), %r11, %r10)
+	mulx(	-16,(dp), %r13, %r12)
+	mulx(	-8,(dp), %rbx, %rax)
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	%r14, %r11
+	mov	%r9, %r14
+	adc	%r13, %r14
+	mov	%rcx, %r9
+	mov	%r8, %rdx			C dinv
+	mulx(	%r14, %rdx, %r12)		C next quotient
+	adc	%rbx, %r9
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	mov	32(up), %rcx
+	add	%rax, %rcx
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o3)
+	mov	%r14, 8(up)
+	mov	%r9, 16(up)
+	mov	%rcx, 24(up)
+	jmp	L(ret)
+
+L(gt3):	cmp	$-7, dn
+	jnz	L(out3)
+
+L(o7):	mulx(	-56,(dp), %r11, %r10)
+	mulx(	-48,(dp), %rcx, %r12)
+	mulx(	-40,(dp), %rbx, %rax)
+	add	%r10, %rcx
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mulx(	-32,(dp), %r9, %r14)
+	add	-24(up), %r11
+	mulx(	-24,(dp), %r11, %r10)
+	adc	-16(up), %rcx
+	mov	%rcx, -16(up)
+	mulx(	-16,(dp), %r13, %r12)
+	adc	%rbx, -8(up)
+	adc	%rax, %r9
+	mulx(	-8,(dp), %rbx, %rax)
+	adc	%r14, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%r8, %rdx			C dinv
+	mulx(	%rcx, %rdx, %r12)		C next quotient
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(o7)
+	jmp	L(ret)
+
+L(out3):lea	3(dn), i
+	.byte	0xc4,0x22,0xa3,0xf6,0x14,0xf6		C mulx (dp,dn,8),%r11,%r10
+	.byte	0xc4,0x22,0x93,0xf6,0x64,0xf6,0x08	C mulx 8(dp,dn,8),%r13,%r12
+	.byte	0xc4,0xa2,0xe3,0xf6,0x44,0xf6,0x10	C mulx 16(dp,dn,8),%rbx,%rax
+	add	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	jmp	L(lo3)
+
+	ALIGN(16)
+L(top3):add	%r9, (up,i,8)
+L(lo3):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (dp,i,8), %r9, %r8
+	adc	%r11, 8(up,i,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(dp,i,8), %r11, %r10
+	adc	%r13, 16(up,i,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(dp,i,8), %r13, %r12
+	adc	%rbx, 24(up,i,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(dp,i,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, i
+	js	L(top3)
+
+	mov	(%rsp), %rdx			C dinv
+	.byte	0xc4,0x22,0xeb,0xf6,0x64,0xf7,0x28	C mulx 40(up,dn,8), %rdx, %r12
+	add	%r9, (up)
+	adc	%r11, 8(up)
+	adc	%r13, 16(up)
+	adc	%rbx, 24(up)
+	adc	%rbp, %rax
+	setc	R8(%rbp)
+	add	%rax, 32(up)
+	adc	$0, R32(%rbp)
+	lea	8(up), up
+	dec	un
+	jne	L(out3)
+
+L(ret):	mov	%rbp, %rax
+	pop	%rsi			C dummy dealloc
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	ret
+EPILOGUE()
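
This is schoolbook Hensel (2-adic) division: each outer step forms a speculative quotient limb q = u0 * dinv and then adds q*D so that the lowest still-live limb of U cancels, leaving the remainder in the high limbs. With this add-based update, dinv has to satisfy dinv * d[0] == -1 mod 2^64; that sign convention is inferred from the addmul structure above, so treat the sketch as illustrative rather than a statement of GMP's exact API.

#include <stdint.h>
#include <stddef.h>

/* One-limb-at-a-time Hensel reduction; requires un > dn >= 1 and
   dinv * dp[0] == -1 (mod 2^64), an assumption as noted above.
   Returns the accumulated top carry, matching the %rbp value the asm
   hands back in %rax. */
static uint64_t ref_sbpi1_bdiv_r(uint64_t *up, size_t un,
                                 const uint64_t *dp, size_t dn,
                                 uint64_t dinv)
{
    uint64_t hi_cy = 0;
    for (size_t i = 0; i < un - dn; i++) {
        uint64_t q = up[i] * dinv;   /* up[i] + q*dp[0] == 0 mod 2^64 */
        uint64_t cy = 0;
        for (size_t j = 0; j < dn; j++) {       /* up[i..] += q * D */
            unsigned __int128 p = (unsigned __int128)q * dp[j]
                                  + up[i + j] + cy;
            up[i + j] = (uint64_t)p;
            cy = (uint64_t)(p >> 64);
        }
        /* propagate the row carry into the limb above */
        unsigned __int128 s = (unsigned __int128)up[i + dn] + cy + hi_cy;
        up[i + dn] = (uint64_t)s;
        hi_cy = (uint64_t)(s >> 64);
    }
    return hi_cy;
}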
diff --git a/third_party/gmp/mpn/x86_64/zen/sqr_basecase.asm b/third_party/gmp/mpn/x86_64/zen/sqr_basecase.asm
new file mode 100644
index 0000000..a7c6127
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/sqr_basecase.asm
@@ -0,0 +1,482 @@
+dnl  AMD64 mpn_sqr_basecase optimised for AMD Zen.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C  * Do overlapped software pipelining.  This should close the remaining gap to
+C    mul_basecase.
+C
+C  * Update un just once in the outer loop.
+C
+C  * Perhaps keep un and n pre-multiplied by 8, thus suppressing ",8" from
+C    loads and stores.  At least in some cases, the non-scaled form is faster.
+C
+C  * Optimise xit3 code, e.g., using shrx and sarx like in the main loop.
+C
+C  * The mul_1 feed-in code has gotten little attention and could probably be
+C    improved.  Perhaps even expand it to 4 separate loops to allow straight
+C    fall-through into the 4 addmul_1 loops.
+C
+C  * Clean up ad-hoc scratch register usage in the addmul_1 feed-in code blocks.
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+
+define(`un',      `%rbp')
+define(`n',       `%rcx')
+
+C these are used just for the small-operand code
+define(`w0',	`%r8')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+
+	cmp	$2, R32(un_param)
+	jae	L(gt1)
+
+	mov	(up), %rdx
+	mulx(	%rdx, %rax, %rdx)
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt1):	jne	L(gt2)
+
+	mov	(up), %rdx
+	mov	8(up), %rcx
+	mulx(	%rcx, %r9, %r10)	C v0 * v1	W 1 2
+	mulx(	%rdx, %rax, %r8)	C v0 * v0	W 0 1
+	mov	%rcx, %rdx
+	mulx(	%rdx, %r11, %rdx)	C v1 * v1	W 2 3
+	add	%r9, %r9		C		W 1
+	adc	%r10, %r10		C		W 2
+	adc	$0, %rdx		C		W 3
+	add	%r9, %r8		C W 1
+	adc	%r11, %r10		C W 2
+	adc	$0, %rdx		C W 3
+	mov	%rax, (rp)
+	mov	%r8, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(gt2):	cmp	$4, R32(un_param)
+	jae	L(gt3)
+
+	push	%rbx
+	mov	(up), %rdx
+	mulx(	8,(up), w2, w3)
+	mulx(	16,(up), w0, w1)
+	add	w3, w0
+	mov	8(up), %rdx
+	mulx(	16,(up), %rax, w3)
+	adc	%rax, w1
+	adc	$0, w3
+	test	R32(%rbx), R32(%rbx)
+	mov	(up), %rdx
+	mulx(	%rdx, %rbx, %rcx)
+	mov	%rbx, (rp)
+	mov	8(up), %rdx
+	mulx(	%rdx, %rax, %rbx)
+	mov	16(up), %rdx
+	mulx(	%rdx, %rsi, %rdx)
+	adcx(	w2, w2)
+	adcx(	w0, w0)
+	adcx(	w1, w1)
+	adcx(	w3, w3)
+	adox(	w2, %rcx)
+	adox(	w0, %rax)
+	adox(	w1, %rbx)
+	adox(	w3, %rsi)
+	mov	$0, R32(%r8)
+	adox(	%r8, %rdx)
+	adcx(	%r8, %rdx)
+	mov	%rcx, 8(rp)
+	mov	%rax, 16(rp)
+	mov	%rbx, 24(rp)
+	mov	%rsi, 32(rp)
+	mov	%rdx, 40(rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(gt3):	push	%r15
+C	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbp
+	push	%rbx
+	mov	R32(un_param), R32(un)
+
+	mov	(up), %rdx		C up[0]
+	mov	8(up), %r9		C up[1]
+
+	mulx(	%rdx, %rax, %r15)	C up[0]^2
+	mov	%rax, (rp)
+	shl	%rdx
+
+	lea	(up,un,8), up
+	lea	-32(rp,un,8), rp
+
+	neg	un
+	lea	4(un), n
+	and	$-4, n
+
+	test	$1, R8(un)
+	jnz	L(mx0)
+L(mx1):	test	$2, R8(un)
+	jz	L(mb3)
+
+L(mb1):	mulx(	%r9, %rbx, %rax)
+	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x18	C mulx 24(up,un,8), %r11, %r10
+	add	%r15, %rbx
+	jmp	L(mlo1)
+
+L(mb3):	mulx(	%r9, %r11, %r10)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x10	C mulx 16(up,un,8), %r13, %r12
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x18	C mulx 24(up,un,8), %rbx, %rax
+	add	%r15, %r11
+	jrcxz	L(n4)
+	jmp	L(mlo3)
+L(n4):	mov	%r11, 8(rp)
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	jmp	L(m)
+
+L(mx0):	test	$2, R8(un)
+	jnz	L(mb0)
+
+L(mb2):	mulx(	%r9, %r13, %r12)
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %rbx, %rax
+	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x18	C mulx 24(up,un,8), %r9, %r8
+	add	%r15, %r13
+	jmp	L(mlo2)
+
+L(mb0):	mulx(	%r9, %r9, %r8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x10	C mulx 16(up,un,8), %r11, %r10
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x18	C mulx 24(up,un,8), %r13, %r12
+	add	%r15, %r9
+	jmp	L(mlo0)
+
+	ALIGN(16)
+L(mtop):jrcxz	L(mend)
+	adc	%r8, %r11
+	mov	%r9, (rp,n,8)
+L(mlo3):.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r10, %r13
+	mov	%r11, 8(rp,n,8)
+L(mlo2):.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r12, %rbx
+	mov	%r13, 16(rp,n,8)
+L(mlo1):.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rax, %r9
+	mov	%rbx, 24(rp,n,8)
+L(mlo0):.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	lea	4(n), n
+	jmp	L(mtop)
+
+L(mend):mov	%r9, (rp)
+	adc	%r8, %r11
+	mov	%r11, 8(rp)
+	adc	%r10, %r13
+	mov	%r13, 16(rp)
+	adc	%r12, %rbx
+	adc	$0, %rax
+	mov	%rbx, 24(rp)
+	mov	%rax, 32(rp)
+
+	lea	2(un), un
+
+	mov	$63, R32(%r15)			C keep at 63 for shrx/sarx.
+	test	$1, R8(un)
+	jz	L(x0)
+L(x1):	test	$2, R8(un)
+	jz	L(f3)
+	jmp	L(f1)
+L(x0):	test	$2, R8(un)
+	jz	L(f0)
+C	jmp	L(f2)
+
+L(f2):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	2(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0x62,0x82,0xf7,0x5c,0xee,0xf0	C sarx %r15, -16(up,un,8), %r11
+	.byte	0xc4,0x62,0x83,0xf7,0x6c,0xee,0xf0	C shrx %r15, -16(up,un,8), %r13
+	and	%rdx, %r11			C "ci" in C code
+	mulx(	%rdx, %rax, %r10)		C up[0]^2
+	lea	(%r13,%rdx,2), %rdx		C "u0" arg in C code
+	add	%rax, %r11
+
+	.byte	0xc4,0x62,0x93,0xf6,0x24,0xee		C mulx (up,un,8), %r13, %r12
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %rbx, %rax
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	jmp	L(b2)
+
+	ALIGN(16)
+L(top2):add	%r9, (rp,n,8)
+L(b2):	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(top2)
+
+	inc	un
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+
+L(f1):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	1(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0x62,0x82,0xf7,0x6c,0xee,0xf0	C sarx	%r15, -16(up,un,8), %r13
+	.byte	0xc4,0xe2,0x83,0xf7,0x5c,0xee,0xf0	C shrx	%r15, -16(up,un,8), %rbx
+	and	%rdx, %r13			C "ci" in C code
+	mulx(	%rdx, %rax, %r12)		C up[0]^2
+	lea	(%rbx,%rdx,2), %rdx		C "u0" arg in C code
+	add	%rax, %r13
+
+	.byte	0xc4,0xe2,0xe3,0xf6,0x04,0xee		C mulx (up,un,8), %rbx, %rax
+	adc	%r12, %rbx
+	adc	$0, %rax
+	.byte	0xc4,0x62,0xb3,0xf6,0x44,0xee,0x08	C mulx 8(up,un,8), %r9, %r8
+	jmp	L(b1)
+
+	ALIGN(16)
+L(top1):add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+L(b1):	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(top1)
+
+	inc	un
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+
+L(f0):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0xe2,0x82,0xf7,0x5c,0xee,0xf0	C sarx	%r15, -16(up,un,8), %rbx
+	.byte	0xc4,0x62,0x83,0xf7,0x4c,0xee,0xf0	C shrx	%r15, -16(up,un,8), %r9
+	and	%rdx, %rbx			C "ci" in C code
+	mulx(	%rdx, %r10, %rax)		C up[0]^2
+	lea	(%r9,%rdx,2), %rdx		C "u0" arg in C code
+	add	%r10, %rbx
+	adc	$0, %rax			C "cin" in C code
+
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,un,8), %r9, %r8
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xee,0x08	C mulx 8(up,un,8), %r11, %r10
+	jmp	L(b0)
+
+	ALIGN(16)
+L(top0):add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+L(b0):	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(top0)
+
+	inc	un
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+
+L(f3):	mov	-8(up,un,8), %rdx		C up[0]
+	lea	3(un), n
+	lea	8(rp), rp
+	.byte	0xc4,0x62,0x82,0xf7,0x4c,0xee,0xf0	C sarx %r15, -16(up,un,8), %r9
+	.byte	0xc4,0x62,0x83,0xf7,0x5c,0xee,0xf0	C shrx %r15, -16(up,un,8), %r11
+	and	%rdx, %r9			C "ci" in C code
+	mulx(	%rdx, %rax, %r8)		C up[0]^2
+	lea	(%r11,%rdx,2), %rdx		C "u0" arg in C code
+	add	%rax, %r9
+
+	.byte	0xc4,0x62,0xa3,0xf6,0x14,0xee		C mulx (up,un,8), %r11, %r10
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xee,0x08	C mulx 8(up,un,8), %r13, %r12
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xee,0x10	C mulx 16(up,un,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	jrcxz	L(xit3)
+	jmp	L(top3)			C FIXME perhaps fall through
+
+	ALIGN(16)
+L(top3):add	%r9, (rp,n,8)
+	.byte	0xc4,0x62,0xb3,0xf6,0x04,0xce		C mulx (up,n,8), %r9, %r8
+	adc	%r11, 8(rp,n,8)
+	.byte	0xc4,0x62,0xa3,0xf6,0x54,0xce,0x08	C mulx 8(up,n,8), %r11, %r10
+	adc	%r13, 16(rp,n,8)
+	.byte	0xc4,0x62,0x93,0xf6,0x64,0xce,0x10	C mulx 16(up,n,8), %r13, %r12
+	adc	%rbx, 24(rp,n,8)
+	adc	%rax, %r9
+	.byte	0xc4,0xe2,0xe3,0xf6,0x44,0xce,0x18	C mulx 24(up,n,8), %rbx, %rax
+	adc	%r8, %r11
+	adc	%r10, %r13
+	adc	%r12, %rbx
+	adc	$0, %rax
+	add	$4, n
+	jnz	L(top3)
+
+	inc	un
+	add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	%r13, 16(rp)
+	adc	%rbx, 24(rp)
+	adc	$0, %rax
+	mov	%rax, 32(rp)
+	jmp	L(f2)
+
+
+L(xit3):add	%r9, (rp)
+	adc	%r11, 8(rp)
+	adc	16(rp), %r13
+	adc	24(rp), %rbx
+L(m):	adc	$0, %rax
+	mov	%rax, 32(rp)
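+C Finish the last three addmul rows by hand (hence the repeated loads
+C flagged FIXME: CSE below).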
+	mov	-24(up), %rdx		C FIXME: CSE
+	mov	-32(up), %r9		C FIXME: CSE
+	sar	$63, %r9
+	and	%rdx, %r9
+	add	%r13, %r9
+	mulx(	%rdx, %rax, %r10)
+	mov	-16(up), %r8		C FIXME: CSE
+	adc	$0, %r10
+	add	%rax, %r9
+	adc	$0, %r10
+	mov	%r9, 16(rp)
+	mov	-32(up), %rax
+	shl	%rax
+	adc	%rdx, %rdx
+	mulx(	%r8, %r13, %r12)
+	mulx(	-8,(up), %r11, %rax)	C FIXME: CSE
+	add	%r10, %r13
+	adc	%r12, %r11
+	adc	$0, %rax
+	add	%rbx, %r13
+	mov	%r13, 24(rp)
+	adc	32(rp), %r11
+	adc	$0, %rax
+	mov	-16(up), %rdx		C FIXME: CSE
+	mov	-8(up), %r8		C FIXME: CSE
+	mov	-24(up), %r9
+	sar	$63, %r9
+	and	%rdx, %r9
+	add	%r11, %r9
+	mulx(	%rdx, %rbp, %r10)
+	adc	$0, %r10
+	add	%rbp, %r9
+	adc	$0, %r10
+	mov	%r9, 32(rp)
+	mov	-24(up), %rbp
+	shl	%rbp
+	adc	%rdx, %rdx
+	mulx(	%r8, %rbx, %rbp)
+	add	%r10, %rbx
+	adc	$0, %rbp
+	adc	%rbx, %rax
+	mov	%rax, 40(rp)
+	adc	$0, %rbp
+	mov	-8(up), %rdx		C FIXME: CSE
+	mov	-16(up), %r9		C FIXME: CSE
+	sar	$63, %r9
+	and	%rdx, %r9
+	add	%rbp, %r9
+	mulx(	%rdx, %rbp, %r10)
+	adc	$0, %r10
+	add	%rbp, %r9
+	adc	$0, %r10
+	mov	%r9, 48(rp)
+	mov	%r10, 56(rp)
+
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	pop	%r13
+C	pop	%r14
+	pop	%r15
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/third_party/gmp/mpn/x86_64/zen/sublsh1_n.asm b/third_party/gmp/mpn/x86_64/zen/sublsh1_n.asm
new file mode 100644
index 0000000..00f6dc9
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen/sublsh1_n.asm
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_sublsh1_n, mpn_sublsh1_nc.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
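+dnl  No Zen-specific implementation; reuse the x86_64/atom code for both
+dnl  entry points declared below.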
+MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc)
+include_mpn(`x86_64/atom/sublsh1_n.asm')
diff --git a/third_party/gmp/mpn/x86_64/zen2/gmp-mparam.h b/third_party/gmp/mpn/x86_64/zen2/gmp-mparam.h
new file mode 100644
index 0000000..3748c5f
--- /dev/null
+++ b/third_party/gmp/mpn/x86_64/zen2/gmp-mparam.h
@@ -0,0 +1,276 @@
+/* AMD Zen2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+/* Disable use of slow functions.  FIXME: We should disable lib inclusion.  */
+#undef HAVE_NATIVE_mpn_mul_2
+#undef HAVE_NATIVE_mpn_addmul_2
+
+/* 3600-4400 MHz Matisse */
+/* FFT tuning limit = 703,392,483 */
+/* Generated by tuneup.c, 2019-10-19, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          4
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        27
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      1
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD              13
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           22
+
+#define DIV_1_VS_MUL_1_PERCENT             385
+
+#define MUL_TOOM22_THRESHOLD                19
+#define MUL_TOOM33_THRESHOLD               125
+#define MUL_TOOM44_THRESHOLD               196
+#define MUL_TOOM6H_THRESHOLD               276
+#define MUL_TOOM8H_THRESHOLD               369
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     121
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     129
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     132
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     185
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 30
+#define SQR_TOOM3_THRESHOLD                117
+#define SQR_TOOM4_THRESHOLD                315
+#define SQR_TOOM6_THRESHOLD                446
+#define SQR_TOOM8_THRESHOLD                527
+
+#define MULMID_TOOM42_THRESHOLD             38
+
+#define MULMOD_BNM1_THRESHOLD               14
+#define SQRMOD_BNM1_THRESHOLD               20
+
+#define MUL_FFT_MODF_THRESHOLD             436  /* k = 5 */
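+/* Each entry appears to be an {operand size, FFT parameter k} pair,
+   as emitted by tuneup.c.  */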
+#define MUL_FFT_TABLE3                                      \
+  { {    436, 5}, {     25, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     49, 9}, {     27,10}, \
+    {     15, 9}, {     31, 8}, {     63, 9}, {     39,10}, \
+    {     23, 9}, {     51,11}, {     15,10}, {     31, 9}, \
+    {     71,10}, {     39, 9}, {     83,10}, {     47, 9}, \
+    {     95,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    159,11}, {     95,10}, \
+    {    191,12}, {     63,11}, {    127,10}, {    255, 9}, \
+    {    511,11}, {    143,10}, {    287, 9}, {    575,11}, \
+    {    159,12}, {     95,11}, {    191,13}, {     63,12}, \
+    {    127,11}, {    255,10}, {    511,11}, {    271,10}, \
+    {    543, 9}, {   1087,11}, {    287,10}, {    575,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,10}, {    703,11}, {    367,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    543,10}, {   1087,12}, {    287,11}, \
+    {    575,10}, {   1151,11}, {    607,10}, {   1215,12}, \
+    {    319,11}, {    639,10}, {   1279,11}, {    671,10}, \
+    {   1343,12}, {    351,11}, {    703,10}, {   1407,11}, \
+    {    735,13}, {    191,12}, {    383,11}, {    767,10}, \
+    {   1535,11}, {    799,12}, {    415,11}, {    831,10}, \
+    {   1663,12}, {    447,11}, {    895,12}, {    479,14}, \
+    {    127,13}, {    255,12}, {    543,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,12}, {    607,11}, \
+    {   1215,10}, {   2431,13}, {    319,12}, {    639,11}, \
+    {   1279,12}, {    671,11}, {   1343,10}, {   2687,12}, \
+    {    703,11}, {   1471,10}, {   2943,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    799,11}, {   1599,12}, \
+    {    831,11}, {   1663,13}, {    447,12}, {    959,11}, \
+    {   1919,10}, {   3839,14}, {    255,13}, {    511,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,11}, {   2687,13}, \
+    {    703,12}, {   1471,11}, {   2943,14}, {    383,13}, \
+    {    767,12}, {   1599,11}, {   3199,13}, {    831,12}, \
+    {   1727,13}, {    895,12}, {   1791,13}, {    959,12}, \
+    {   1919,11}, {   3839,14}, {    511,13}, {   1087,12}, \
+    {   2175,13}, {   1215,12}, {   2431,14}, {    639,13}, \
+    {   1343,12}, {   2687,13}, {   1471,12}, {   2943,11}, \
+    {   5887,14}, {    767,13}, {   1599,12}, {   3199,13}, \
+    {   1727,12}, {   3455,14}, {    895,13}, {   1919,12}, \
+    {   3839,11}, {   7679,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2943,12}, \
+    {   5887,15}, {    767,14}, {   1535,13}, {   3199,14}, \
+    {   1663,13}, {   3455,12}, {   6911,14}, {   1919,13}, \
+    {   3839,16}, {    511,15}, {   1023,14}, {   2175,13}, \
+    {   4479,12}, {   8959,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,12}, {  11775,15}, \
+    {   1535,14}, {   3455,15}, {   1791,14}, {   3839,13}, \
+    {   7679,14}, {   3967,16}, {   1023,15}, {   2047,14}, \
+    {   4479,15}, {   2303,14}, {   4863,15}, {   2559,14}, \
+    {   5247,15}, {   2815,14}, {   5887,16}, {   1535,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,13}, \
+    {  15359,17}, {   1023,16}, {   2047,15}, {   4351,14}, \
+    {   8959,15}, {   4863,16}, {   2559,15}, {   5887,14}, \
+    {  11775,16}, {   3071,15}, {   6911,16}, {   3583,15}, \
+    {   7679,14}, {  15359,15}, {   7935,17}, {   2047,16}, \
+    {   4095,15}, {   8959,16}, {   4607,15}, {   9983,14}, \
+    {  19967,16}, {   5631,15}, {  11775,17}, {   3071,16}, \
+    {   7679,15}, {  15871,18}, {   2047,17}, {   4095,16}, \
+    {   9727,15}, {  19967,17}, {   5119,16}, {  65536,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 275
+#define MUL_FFT_THRESHOLD                 4736
+
+#define SQR_FFT_MODF_THRESHOLD             396  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    396, 5}, {     25, 6}, {     25, 7}, {     13, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     25, 8}, \
+    {     13, 7}, {     28, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     35, 8}, {     19, 7}, {     39, 8}, \
+    {     27, 9}, {     15, 8}, {     35, 9}, {     19, 8}, \
+    {     41, 9}, {     23, 8}, {     47, 9}, {     27,10}, \
+    {     15, 9}, {     39,10}, {     23, 9}, {     51,11}, \
+    {     15,10}, {     31, 9}, {     67,10}, {     39, 9}, \
+    {     79,10}, {     55,11}, {     31,10}, {     79,11}, \
+    {     47,10}, {     95,12}, {     31,11}, {     63,10}, \
+    {    135,11}, {     79,10}, {    159,11}, {     95,12}, \
+    {     63,11}, {    127,10}, {    255, 9}, {    511,11}, \
+    {    143,10}, {    287, 9}, {    575,10}, {    303,11}, \
+    {    159,12}, {     95,13}, {     63,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    271,10}, {    543,11}, \
+    {    287,10}, {    575,11}, {    303,12}, {    159,11}, \
+    {    319,10}, {    639,11}, {    335,10}, {    671, 9}, \
+    {   1343,11}, {    351,10}, {    703,11}, {    367,10}, \
+    {    735,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,12}, {    223,11}, {    447,13}, {    127,12}, \
+    {    255,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,10}, {   1215,12}, {    319,11}, {    639,10}, \
+    {   1279,11}, {    671,10}, {   1343,12}, {    351,11}, \
+    {    703,10}, {   1407,11}, {    735,10}, {   1471,12}, \
+    {    383,11}, {    767,10}, {   1535,11}, {    799,12}, \
+    {    415,11}, {    831,10}, {   1663,12}, {    447,11}, \
+    {    895,12}, {    479,11}, {    959,14}, {    127,12}, \
+    {    511,11}, {   1023,12}, {    543,11}, {   1087,10}, \
+    {   2175,12}, {    575,11}, {   1151,12}, {    607,11}, \
+    {   1215,10}, {   2431,12}, {    639,11}, {   1279,12}, \
+    {    671,11}, {   1343,10}, {   2687,12}, {    703,11}, \
+    {   1407,12}, {    735,11}, {   1471,10}, {   2943,13}, \
+    {    383,12}, {    767,11}, {   1535,12}, {    799,11}, \
+    {   1599,12}, {    831,11}, {   1663,13}, {    447,12}, \
+    {    959,11}, {   1919,10}, {   3839,13}, {    511,12}, \
+    {   1087,11}, {   2175,13}, {    575,12}, {   1215,11}, \
+    {   2431,13}, {    639,12}, {   1343,11}, {   2687,13}, \
+    {    703,12}, {   1471,11}, {   2943,14}, {    383,13}, \
+    {    767,12}, {   1599,13}, {    831,12}, {   1727,11}, \
+    {   3455,13}, {    959,12}, {   1919,11}, {   3839,14}, \
+    {    511,13}, {   1023,12}, {   2047,13}, {   1087,12}, \
+    {   2175,13}, {   1215,12}, {   2431,11}, {   4863,14}, \
+    {    639,13}, {   1343,12}, {   2687,13}, {   1471,12}, \
+    {   2943,11}, {   5887,14}, {    767,13}, {   1599,12}, \
+    {   3199,13}, {   1727,12}, {   3455,14}, {    895,13}, \
+    {   1919,12}, {   3839,15}, {    511,14}, {   1023,13}, \
+    {   2175,14}, {   1151,13}, {   2431,12}, {   4863,14}, \
+    {   1279,13}, {   2687,14}, {   1407,13}, {   2943,12}, \
+    {   5887,15}, {    767,14}, {   1535,13}, {   3199,14}, \
+    {   1663,13}, {   3455,12}, {   6911,14}, {   1919,13}, \
+    {   3839,12}, {   7679,16}, {    511,15}, {   1023,14}, \
+    {   2175,13}, {   4479,14}, {   2431,13}, {   4863,15}, \
+    {   1279,14}, {   2943,13}, {   5887,12}, {  11775,15}, \
+    {   1535,14}, {   3455,13}, {   6911,15}, {   1791,14}, \
+    {   3839,13}, {   7679,14}, {   3967,16}, {   1023,15}, \
+    {   2047,14}, {   4479,15}, {   2303,14}, {   4863,15}, \
+    {   2559,14}, {   5247,15}, {   2815,14}, {   5887,13}, \
+    {  11775,16}, {   1535,15}, {   3071,14}, {   6143,15}, \
+    {   3327,14}, {   6911,15}, {   3839,14}, {   7679,17}, \
+    {   1023,16}, {   2047,15}, {   4095,14}, {   8191,15}, \
+    {   4351,14}, {   8959,15}, {   4863,16}, {   2559,15}, \
+    {   5887,14}, {  11775,16}, {   3071,15}, {   6911,16}, \
+    {   3583,15}, {   7679,14}, {  15359,15}, {   7935,17}, \
+    {   2047,16}, {   4095,15}, {   8959,16}, {   4607,15}, \
+    {   9983,14}, {  19967,16}, {   5119,15}, {  10239,16}, \
+    {   5631,15}, {  11775,17}, {   3071,16}, {   7679,15}, \
+    {  15359,18}, {   2047,17}, {   4095,16}, {   9727,15}, \
+    {  19967,17}, {   5119,16}, {  65536,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 282
+#define SQR_FFT_THRESHOLD                 3264
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  57
+#define MULLO_MUL_N_THRESHOLD             8907
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD               6440
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             154
+#define DC_BDIV_QR_THRESHOLD                46
+#define DC_BDIV_Q_THRESHOLD                 93
+
+#define INV_MULMOD_BNM1_THRESHOLD           36
+#define INV_NEWTON_THRESHOLD               141
+#define INV_APPR_THRESHOLD                 149
+
+#define BINV_NEWTON_THRESHOLD              264
+#define REDC_1_TO_REDC_N_THRESHOLD          47
+
+#define MU_DIV_QR_THRESHOLD               1470
+#define MU_DIVAPPR_Q_THRESHOLD            1528
+#define MUPI_DIV_QR_THRESHOLD               47
+#define MU_BDIV_QR_THRESHOLD              1187
+#define MU_BDIV_Q_THRESHOLD               1589
+
+#define POWM_SEC_TABLE  3,22,194,579
+
+#define GET_STR_DC_THRESHOLD                12
+#define GET_STR_PRECOMPUTE_THRESHOLD        19
+#define SET_STR_DC_THRESHOLD               195
+#define SET_STR_PRECOMPUTE_THRESHOLD      1752
+
+#define FAC_DSC_THRESHOLD                  345
+#define FAC_ODD_THRESHOLD                    0  /* always */
+
+#define MATRIX22_STRASSEN_THRESHOLD         24
+#define HGCD2_DIV1_METHOD                    1  /* 11.29% faster than 3 */
+#define HGCD_THRESHOLD                      89
+#define HGCD_APPR_THRESHOLD                 96
+#define HGCD_REDUCE_THRESHOLD             2681
+#define GCD_DC_THRESHOLD                   465
+#define GCDEXT_DC_THRESHOLD                233
+#define JACOBI_BASE_METHOD                   1  /* 25.56% faster than 4 */
+
+/* Tuneup completed successfully, took 294200 seconds */